1 //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines the interfaces that X86 uses to lower LLVM code into a
10 // selection DAG.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "X86ISelLowering.h"
15 #include "MCTargetDesc/X86ShuffleDecode.h"
16 #include "X86.h"
17 #include "X86CallingConv.h"
18 #include "X86FrameLowering.h"
19 #include "X86InstrBuilder.h"
20 #include "X86IntrinsicsInfo.h"
21 #include "X86MachineFunctionInfo.h"
22 #include "X86TargetMachine.h"
23 #include "X86TargetObjectFile.h"
24 #include "llvm/ADT/SmallBitVector.h"
25 #include "llvm/ADT/SmallSet.h"
26 #include "llvm/ADT/Statistic.h"
27 #include "llvm/ADT/StringExtras.h"
28 #include "llvm/ADT/StringSwitch.h"
29 #include "llvm/Analysis/BlockFrequencyInfo.h"
30 #include "llvm/Analysis/EHPersonalities.h"
31 #include "llvm/Analysis/ObjCARCUtil.h"
32 #include "llvm/Analysis/ProfileSummaryInfo.h"
33 #include "llvm/Analysis/VectorUtils.h"
34 #include "llvm/CodeGen/IntrinsicLowering.h"
35 #include "llvm/CodeGen/MachineFrameInfo.h"
36 #include "llvm/CodeGen/MachineFunction.h"
37 #include "llvm/CodeGen/MachineInstrBuilder.h"
38 #include "llvm/CodeGen/MachineJumpTableInfo.h"
39 #include "llvm/CodeGen/MachineLoopInfo.h"
40 #include "llvm/CodeGen/MachineModuleInfo.h"
41 #include "llvm/CodeGen/MachineRegisterInfo.h"
42 #include "llvm/CodeGen/TargetLowering.h"
43 #include "llvm/CodeGen/WinEHFuncInfo.h"
44 #include "llvm/IR/CallingConv.h"
45 #include "llvm/IR/Constants.h"
46 #include "llvm/IR/DerivedTypes.h"
47 #include "llvm/IR/DiagnosticInfo.h"
48 #include "llvm/IR/Function.h"
49 #include "llvm/IR/GlobalAlias.h"
50 #include "llvm/IR/GlobalVariable.h"
51 #include "llvm/IR/Instructions.h"
52 #include "llvm/IR/Intrinsics.h"
53 #include "llvm/IR/IRBuilder.h"
54 #include "llvm/MC/MCAsmInfo.h"
55 #include "llvm/MC/MCContext.h"
56 #include "llvm/MC/MCExpr.h"
57 #include "llvm/MC/MCSymbol.h"
58 #include "llvm/Support/CommandLine.h"
59 #include "llvm/Support/Debug.h"
60 #include "llvm/Support/ErrorHandling.h"
61 #include "llvm/Support/KnownBits.h"
62 #include "llvm/Support/MathExtras.h"
63 #include "llvm/Target/TargetOptions.h"
64 #include <algorithm>
65 #include <bitset>
66 #include <cctype>
67 #include <numeric>
68 using namespace llvm;
69 
70 #define DEBUG_TYPE "x86-isel"
71 
72 STATISTIC(NumTailCalls, "Number of tail calls");
73 
74 static cl::opt<int> ExperimentalPrefLoopAlignment(
75     "x86-experimental-pref-loop-alignment", cl::init(4),
76     cl::desc(
77         "Sets the preferable loop alignment for experiments (as log2 bytes) "
78         "(the last x86-experimental-pref-loop-alignment bits"
79         " of the loop header PC will be 0)."),
80     cl::Hidden);
81 
82 static cl::opt<int> ExperimentalPrefInnermostLoopAlignment(
83     "x86-experimental-pref-innermost-loop-alignment", cl::init(4),
84     cl::desc(
85         "Sets the preferable loop alignment for experiments (as log2 bytes) "
86         "for innermost loops only. If specified, this option overrides "
87         "alignment set by x86-experimental-pref-loop-alignment."),
88     cl::Hidden);
89 
90 static cl::opt<bool> MulConstantOptimization(
91     "mul-constant-optimization", cl::init(true),
92     cl::desc("Replace 'mul x, Const' with more efficient instructions like "
93              "SHIFT, LEA, etc."),
94     cl::Hidden);
95 
96 static cl::opt<bool> ExperimentalUnorderedISEL(
97     "x86-experimental-unordered-atomic-isel", cl::init(false),
98     cl::desc("Use LoadSDNode and StoreSDNode instead of "
99              "AtomicSDNode for unordered atomic loads and "
100              "stores respectively."),
101     cl::Hidden);
102 
103 /// Call this when the user attempts to do something unsupported, like
104 /// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
105 /// report_fatal_error, so calling code should attempt to recover without
106 /// crashing.
107 static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
108                              const char *Msg) {
109   MachineFunction &MF = DAG.getMachineFunction();
110   DAG.getContext()->diagnose(
111       DiagnosticInfoUnsupported(MF.getFunction(), Msg, dl.getDebugLoc()));
112 }
113 
114 X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
115                                      const X86Subtarget &STI)
116     : TargetLowering(TM), Subtarget(STI) {
117   bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
118   X86ScalarSSEf64 = Subtarget.hasSSE2();
119   X86ScalarSSEf32 = Subtarget.hasSSE1();
120   MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
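  // PtrVT is the pointer-sized integer type (i32 in 32-bit mode, i64 in
  // 64-bit mode); it is used below for pointer-typed nodes such as
  // ISD::DYNAMIC_STACKALLOC.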
121 
122   // Set up the TargetLowering object.
123 
124   // X86 is weird. It always uses i8 for shift amounts and setcc results.
125   setBooleanContents(ZeroOrOneBooleanContent);
126   // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
127   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
128 
129   // For 64-bit, since we have so many registers, use the ILP scheduler.
130   // For 32-bit, use the register pressure specific scheduling.
131   // For Atom, always use ILP scheduling.
132   if (Subtarget.isAtom())
133     setSchedulingPreference(Sched::ILP);
134   else if (Subtarget.is64Bit())
135     setSchedulingPreference(Sched::ILP);
136   else
137     setSchedulingPreference(Sched::RegPressure);
138   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
139   setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
140 
141   // Bypass expensive divides and use cheaper ones.
142   if (TM.getOptLevel() >= CodeGenOpt::Default) {
143     if (Subtarget.hasSlowDivide32())
144       addBypassSlowDiv(32, 8);
145     if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
146       addBypassSlowDiv(64, 32);
147   }
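  // E.g. addBypassSlowDiv(32, 8) above lets the BypassSlowDivision pass
  // insert a run-time check and use a cheap 8-bit divide whenever both
  // 32-bit operands happen to fit in 8 bits.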
148 
149   // Setup Windows compiler runtime calls.
150   if (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()) {
151     static const struct {
152       const RTLIB::Libcall Op;
153       const char * const Name;
154       const CallingConv::ID CC;
155     } LibraryCalls[] = {
156       { RTLIB::SDIV_I64, "_alldiv", CallingConv::X86_StdCall },
157       { RTLIB::UDIV_I64, "_aulldiv", CallingConv::X86_StdCall },
158       { RTLIB::SREM_I64, "_allrem", CallingConv::X86_StdCall },
159       { RTLIB::UREM_I64, "_aullrem", CallingConv::X86_StdCall },
160       { RTLIB::MUL_I64, "_allmul", CallingConv::X86_StdCall },
161     };
162 
163     for (const auto &LC : LibraryCalls) {
164       setLibcallName(LC.Op, LC.Name);
165       setLibcallCallingConv(LC.Op, LC.CC);
166     }
167   }
168 
169   if (Subtarget.getTargetTriple().isOSMSVCRT()) {
170     // MSVCRT doesn't have powi; fall back to pow
171     setLibcallName(RTLIB::POWI_F32, nullptr);
172     setLibcallName(RTLIB::POWI_F64, nullptr);
173   }
174 
175   // If we don't have cmpxchg8b (meaning this is a 386/486), limit atomic size
176   // to 32 bits so AtomicExpandPass will expand it and we don't need cmpxchg8b.
177   // FIXME: Should we be limiting the atomic size on other configs? Default is
178   // 1024.
179   if (!Subtarget.hasCmpxchg8b())
180     setMaxAtomicSizeInBitsSupported(32);
181 
182   // Set up the register classes.
183   addRegisterClass(MVT::i8, &X86::GR8RegClass);
184   addRegisterClass(MVT::i16, &X86::GR16RegClass);
185   addRegisterClass(MVT::i32, &X86::GR32RegClass);
186   if (Subtarget.is64Bit())
187     addRegisterClass(MVT::i64, &X86::GR64RegClass);
188 
189   for (MVT VT : MVT::integer_valuetypes())
190     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
191 
192   // We don't accept any truncstore of integer registers.
193   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
194   setTruncStoreAction(MVT::i64, MVT::i16, Expand);
195   setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
196   setTruncStoreAction(MVT::i32, MVT::i16, Expand);
197   setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
198   setTruncStoreAction(MVT::i16, MVT::i8,  Expand);
199 
200   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
201 
202   // SETOEQ and SETUNE require checking two conditions.
203   for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
204     setCondCodeAction(ISD::SETOEQ, VT, Expand);
205     setCondCodeAction(ISD::SETUNE, VT, Expand);
206   }
207 
208   // Integer absolute.
209   if (Subtarget.hasCMov()) {
210     setOperationAction(ISD::ABS            , MVT::i16  , Custom);
211     setOperationAction(ISD::ABS            , MVT::i32  , Custom);
212     if (Subtarget.is64Bit())
213       setOperationAction(ISD::ABS          , MVT::i64  , Custom);
214   }
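  // With CMOV available, abs(x) can be lowered branchlessly, e.g. as a NEG
  // followed by a CMOV that selects the original value when it was already
  // non-negative.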
215 
216   // Funnel shifts.
217   for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
218     // For slow shld targets we only lower for code size.
219     LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;
220 
221     setOperationAction(ShiftOp             , MVT::i8   , Custom);
222     setOperationAction(ShiftOp             , MVT::i16  , Custom);
223     setOperationAction(ShiftOp             , MVT::i32  , ShiftDoubleAction);
224     if (Subtarget.is64Bit())
225       setOperationAction(ShiftOp           , MVT::i64  , ShiftDoubleAction);
226   }
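  // A funnel shift concatenates its two operands into a double-width value,
  // shifts that, and returns one half; for i32/i64 this is exactly what
  // SHLD/SHRD compute, hence the Legal setting on fast-SHLD targets.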
227 
228   if (!Subtarget.useSoftFloat()) {
229     // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
230     // operation.
231     setOperationAction(ISD::UINT_TO_FP,        MVT::i8, Promote);
232     setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i8, Promote);
233     setOperationAction(ISD::UINT_TO_FP,        MVT::i16, Promote);
234     setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i16, Promote);
235     // We have an algorithm for SSE2, and we turn this into a 64-bit
236     // FILD or VCVTUSI2SS/SD for other targets.
237     setOperationAction(ISD::UINT_TO_FP,        MVT::i32, Custom);
238     setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
239     // We have an algorithm for SSE2->double, and we turn this into a
240     // 64-bit FILD followed by conditional FADD for other targets.
241     setOperationAction(ISD::UINT_TO_FP,        MVT::i64, Custom);
242     setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);
243 
244     // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
245     // this operation.
246     setOperationAction(ISD::SINT_TO_FP,        MVT::i8, Promote);
247     setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i8, Promote);
248     // SSE has no i16 to fp conversion, only i32. We promote in the handler
249     // to allow f80 to use i16 and f64 to use i16 with SSE1 only.
250     setOperationAction(ISD::SINT_TO_FP,        MVT::i16, Custom);
251     setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i16, Custom);
252     // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
253     setOperationAction(ISD::SINT_TO_FP,        MVT::i32, Custom);
254     setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
255     // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
256     // are Legal, f80 is custom lowered.
257     setOperationAction(ISD::SINT_TO_FP,        MVT::i64, Custom);
258     setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);
259 
260     // Promote i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
261     // this operation.
262     setOperationAction(ISD::FP_TO_SINT,        MVT::i8,  Promote);
263     // FIXME: This doesn't generate invalid exception when it should. PR44019.
264     setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i8,  Promote);
265     setOperationAction(ISD::FP_TO_SINT,        MVT::i16, Custom);
266     setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i16, Custom);
267     setOperationAction(ISD::FP_TO_SINT,        MVT::i32, Custom);
268     setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
269     // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
270     // are Legal, f80 is custom lowered.
271     setOperationAction(ISD::FP_TO_SINT,        MVT::i64, Custom);
272     setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);
273 
274     // Handle FP_TO_UINT by promoting the destination to a larger signed
275     // conversion.
276     setOperationAction(ISD::FP_TO_UINT,        MVT::i8,  Promote);
277     // FIXME: This doesn't generate invalid exception when it should. PR44019.
278     setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i8,  Promote);
279     setOperationAction(ISD::FP_TO_UINT,        MVT::i16, Promote);
280     // FIXME: This doesn't generate invalid exception when it should. PR44019.
281     setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i16, Promote);
282     setOperationAction(ISD::FP_TO_UINT,        MVT::i32, Custom);
283     setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
284     setOperationAction(ISD::FP_TO_UINT,        MVT::i64, Custom);
285     setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);
286 
287     setOperationAction(ISD::LRINT,             MVT::f32, Custom);
288     setOperationAction(ISD::LRINT,             MVT::f64, Custom);
289     setOperationAction(ISD::LLRINT,            MVT::f32, Custom);
290     setOperationAction(ISD::LLRINT,            MVT::f64, Custom);
291 
292     if (!Subtarget.is64Bit()) {
293       setOperationAction(ISD::LRINT,  MVT::i64, Custom);
294       setOperationAction(ISD::LLRINT, MVT::i64, Custom);
295     }
296   }
297 
298   if (Subtarget.hasSSE2()) {
299     // Custom lowering for saturating float to int conversions.
300     // We handle promotion to larger result types manually.
301     for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
302       setOperationAction(ISD::FP_TO_UINT_SAT, VT, Custom);
303       setOperationAction(ISD::FP_TO_SINT_SAT, VT, Custom);
304     }
305     if (Subtarget.is64Bit()) {
306       setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom);
307       setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom);
308     }
309   }
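  // FP_TO_SINT_SAT/FP_TO_UINT_SAT implement llvm.fptosi.sat/llvm.fptoui.sat:
  // out-of-range inputs clamp to the destination min/max and NaN becomes 0,
  // unlike the plain conversions, whose out-of-range results are undefined.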
310 
311   // Handle address space casts between mixed sized pointers.
312   setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
313   setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
314 
315   // TODO: when we have SSE, these could be more efficient, by using movd/movq.
316   if (!X86ScalarSSEf64) {
317     setOperationAction(ISD::BITCAST        , MVT::f32  , Expand);
318     setOperationAction(ISD::BITCAST        , MVT::i32  , Expand);
319     if (Subtarget.is64Bit()) {
320       setOperationAction(ISD::BITCAST      , MVT::f64  , Expand);
321       // Without SSE, i64->f64 goes through memory.
322       setOperationAction(ISD::BITCAST      , MVT::i64  , Expand);
323     }
324   } else if (!Subtarget.is64Bit())
325     setOperationAction(ISD::BITCAST      , MVT::i64  , Custom);
326 
327   // Scalar integer divide and remainder are lowered to use operations that
328   // produce two results, to match the available instructions. This exposes
329   // the two-result form to trivial CSE, which is able to combine x/y and x%y
330   // into a single instruction.
331   //
332   // Scalar integer multiply-high is also lowered to use two-result
333   // operations, to match the available instructions. However, plain multiply
334   // (low) operations are left as Legal, as there are single-result
335   // instructions for this in x86. Using the two-result multiply instructions
336   // when both high and low results are needed must be arranged by dagcombine.
337   for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
338     setOperationAction(ISD::MULHS, VT, Expand);
339     setOperationAction(ISD::MULHU, VT, Expand);
340     setOperationAction(ISD::SDIV, VT, Expand);
341     setOperationAction(ISD::UDIV, VT, Expand);
342     setOperationAction(ISD::SREM, VT, Expand);
343     setOperationAction(ISD::UREM, VT, Expand);
344   }
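  // So computing both 'a / b' and 'a % b' legalizes to a single ISD::SDIVREM
  // (or UDIVREM) node, matching DIV/IDIV which produce quotient and remainder
  // together.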
345 
346   setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
347   setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
348   for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
349                    MVT::i8,  MVT::i16, MVT::i32, MVT::i64 }) {
350     setOperationAction(ISD::BR_CC,     VT, Expand);
351     setOperationAction(ISD::SELECT_CC, VT, Expand);
352   }
353   if (Subtarget.is64Bit())
354     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
355   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
356   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
357   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
358 
359   setOperationAction(ISD::FREM             , MVT::f32  , Expand);
360   setOperationAction(ISD::FREM             , MVT::f64  , Expand);
361   setOperationAction(ISD::FREM             , MVT::f80  , Expand);
362   setOperationAction(ISD::FREM             , MVT::f128 , Expand);
363 
364   if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) {
365     setOperationAction(ISD::FLT_ROUNDS_    , MVT::i32  , Custom);
366     setOperationAction(ISD::SET_ROUNDING   , MVT::Other, Custom);
367   }
368 
369   // Promote the i8 variants and force them on up to i32 which has a shorter
370   // encoding.
371   setOperationPromotedToType(ISD::CTTZ           , MVT::i8   , MVT::i32);
372   setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
373 
374   if (Subtarget.hasBMI()) {
375     // Promote the i16 zero undef variant and force it on up to i32 when tzcnt
376     // is enabled.
377     setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i16, MVT::i32);
378   } else {
379     setOperationAction(ISD::CTTZ, MVT::i16, Custom);
380     setOperationAction(ISD::CTTZ           , MVT::i32  , Custom);
381     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16  , Legal);
382     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32  , Legal);
383     if (Subtarget.is64Bit()) {
384       setOperationAction(ISD::CTTZ         , MVT::i64  , Custom);
385       setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
386     }
387   }
388 
389   if (Subtarget.hasLZCNT()) {
390     // When promoting the i8 variants, force them to i32 for a shorter
391     // encoding.
392     setOperationPromotedToType(ISD::CTLZ           , MVT::i8   , MVT::i32);
393     setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
394   } else {
395     for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
396       if (VT == MVT::i64 && !Subtarget.is64Bit())
397         continue;
398       setOperationAction(ISD::CTLZ           , VT, Custom);
399       setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Custom);
400     }
401   }
402 
403   for (auto Op : {ISD::FP16_TO_FP, ISD::STRICT_FP16_TO_FP, ISD::FP_TO_FP16,
404                   ISD::STRICT_FP_TO_FP16}) {
405     // Special handling for half-precision floating point conversions.
406     // If we don't have F16C support, then lower half float conversions
407     // into library calls.
408     setOperationAction(
409         Op, MVT::f32,
410         (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
411     // There's never any support for operations beyond MVT::f32.
412     setOperationAction(Op, MVT::f64, Expand);
413     setOperationAction(Op, MVT::f80, Expand);
414     setOperationAction(Op, MVT::f128, Expand);
415   }
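  // With F16C the f32<->f16 conversions can be selected to VCVTPH2PS /
  // VCVTPS2PH; without it they end up as the default __gnu_h2f_ieee /
  // __gnu_f2h_ieee libcalls.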
416 
417   setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
418   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
419   setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
420   setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f16, Expand);
421   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
422   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
423   setTruncStoreAction(MVT::f80, MVT::f16, Expand);
424   setTruncStoreAction(MVT::f128, MVT::f16, Expand);
425 
426   setOperationAction(ISD::PARITY, MVT::i8, Custom);
427   if (Subtarget.hasPOPCNT()) {
428     setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
429   } else {
430     setOperationAction(ISD::CTPOP          , MVT::i8   , Expand);
431     setOperationAction(ISD::CTPOP          , MVT::i16  , Expand);
432     setOperationAction(ISD::CTPOP          , MVT::i32  , Expand);
433     if (Subtarget.is64Bit())
434       setOperationAction(ISD::CTPOP        , MVT::i64  , Expand);
435     else
436       setOperationAction(ISD::CTPOP        , MVT::i64  , Custom);
437 
438     setOperationAction(ISD::PARITY, MVT::i16, Custom);
439     setOperationAction(ISD::PARITY, MVT::i32, Custom);
440     if (Subtarget.is64Bit())
441       setOperationAction(ISD::PARITY, MVT::i64, Custom);
442   }
443 
444   setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
445 
446   if (!Subtarget.hasMOVBE())
447     setOperationAction(ISD::BSWAP          , MVT::i16  , Expand);
448 
449   // X86 wants to expand cmov itself.
450   for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
451     setOperationAction(ISD::SELECT, VT, Custom);
452     setOperationAction(ISD::SETCC, VT, Custom);
453     setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
454     setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
455   }
456   for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
457     if (VT == MVT::i64 && !Subtarget.is64Bit())
458       continue;
459     setOperationAction(ISD::SELECT, VT, Custom);
460     setOperationAction(ISD::SETCC,  VT, Custom);
461   }
462 
463   // Custom action for SELECT MMX and expand action for SELECT_CC MMX.
464   setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);
465   setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);
466 
467   setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
468   // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
469   // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
470   setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
471   setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
472   setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
473   if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
474     setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
475 
476   // Darwin ABI issue.
477   for (auto VT : { MVT::i32, MVT::i64 }) {
478     if (VT == MVT::i64 && !Subtarget.is64Bit())
479       continue;
480     setOperationAction(ISD::ConstantPool    , VT, Custom);
481     setOperationAction(ISD::JumpTable       , VT, Custom);
482     setOperationAction(ISD::GlobalAddress   , VT, Custom);
483     setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
484     setOperationAction(ISD::ExternalSymbol  , VT, Custom);
485     setOperationAction(ISD::BlockAddress    , VT, Custom);
486   }
487 
488   // 64-bit shl, sra, srl (iff 32-bit x86)
489   for (auto VT : { MVT::i32, MVT::i64 }) {
490     if (VT == MVT::i64 && !Subtarget.is64Bit())
491       continue;
492     setOperationAction(ISD::SHL_PARTS, VT, Custom);
493     setOperationAction(ISD::SRA_PARTS, VT, Custom);
494     setOperationAction(ISD::SRL_PARTS, VT, Custom);
495   }
496 
497   if (Subtarget.hasSSEPrefetch() || Subtarget.has3DNow())
498     setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);
499 
500   setOperationAction(ISD::ATOMIC_FENCE  , MVT::Other, Custom);
501 
502   // Expand certain atomics
503   for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
504     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
505     setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
506     setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
507     setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
508     setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
509     setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
510     setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
511   }
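  // Custom lowering of these RMW atomics lets them become LOCK-prefixed
  // instructions, e.g. a fetch_add whose result is used can select to LOCK
  // XADD, while one whose result is ignored can use a plain LOCK ADD.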
512 
513   if (!Subtarget.is64Bit())
514     setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
515 
516   if (Subtarget.hasCmpxchg16b()) {
517     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
518   }
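  // i.e. with the cmpxchg16b feature a 128-bit compare-and-swap can be
  // selected to LOCK CMPXCHG16B rather than being expanded to a libcall.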
519 
520   // FIXME - use subtarget debug flags
521   if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
522       !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
523       TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
524     setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
525   }
526 
527   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
528   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
529 
530   setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
531   setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
532 
533   setOperationAction(ISD::TRAP, MVT::Other, Legal);
534   setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
535   setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);
536 
537   // VASTART needs to be custom lowered to use the VarArgsFrameIndex
538   setOperationAction(ISD::VASTART           , MVT::Other, Custom);
539   setOperationAction(ISD::VAEND             , MVT::Other, Expand);
540   bool Is64Bit = Subtarget.is64Bit();
541   setOperationAction(ISD::VAARG,  MVT::Other, Is64Bit ? Custom : Expand);
542   setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
543 
544   setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
545   setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
546 
547   setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
548 
549   // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
550   setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
551   setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
552 
553   if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
554     // f32 and f64 use SSE.
555     // Set up the FP register classes.
556     addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
557                                                      : &X86::FR32RegClass);
558     addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
559                                                      : &X86::FR64RegClass);
560 
561     // Disable f32->f64 extload as we can only generate this in one instruction
562     // under optsize. So it's easier to pattern match (fpext (load)) for that
563     // case instead of needing to emit 2 instructions for extload in the
564     // non-optsize case.
565     setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
566 
567     for (auto VT : { MVT::f32, MVT::f64 }) {
568       // Use ANDPD to simulate FABS.
569       setOperationAction(ISD::FABS, VT, Custom);
570 
571       // Use XORP to simulate FNEG.
572       setOperationAction(ISD::FNEG, VT, Custom);
573 
574       // Use ANDPD and ORPD to simulate FCOPYSIGN.
575       setOperationAction(ISD::FCOPYSIGN, VT, Custom);
576 
577       // These might be better off as horizontal vector ops.
578       setOperationAction(ISD::FADD, VT, Custom);
579       setOperationAction(ISD::FSUB, VT, Custom);
580 
581       // We don't support sin/cos/fmod
582       setOperationAction(ISD::FSIN   , VT, Expand);
583       setOperationAction(ISD::FCOS   , VT, Expand);
584       setOperationAction(ISD::FSINCOS, VT, Expand);
585     }
586 
587     // Lower this to MOVMSK plus an AND.
588     setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
589     setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
590 
591   } else if (!Subtarget.useSoftFloat() && X86ScalarSSEf32 &&
592              (UseX87 || Is64Bit)) {
593     // Use SSE for f32, x87 for f64.
594     // Set up the FP register classes.
595     addRegisterClass(MVT::f32, &X86::FR32RegClass);
596     if (UseX87)
597       addRegisterClass(MVT::f64, &X86::RFP64RegClass);
598 
599     // Use ANDPS to simulate FABS.
600     setOperationAction(ISD::FABS , MVT::f32, Custom);
601 
602     // Use XORP to simulate FNEG.
603     setOperationAction(ISD::FNEG , MVT::f32, Custom);
604 
605     if (UseX87)
606       setOperationAction(ISD::UNDEF, MVT::f64, Expand);
607 
608     // Use ANDPS and ORPS to simulate FCOPYSIGN.
609     if (UseX87)
610       setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
611     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
612 
613     // We don't support sin/cos/fmod
614     setOperationAction(ISD::FSIN   , MVT::f32, Expand);
615     setOperationAction(ISD::FCOS   , MVT::f32, Expand);
616     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
617 
618     if (UseX87) {
619       // Always expand sin/cos functions even though x87 has an instruction.
620       setOperationAction(ISD::FSIN, MVT::f64, Expand);
621       setOperationAction(ISD::FCOS, MVT::f64, Expand);
622       setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
623     }
624   } else if (UseX87) {
625     // f32 and f64 in x87.
626     // Set up the FP register classes.
627     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
628     addRegisterClass(MVT::f32, &X86::RFP32RegClass);
629 
630     for (auto VT : { MVT::f32, MVT::f64 }) {
631       setOperationAction(ISD::UNDEF,     VT, Expand);
632       setOperationAction(ISD::FCOPYSIGN, VT, Expand);
633 
634       // Always expand sin/cos functions even though x87 has an instruction.
635       setOperationAction(ISD::FSIN   , VT, Expand);
636       setOperationAction(ISD::FCOS   , VT, Expand);
637       setOperationAction(ISD::FSINCOS, VT, Expand);
638     }
639   }
640 
641   // Expand FP32 immediates into loads from the stack, save special cases.
642   if (isTypeLegal(MVT::f32)) {
643     if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
644       addLegalFPImmediate(APFloat(+0.0f)); // FLD0
645       addLegalFPImmediate(APFloat(+1.0f)); // FLD1
646       addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
647       addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
648     } else // SSE immediates.
649       addLegalFPImmediate(APFloat(+0.0f)); // xorps
650   }
651   // Expand FP64 immediates into loads from the stack, save special cases.
652   if (isTypeLegal(MVT::f64)) {
653     if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
654       addLegalFPImmediate(APFloat(+0.0)); // FLD0
655       addLegalFPImmediate(APFloat(+1.0)); // FLD1
656       addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
657       addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
658     } else // SSE immediates.
659       addLegalFPImmediate(APFloat(+0.0)); // xorpd
660   }
661   // Handle constrained floating-point operations for scalar types.
662   setOperationAction(ISD::STRICT_FADD,      MVT::f32, Legal);
663   setOperationAction(ISD::STRICT_FADD,      MVT::f64, Legal);
664   setOperationAction(ISD::STRICT_FSUB,      MVT::f32, Legal);
665   setOperationAction(ISD::STRICT_FSUB,      MVT::f64, Legal);
666   setOperationAction(ISD::STRICT_FMUL,      MVT::f32, Legal);
667   setOperationAction(ISD::STRICT_FMUL,      MVT::f64, Legal);
668   setOperationAction(ISD::STRICT_FDIV,      MVT::f32, Legal);
669   setOperationAction(ISD::STRICT_FDIV,      MVT::f64, Legal);
670   setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal);
671   setOperationAction(ISD::STRICT_FP_ROUND,  MVT::f32, Legal);
672   setOperationAction(ISD::STRICT_FP_ROUND,  MVT::f64, Legal);
673   setOperationAction(ISD::STRICT_FSQRT,     MVT::f32, Legal);
674   setOperationAction(ISD::STRICT_FSQRT,     MVT::f64, Legal);
675 
676   // We don't support FMA.
677   setOperationAction(ISD::FMA, MVT::f64, Expand);
678   setOperationAction(ISD::FMA, MVT::f32, Expand);
679 
680   // f80 always uses X87.
681   if (UseX87) {
682     addRegisterClass(MVT::f80, &X86::RFP80RegClass);
683     setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
684     setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
685     {
686       APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
687       addLegalFPImmediate(TmpFlt);  // FLD0
688       TmpFlt.changeSign();
689       addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
690 
691       bool ignored;
692       APFloat TmpFlt2(+1.0);
693       TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
694                       &ignored);
695       addLegalFPImmediate(TmpFlt2);  // FLD1
696       TmpFlt2.changeSign();
697       addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
698     }
699 
700     // Always expand sin/cos functions even though x87 has an instruction.
701     setOperationAction(ISD::FSIN   , MVT::f80, Expand);
702     setOperationAction(ISD::FCOS   , MVT::f80, Expand);
703     setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
704 
705     setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
706     setOperationAction(ISD::FCEIL,  MVT::f80, Expand);
707     setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
708     setOperationAction(ISD::FRINT,  MVT::f80, Expand);
709     setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
710     setOperationAction(ISD::FMA, MVT::f80, Expand);
711     setOperationAction(ISD::LROUND, MVT::f80, Expand);
712     setOperationAction(ISD::LLROUND, MVT::f80, Expand);
713     setOperationAction(ISD::LRINT, MVT::f80, Custom);
714     setOperationAction(ISD::LLRINT, MVT::f80, Custom);
715 
716     // Handle constrained floating-point operations for scalar types.
717     setOperationAction(ISD::STRICT_FADD     , MVT::f80, Legal);
718     setOperationAction(ISD::STRICT_FSUB     , MVT::f80, Legal);
719     setOperationAction(ISD::STRICT_FMUL     , MVT::f80, Legal);
720     setOperationAction(ISD::STRICT_FDIV     , MVT::f80, Legal);
721     setOperationAction(ISD::STRICT_FSQRT    , MVT::f80, Legal);
722     setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Legal);
723     // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
724     // as Custom.
725     setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Legal);
726   }
727 
728   // f128 uses xmm registers, but most operations require libcalls.
729   if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
730     addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
731                                                    : &X86::VR128RegClass);
732 
733     addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
734 
735     setOperationAction(ISD::FADD,        MVT::f128, LibCall);
736     setOperationAction(ISD::STRICT_FADD, MVT::f128, LibCall);
737     setOperationAction(ISD::FSUB,        MVT::f128, LibCall);
738     setOperationAction(ISD::STRICT_FSUB, MVT::f128, LibCall);
739     setOperationAction(ISD::FDIV,        MVT::f128, LibCall);
740     setOperationAction(ISD::STRICT_FDIV, MVT::f128, LibCall);
741     setOperationAction(ISD::FMUL,        MVT::f128, LibCall);
742     setOperationAction(ISD::STRICT_FMUL, MVT::f128, LibCall);
743     setOperationAction(ISD::FMA,         MVT::f128, LibCall);
744     setOperationAction(ISD::STRICT_FMA,  MVT::f128, LibCall);
745 
746     setOperationAction(ISD::FABS, MVT::f128, Custom);
747     setOperationAction(ISD::FNEG, MVT::f128, Custom);
748     setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
749 
750     setOperationAction(ISD::FSIN,         MVT::f128, LibCall);
751     setOperationAction(ISD::STRICT_FSIN,  MVT::f128, LibCall);
752     setOperationAction(ISD::FCOS,         MVT::f128, LibCall);
753     setOperationAction(ISD::STRICT_FCOS,  MVT::f128, LibCall);
754     setOperationAction(ISD::FSINCOS,      MVT::f128, LibCall);
755     // No STRICT_FSINCOS
756     setOperationAction(ISD::FSQRT,        MVT::f128, LibCall);
757     setOperationAction(ISD::STRICT_FSQRT, MVT::f128, LibCall);
758 
759     setOperationAction(ISD::FP_EXTEND,        MVT::f128, Custom);
760     setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f128, Custom);
761     // We need to custom handle any FP_ROUND with an f128 input, but
762     // LegalizeDAG uses the result type to know when to run a custom handler.
763     // So we have to list all legal floating point result types here.
764     if (isTypeLegal(MVT::f32)) {
765       setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
766       setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
767     }
768     if (isTypeLegal(MVT::f64)) {
769       setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
770       setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);
771     }
772     if (isTypeLegal(MVT::f80)) {
773       setOperationAction(ISD::FP_ROUND, MVT::f80, Custom);
774       setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Custom);
775     }
776 
777     setOperationAction(ISD::SETCC, MVT::f128, Custom);
778 
779     setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
780     setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
781     setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
782     setTruncStoreAction(MVT::f128, MVT::f32, Expand);
783     setTruncStoreAction(MVT::f128, MVT::f64, Expand);
784     setTruncStoreAction(MVT::f128, MVT::f80, Expand);
785   }
786 
787   // Always use a library call for pow.
788   setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
789   setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
790   setOperationAction(ISD::FPOW             , MVT::f80  , Expand);
791   setOperationAction(ISD::FPOW             , MVT::f128 , Expand);
792 
793   setOperationAction(ISD::FLOG, MVT::f80, Expand);
794   setOperationAction(ISD::FLOG2, MVT::f80, Expand);
795   setOperationAction(ISD::FLOG10, MVT::f80, Expand);
796   setOperationAction(ISD::FEXP, MVT::f80, Expand);
797   setOperationAction(ISD::FEXP2, MVT::f80, Expand);
798   setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
799   setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
800 
801   // Some FP actions are always expanded for vector types.
802   for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
803                    MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
804     setOperationAction(ISD::FSIN,      VT, Expand);
805     setOperationAction(ISD::FSINCOS,   VT, Expand);
806     setOperationAction(ISD::FCOS,      VT, Expand);
807     setOperationAction(ISD::FREM,      VT, Expand);
808     setOperationAction(ISD::FCOPYSIGN, VT, Expand);
809     setOperationAction(ISD::FPOW,      VT, Expand);
810     setOperationAction(ISD::FLOG,      VT, Expand);
811     setOperationAction(ISD::FLOG2,     VT, Expand);
812     setOperationAction(ISD::FLOG10,    VT, Expand);
813     setOperationAction(ISD::FEXP,      VT, Expand);
814     setOperationAction(ISD::FEXP2,     VT, Expand);
815   }
816 
817   // First set operation action for all vector types to either promote
818   // (for widening) or expand (for scalarization). Then we will selectively
819   // turn on ones that can be effectively codegen'd.
820   for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
821     setOperationAction(ISD::SDIV, VT, Expand);
822     setOperationAction(ISD::UDIV, VT, Expand);
823     setOperationAction(ISD::SREM, VT, Expand);
824     setOperationAction(ISD::UREM, VT, Expand);
825     setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
826     setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
827     setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
828     setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
829     setOperationAction(ISD::FMA,  VT, Expand);
830     setOperationAction(ISD::FFLOOR, VT, Expand);
831     setOperationAction(ISD::FCEIL, VT, Expand);
832     setOperationAction(ISD::FTRUNC, VT, Expand);
833     setOperationAction(ISD::FRINT, VT, Expand);
834     setOperationAction(ISD::FNEARBYINT, VT, Expand);
835     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
836     setOperationAction(ISD::MULHS, VT, Expand);
837     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
838     setOperationAction(ISD::MULHU, VT, Expand);
839     setOperationAction(ISD::SDIVREM, VT, Expand);
840     setOperationAction(ISD::UDIVREM, VT, Expand);
841     setOperationAction(ISD::CTPOP, VT, Expand);
842     setOperationAction(ISD::CTTZ, VT, Expand);
843     setOperationAction(ISD::CTLZ, VT, Expand);
844     setOperationAction(ISD::ROTL, VT, Expand);
845     setOperationAction(ISD::ROTR, VT, Expand);
846     setOperationAction(ISD::BSWAP, VT, Expand);
847     setOperationAction(ISD::SETCC, VT, Expand);
848     setOperationAction(ISD::FP_TO_UINT, VT, Expand);
849     setOperationAction(ISD::FP_TO_SINT, VT, Expand);
850     setOperationAction(ISD::UINT_TO_FP, VT, Expand);
851     setOperationAction(ISD::SINT_TO_FP, VT, Expand);
852     setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
853     setOperationAction(ISD::TRUNCATE, VT, Expand);
854     setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
855     setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
856     setOperationAction(ISD::ANY_EXTEND, VT, Expand);
857     setOperationAction(ISD::SELECT_CC, VT, Expand);
858     for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
859       setTruncStoreAction(InnerVT, VT, Expand);
860 
861       setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
862       setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
863 
864       // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
865       // types; we have to deal with them whether we ask for Expansion or not.
866       // Setting Expand causes its own optimisation problems though, so leave
867       // them legal.
868       if (VT.getVectorElementType() == MVT::i1)
869         setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
870 
871       // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
872       // split/scalarized right now.
873       if (VT.getVectorElementType() == MVT::f16)
874         setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
875     }
876   }
877 
878   // FIXME: In order to prevent SSE instructions being expanded to MMX ones
879   // with -msoft-float, disable use of MMX as well.
880   if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
881     addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
882     // No operations on x86mmx supported, everything uses intrinsics.
883   }
884 
885   if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
886     addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
887                                                     : &X86::VR128RegClass);
888 
889     setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
890     setOperationAction(ISD::FABS,               MVT::v4f32, Custom);
891     setOperationAction(ISD::FCOPYSIGN,          MVT::v4f32, Custom);
892     setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
893     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
894     setOperationAction(ISD::VSELECT,            MVT::v4f32, Custom);
895     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
896     setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
897 
898     setOperationAction(ISD::LOAD,               MVT::v2f32, Custom);
899     setOperationAction(ISD::STORE,              MVT::v2f32, Custom);
900 
901     setOperationAction(ISD::STRICT_FADD,        MVT::v4f32, Legal);
902     setOperationAction(ISD::STRICT_FSUB,        MVT::v4f32, Legal);
903     setOperationAction(ISD::STRICT_FMUL,        MVT::v4f32, Legal);
904     setOperationAction(ISD::STRICT_FDIV,        MVT::v4f32, Legal);
905     setOperationAction(ISD::STRICT_FSQRT,       MVT::v4f32, Legal);
906   }
907 
908   if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
909     addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
910                                                     : &X86::VR128RegClass);
911 
912     // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
913     // registers cannot be used even for integer operations.
914     addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
915                                                     : &X86::VR128RegClass);
916     addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
917                                                     : &X86::VR128RegClass);
918     addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
919                                                     : &X86::VR128RegClass);
920     addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
921                                                     : &X86::VR128RegClass);
922 
923     for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
924                      MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
925       setOperationAction(ISD::SDIV, VT, Custom);
926       setOperationAction(ISD::SREM, VT, Custom);
927       setOperationAction(ISD::UDIV, VT, Custom);
928       setOperationAction(ISD::UREM, VT, Custom);
929     }
930 
931     setOperationAction(ISD::MUL,                MVT::v2i8,  Custom);
932     setOperationAction(ISD::MUL,                MVT::v4i8,  Custom);
933     setOperationAction(ISD::MUL,                MVT::v8i8,  Custom);
934 
935     setOperationAction(ISD::MUL,                MVT::v16i8, Custom);
936     setOperationAction(ISD::MUL,                MVT::v4i32, Custom);
937     setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
938     setOperationAction(ISD::MULHU,              MVT::v4i32, Custom);
939     setOperationAction(ISD::MULHS,              MVT::v4i32, Custom);
940     setOperationAction(ISD::MULHU,              MVT::v16i8, Custom);
941     setOperationAction(ISD::MULHS,              MVT::v16i8, Custom);
942     setOperationAction(ISD::MULHU,              MVT::v8i16, Legal);
943     setOperationAction(ISD::MULHS,              MVT::v8i16, Legal);
944     setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
945 
946     setOperationAction(ISD::SMULO,              MVT::v16i8, Custom);
947     setOperationAction(ISD::UMULO,              MVT::v16i8, Custom);
948 
949     setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
950     setOperationAction(ISD::FABS,               MVT::v2f64, Custom);
951     setOperationAction(ISD::FCOPYSIGN,          MVT::v2f64, Custom);
952 
953     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
954       setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
955       setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
956       setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
957       setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
958     }
959 
960     setOperationAction(ISD::UADDSAT,            MVT::v16i8, Legal);
961     setOperationAction(ISD::SADDSAT,            MVT::v16i8, Legal);
962     setOperationAction(ISD::USUBSAT,            MVT::v16i8, Legal);
963     setOperationAction(ISD::SSUBSAT,            MVT::v16i8, Legal);
964     setOperationAction(ISD::UADDSAT,            MVT::v8i16, Legal);
965     setOperationAction(ISD::SADDSAT,            MVT::v8i16, Legal);
966     setOperationAction(ISD::USUBSAT,            MVT::v8i16, Legal);
967     setOperationAction(ISD::SSUBSAT,            MVT::v8i16, Legal);
968     setOperationAction(ISD::USUBSAT,            MVT::v4i32, Custom);
969     setOperationAction(ISD::USUBSAT,            MVT::v2i64, Custom);
970 
971     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
972     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
973     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
974 
975     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
976       setOperationAction(ISD::SETCC,              VT, Custom);
977       setOperationAction(ISD::STRICT_FSETCC,      VT, Custom);
978       setOperationAction(ISD::STRICT_FSETCCS,     VT, Custom);
979       setOperationAction(ISD::CTPOP,              VT, Custom);
980       setOperationAction(ISD::ABS,                VT, Custom);
981 
982       // The condition codes aren't legal in SSE/AVX and under AVX512 we use
983       // setcc all the way to isel and prefer SETGT in some isel patterns.
984       setCondCodeAction(ISD::SETLT, VT, Custom);
985       setCondCodeAction(ISD::SETLE, VT, Custom);
986     }
987 
988     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
989       setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
990       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
991       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
992       setOperationAction(ISD::VSELECT,            VT, Custom);
993       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
994     }
995 
996     for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
997       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
998       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
999       setOperationAction(ISD::VSELECT,            VT, Custom);
1000 
1001       if (VT == MVT::v2i64 && !Subtarget.is64Bit())
1002         continue;
1003 
1004       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
1005       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1006     }
1007 
1008     // Custom lower v2i64 and v2f64 selects.
1009     setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
1010     setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
1011     setOperationAction(ISD::SELECT,             MVT::v4i32, Custom);
1012     setOperationAction(ISD::SELECT,             MVT::v8i16, Custom);
1013     setOperationAction(ISD::SELECT,             MVT::v16i8, Custom);
1014 
1015     setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
1016     setOperationAction(ISD::FP_TO_UINT,         MVT::v4i32, Custom);
1017     setOperationAction(ISD::FP_TO_SINT,         MVT::v2i32, Custom);
1018     setOperationAction(ISD::FP_TO_UINT,         MVT::v2i32, Custom);
1019     setOperationAction(ISD::STRICT_FP_TO_SINT,  MVT::v4i32, Legal);
1020     setOperationAction(ISD::STRICT_FP_TO_SINT,  MVT::v2i32, Custom);
1021 
1022     // Custom legalize these to avoid over promotion or custom promotion.
1023     for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
1024       setOperationAction(ISD::FP_TO_SINT,        VT, Custom);
1025       setOperationAction(ISD::FP_TO_UINT,        VT, Custom);
1026       setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom);
1027       setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom);
1028     }
1029 
1030     setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
1031     setOperationAction(ISD::STRICT_SINT_TO_FP,  MVT::v4i32, Legal);
1032     setOperationAction(ISD::SINT_TO_FP,         MVT::v2i32, Custom);
1033     setOperationAction(ISD::STRICT_SINT_TO_FP,  MVT::v2i32, Custom);
1034 
1035     setOperationAction(ISD::UINT_TO_FP,         MVT::v2i32, Custom);
1036     setOperationAction(ISD::STRICT_UINT_TO_FP,  MVT::v2i32, Custom);
1037 
1038     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Custom);
1039     setOperationAction(ISD::STRICT_UINT_TO_FP,  MVT::v4i32, Custom);
1040 
1041     // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
1042     setOperationAction(ISD::SINT_TO_FP,         MVT::v2f32, Custom);
1043     setOperationAction(ISD::STRICT_SINT_TO_FP,  MVT::v2f32, Custom);
1044     setOperationAction(ISD::UINT_TO_FP,         MVT::v2f32, Custom);
1045     setOperationAction(ISD::STRICT_UINT_TO_FP,  MVT::v2f32, Custom);
1046 
1047     setOperationAction(ISD::FP_EXTEND,          MVT::v2f32, Custom);
1048     setOperationAction(ISD::STRICT_FP_EXTEND,   MVT::v2f32, Custom);
1049     setOperationAction(ISD::FP_ROUND,           MVT::v2f32, Custom);
1050     setOperationAction(ISD::STRICT_FP_ROUND,    MVT::v2f32, Custom);
1051 
1052     // We want to legalize this to an f64 load rather than an i64 load on
1053     // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
1054     // store.
1055     setOperationAction(ISD::LOAD,               MVT::v2i32, Custom);
1056     setOperationAction(ISD::LOAD,               MVT::v4i16, Custom);
1057     setOperationAction(ISD::LOAD,               MVT::v8i8,  Custom);
1058     setOperationAction(ISD::STORE,              MVT::v2i32, Custom);
1059     setOperationAction(ISD::STORE,              MVT::v4i16, Custom);
1060     setOperationAction(ISD::STORE,              MVT::v8i8,  Custom);
1061 
1062     setOperationAction(ISD::BITCAST,            MVT::v2i32, Custom);
1063     setOperationAction(ISD::BITCAST,            MVT::v4i16, Custom);
1064     setOperationAction(ISD::BITCAST,            MVT::v8i8,  Custom);
1065     if (!Subtarget.hasAVX512())
1066       setOperationAction(ISD::BITCAST, MVT::v16i1, Custom);
1067 
1068     setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
1069     setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
1070     setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
1071 
1072     setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
1073 
1074     setOperationAction(ISD::TRUNCATE,    MVT::v2i8,  Custom);
1075     setOperationAction(ISD::TRUNCATE,    MVT::v2i16, Custom);
1076     setOperationAction(ISD::TRUNCATE,    MVT::v2i32, Custom);
1077     setOperationAction(ISD::TRUNCATE,    MVT::v4i8,  Custom);
1078     setOperationAction(ISD::TRUNCATE,    MVT::v4i16, Custom);
1079     setOperationAction(ISD::TRUNCATE,    MVT::v8i8,  Custom);
1080 
1081     // In the customized shift lowering, the legal v4i32/v2i64 cases
1082     // in AVX2 will be recognized.
1083     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1084       setOperationAction(ISD::SRL,              VT, Custom);
1085       setOperationAction(ISD::SHL,              VT, Custom);
1086       setOperationAction(ISD::SRA,              VT, Custom);
1087     }
1088 
1089     setOperationAction(ISD::ROTL,               MVT::v4i32, Custom);
1090     setOperationAction(ISD::ROTL,               MVT::v8i16, Custom);
1091 
1092     // With 512-bit registers or AVX512VL+BW, expanding (and promoting the
1093     // shifts) is better.
1094     if (!Subtarget.useAVX512Regs() &&
1095         !(Subtarget.hasBWI() && Subtarget.hasVLX()))
1096       setOperationAction(ISD::ROTL,             MVT::v16i8, Custom);
1097 
1098     setOperationAction(ISD::STRICT_FSQRT,       MVT::v2f64, Legal);
1099     setOperationAction(ISD::STRICT_FADD,        MVT::v2f64, Legal);
1100     setOperationAction(ISD::STRICT_FSUB,        MVT::v2f64, Legal);
1101     setOperationAction(ISD::STRICT_FMUL,        MVT::v2f64, Legal);
1102     setOperationAction(ISD::STRICT_FDIV,        MVT::v2f64, Legal);
1103   }
1104 
1105   if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
1106     setOperationAction(ISD::ABS,                MVT::v16i8, Legal);
1107     setOperationAction(ISD::ABS,                MVT::v8i16, Legal);
1108     setOperationAction(ISD::ABS,                MVT::v4i32, Legal);
1109     setOperationAction(ISD::BITREVERSE,         MVT::v16i8, Custom);
1110     setOperationAction(ISD::CTLZ,               MVT::v16i8, Custom);
1111     setOperationAction(ISD::CTLZ,               MVT::v8i16, Custom);
1112     setOperationAction(ISD::CTLZ,               MVT::v4i32, Custom);
1113     setOperationAction(ISD::CTLZ,               MVT::v2i64, Custom);
1114 
1115     // These might be better off as horizontal vector ops.
1116     setOperationAction(ISD::ADD,                MVT::i16, Custom);
1117     setOperationAction(ISD::ADD,                MVT::i32, Custom);
1118     setOperationAction(ISD::SUB,                MVT::i16, Custom);
1119     setOperationAction(ISD::SUB,                MVT::i32, Custom);
1120   }
1121 
1122   if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
1123     for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
1124       setOperationAction(ISD::FFLOOR,            RoundedTy,  Legal);
1125       setOperationAction(ISD::STRICT_FFLOOR,     RoundedTy,  Legal);
1126       setOperationAction(ISD::FCEIL,             RoundedTy,  Legal);
1127       setOperationAction(ISD::STRICT_FCEIL,      RoundedTy,  Legal);
1128       setOperationAction(ISD::FTRUNC,            RoundedTy,  Legal);
1129       setOperationAction(ISD::STRICT_FTRUNC,     RoundedTy,  Legal);
1130       setOperationAction(ISD::FRINT,             RoundedTy,  Legal);
1131       setOperationAction(ISD::STRICT_FRINT,      RoundedTy,  Legal);
1132       setOperationAction(ISD::FNEARBYINT,        RoundedTy,  Legal);
1133       setOperationAction(ISD::STRICT_FNEARBYINT, RoundedTy,  Legal);
1134       setOperationAction(ISD::FROUNDEVEN,        RoundedTy,  Legal);
1135       setOperationAction(ISD::STRICT_FROUNDEVEN, RoundedTy,  Legal);
1136 
1137       setOperationAction(ISD::FROUND,            RoundedTy,  Custom);
1138     }
1139 
1140     setOperationAction(ISD::SMAX,               MVT::v16i8, Legal);
1141     setOperationAction(ISD::SMAX,               MVT::v4i32, Legal);
1142     setOperationAction(ISD::UMAX,               MVT::v8i16, Legal);
1143     setOperationAction(ISD::UMAX,               MVT::v4i32, Legal);
1144     setOperationAction(ISD::SMIN,               MVT::v16i8, Legal);
1145     setOperationAction(ISD::SMIN,               MVT::v4i32, Legal);
1146     setOperationAction(ISD::UMIN,               MVT::v8i16, Legal);
1147     setOperationAction(ISD::UMIN,               MVT::v4i32, Legal);
1148 
1149     setOperationAction(ISD::UADDSAT,            MVT::v4i32, Custom);
1150 
1151     // FIXME: Do we need to handle scalar-to-vector here?
1152     setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
1153 
1154     // We directly match byte blends in the backend as they match the VSELECT
1155     // condition form.
1156     setOperationAction(ISD::VSELECT,            MVT::v16i8, Legal);
1157 
1158     // SSE41 brings specific instructions for doing vector sign extend even in
1159     // cases where we don't have SRA.
1160     for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1161       setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
1162       setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
1163     }
1164 
1165     // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
1166     for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1167       setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8,  Legal);
1168       setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8,  Legal);
1169       setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8,  Legal);
1170       setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
1171       setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
1172       setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
1173     }
1174 
1175     // i8 vectors are custom because the source register and source
1176     // memory operand types are not the same width.
1177     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
1178 
1179     if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
1180       // We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can
1181       // do the pre and post work in the vector domain.
1182       setOperationAction(ISD::UINT_TO_FP,        MVT::v4i64, Custom);
1183       setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i64, Custom);
1184       // We need to mark SINT_TO_FP as Custom even though we want to expand it
1185       // so that DAG combine doesn't try to turn it into uint_to_fp.
1186       setOperationAction(ISD::SINT_TO_FP,        MVT::v4i64, Custom);
1187       setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i64, Custom);
1188     }
1189   }
1190 
1191   if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
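    // Keyed on SSE4.2 presumably because PCMPGTQ makes the 64-bit compare in
    // the custom v2i64 saturating-add lowering cheap.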
1192     setOperationAction(ISD::UADDSAT,            MVT::v2i64, Custom);
1193   }
1194 
1195   if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
1196     for (auto VT : { MVT::v16i8, MVT::v8i16,  MVT::v4i32, MVT::v2i64,
1197                      MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1198       setOperationAction(ISD::ROTL, VT, Custom);
1199 
1200     // XOP can efficiently perform BITREVERSE with VPPERM.
1201     for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
1202       setOperationAction(ISD::BITREVERSE, VT, Custom);
1203 
1204     for (auto VT : { MVT::v16i8, MVT::v8i16,  MVT::v4i32, MVT::v2i64,
1205                      MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1206       setOperationAction(ISD::BITREVERSE, VT, Custom);
1207   }
1208 
1209   if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
1210     bool HasInt256 = Subtarget.hasInt256();
1211 
1212     addRegisterClass(MVT::v32i8,  Subtarget.hasVLX() ? &X86::VR256XRegClass
1213                                                      : &X86::VR256RegClass);
1214     addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1215                                                      : &X86::VR256RegClass);
1216     addRegisterClass(MVT::v8i32,  Subtarget.hasVLX() ? &X86::VR256XRegClass
1217                                                      : &X86::VR256RegClass);
1218     addRegisterClass(MVT::v8f32,  Subtarget.hasVLX() ? &X86::VR256XRegClass
1219                                                      : &X86::VR256RegClass);
1220     addRegisterClass(MVT::v4i64,  Subtarget.hasVLX() ? &X86::VR256XRegClass
1221                                                      : &X86::VR256RegClass);
1222     addRegisterClass(MVT::v4f64,  Subtarget.hasVLX() ? &X86::VR256XRegClass
1223                                                      : &X86::VR256RegClass);
1224 
1225     for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
1226       setOperationAction(ISD::FFLOOR,            VT, Legal);
1227       setOperationAction(ISD::STRICT_FFLOOR,     VT, Legal);
1228       setOperationAction(ISD::FCEIL,             VT, Legal);
1229       setOperationAction(ISD::STRICT_FCEIL,      VT, Legal);
1230       setOperationAction(ISD::FTRUNC,            VT, Legal);
1231       setOperationAction(ISD::STRICT_FTRUNC,     VT, Legal);
1232       setOperationAction(ISD::FRINT,             VT, Legal);
1233       setOperationAction(ISD::STRICT_FRINT,      VT, Legal);
1234       setOperationAction(ISD::FNEARBYINT,        VT, Legal);
1235       setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
1236       setOperationAction(ISD::FROUNDEVEN,        VT, Legal);
1237       setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);
1238 
1239       setOperationAction(ISD::FROUND,            VT, Custom);
1240 
1241       setOperationAction(ISD::FNEG,              VT, Custom);
1242       setOperationAction(ISD::FABS,              VT, Custom);
1243       setOperationAction(ISD::FCOPYSIGN,         VT, Custom);
1244     }
1245 
1246     // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1247     // even though v8i16 is a legal type.
1248     setOperationPromotedToType(ISD::FP_TO_SINT,        MVT::v8i16, MVT::v8i32);
1249     setOperationPromotedToType(ISD::FP_TO_UINT,        MVT::v8i16, MVT::v8i32);
1250     setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1251     setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1252     setOperationAction(ISD::FP_TO_SINT,                MVT::v8i32, Legal);
1253     setOperationAction(ISD::FP_TO_UINT,                MVT::v8i32, Custom);
1254     setOperationAction(ISD::STRICT_FP_TO_SINT,         MVT::v8i32, Legal);
1255 
1256     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i32, Legal);
1257     setOperationAction(ISD::STRICT_SINT_TO_FP,  MVT::v8i32, Legal);
1258 
1259     setOperationAction(ISD::STRICT_FP_ROUND,    MVT::v4f32, Legal);
1260     setOperationAction(ISD::STRICT_FADD,        MVT::v8f32, Legal);
1261     setOperationAction(ISD::STRICT_FADD,        MVT::v4f64, Legal);
1262     setOperationAction(ISD::STRICT_FSUB,        MVT::v8f32, Legal);
1263     setOperationAction(ISD::STRICT_FSUB,        MVT::v4f64, Legal);
1264     setOperationAction(ISD::STRICT_FMUL,        MVT::v8f32, Legal);
1265     setOperationAction(ISD::STRICT_FMUL,        MVT::v4f64, Legal);
1266     setOperationAction(ISD::STRICT_FDIV,        MVT::v8f32, Legal);
1267     setOperationAction(ISD::STRICT_FDIV,        MVT::v4f64, Legal);
1268     setOperationAction(ISD::STRICT_FP_EXTEND,   MVT::v4f64, Legal);
1269     setOperationAction(ISD::STRICT_FSQRT,       MVT::v8f32, Legal);
1270     setOperationAction(ISD::STRICT_FSQRT,       MVT::v4f64, Legal);
1271 
1272     if (!Subtarget.hasAVX512())
1273       setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);
1274 
1275     // In the customized shift lowering, the legal v8i32/v4i64 cases
1276     // in AVX2 will be recognized.
1277     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1278       setOperationAction(ISD::SRL, VT, Custom);
1279       setOperationAction(ISD::SHL, VT, Custom);
1280       setOperationAction(ISD::SRA, VT, Custom);
1281     }
1282 
1283     // These types need custom splitting if their input is a 128-bit vector.
1284     setOperationAction(ISD::SIGN_EXTEND,       MVT::v8i64,  Custom);
1285     setOperationAction(ISD::SIGN_EXTEND,       MVT::v16i32, Custom);
1286     setOperationAction(ISD::ZERO_EXTEND,       MVT::v8i64,  Custom);
1287     setOperationAction(ISD::ZERO_EXTEND,       MVT::v16i32, Custom);
1288 
1289     setOperationAction(ISD::ROTL,              MVT::v8i32,  Custom);
1290     setOperationAction(ISD::ROTL,              MVT::v16i16, Custom);
1291 
1292     // With BWI, expanding (and promoting the shifts) is better.
1293     if (!Subtarget.useBWIRegs())
1294       setOperationAction(ISD::ROTL,            MVT::v32i8,  Custom);
1295 
1296     setOperationAction(ISD::SELECT,            MVT::v4f64, Custom);
1297     setOperationAction(ISD::SELECT,            MVT::v4i64, Custom);
1298     setOperationAction(ISD::SELECT,            MVT::v8i32, Custom);
1299     setOperationAction(ISD::SELECT,            MVT::v16i16, Custom);
1300     setOperationAction(ISD::SELECT,            MVT::v32i8, Custom);
1301     setOperationAction(ISD::SELECT,            MVT::v8f32, Custom);
1302 
1303     for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1304       setOperationAction(ISD::SIGN_EXTEND,     VT, Custom);
1305       setOperationAction(ISD::ZERO_EXTEND,     VT, Custom);
1306       setOperationAction(ISD::ANY_EXTEND,      VT, Custom);
1307     }
1308 
1309     setOperationAction(ISD::TRUNCATE,          MVT::v16i8, Custom);
1310     setOperationAction(ISD::TRUNCATE,          MVT::v8i16, Custom);
1311     setOperationAction(ISD::TRUNCATE,          MVT::v4i32, Custom);
1312     setOperationAction(ISD::BITREVERSE,        MVT::v32i8, Custom);
1313 
1314     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1315       setOperationAction(ISD::SETCC,           VT, Custom);
1316       setOperationAction(ISD::STRICT_FSETCC,   VT, Custom);
1317       setOperationAction(ISD::STRICT_FSETCCS,  VT, Custom);
1318       setOperationAction(ISD::CTPOP,           VT, Custom);
1319       setOperationAction(ISD::CTLZ,            VT, Custom);
1320 
1321       // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1322       // setcc all the way to isel and prefer SETGT in some isel patterns.
1323       setCondCodeAction(ISD::SETLT, VT, Custom);
1324       setCondCodeAction(ISD::SETLE, VT, Custom);
1325     }
1326 
1327     if (Subtarget.hasAnyFMA()) {
1328       for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1329                        MVT::v2f64, MVT::v4f64 }) {
1330         setOperationAction(ISD::FMA, VT, Legal);
1331         setOperationAction(ISD::STRICT_FMA, VT, Legal);
1332       }
1333     }
1334 
1335     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1336       setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1337       setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1338     }
1339 
1340     setOperationAction(ISD::MUL,       MVT::v4i64,  Custom);
1341     setOperationAction(ISD::MUL,       MVT::v8i32,  HasInt256 ? Legal : Custom);
1342     setOperationAction(ISD::MUL,       MVT::v16i16, HasInt256 ? Legal : Custom);
1343     setOperationAction(ISD::MUL,       MVT::v32i8,  Custom);
1344 
1345     setOperationAction(ISD::MULHU,     MVT::v8i32,  Custom);
1346     setOperationAction(ISD::MULHS,     MVT::v8i32,  Custom);
1347     setOperationAction(ISD::MULHU,     MVT::v16i16, HasInt256 ? Legal : Custom);
1348     setOperationAction(ISD::MULHS,     MVT::v16i16, HasInt256 ? Legal : Custom);
1349     setOperationAction(ISD::MULHU,     MVT::v32i8,  Custom);
1350     setOperationAction(ISD::MULHS,     MVT::v32i8,  Custom);
1351 
1352     setOperationAction(ISD::SMULO,     MVT::v32i8, Custom);
1353     setOperationAction(ISD::UMULO,     MVT::v32i8, Custom);
1354 
1355     setOperationAction(ISD::ABS,       MVT::v4i64,  Custom);
1356     setOperationAction(ISD::SMAX,      MVT::v4i64,  Custom);
1357     setOperationAction(ISD::UMAX,      MVT::v4i64,  Custom);
1358     setOperationAction(ISD::SMIN,      MVT::v4i64,  Custom);
1359     setOperationAction(ISD::UMIN,      MVT::v4i64,  Custom);
1360 
1361     setOperationAction(ISD::UADDSAT,   MVT::v32i8,  HasInt256 ? Legal : Custom);
1362     setOperationAction(ISD::SADDSAT,   MVT::v32i8,  HasInt256 ? Legal : Custom);
1363     setOperationAction(ISD::USUBSAT,   MVT::v32i8,  HasInt256 ? Legal : Custom);
1364     setOperationAction(ISD::SSUBSAT,   MVT::v32i8,  HasInt256 ? Legal : Custom);
1365     setOperationAction(ISD::UADDSAT,   MVT::v16i16, HasInt256 ? Legal : Custom);
1366     setOperationAction(ISD::SADDSAT,   MVT::v16i16, HasInt256 ? Legal : Custom);
1367     setOperationAction(ISD::USUBSAT,   MVT::v16i16, HasInt256 ? Legal : Custom);
1368     setOperationAction(ISD::SSUBSAT,   MVT::v16i16, HasInt256 ? Legal : Custom);
1369     setOperationAction(ISD::UADDSAT,   MVT::v8i32, Custom);
1370     setOperationAction(ISD::USUBSAT,   MVT::v8i32, Custom);
1371     setOperationAction(ISD::UADDSAT,   MVT::v4i64, Custom);
1372     setOperationAction(ISD::USUBSAT,   MVT::v4i64, Custom);
1373 
1374     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1375       setOperationAction(ISD::ABS,  VT, HasInt256 ? Legal : Custom);
1376       setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1377       setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1378       setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1379       setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1380     }
1381 
1382     for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
1383       setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
1384       setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
1385     }
1386 
1387     if (HasInt256) {
1388       // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1389       // when we have a 256-bit-wide blend with immediate.
1390       setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
1391       setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32, Custom);
1392 
1393       // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1394       for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1395         setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1396         setLoadExtAction(LoadExtOp, MVT::v8i32,  MVT::v8i8,  Legal);
1397         setLoadExtAction(LoadExtOp, MVT::v4i64,  MVT::v4i8,  Legal);
1398         setLoadExtAction(LoadExtOp, MVT::v8i32,  MVT::v8i16, Legal);
1399         setLoadExtAction(LoadExtOp, MVT::v4i64,  MVT::v4i16, Legal);
1400         setLoadExtAction(LoadExtOp, MVT::v4i64,  MVT::v4i32, Legal);
1401       }
1402     }
1403 
1404     for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1405                      MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1406       setOperationAction(ISD::MLOAD,  VT, Subtarget.hasVLX() ? Legal : Custom);
1407       setOperationAction(ISD::MSTORE, VT, Legal);
1408     }
1409 
1410     // Extract subvector is special because the value type
1411     // (result) is 128-bit but the source is 256-bit wide.
1412     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1413                      MVT::v4f32, MVT::v2f64 }) {
1414       setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1415     }
1416 
1417     // Custom lower several nodes for 256-bit types.
1418     for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1419                     MVT::v8f32, MVT::v4f64 }) {
1420       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
1421       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
1422       setOperationAction(ISD::VSELECT,            VT, Custom);
1423       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
1424       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1425       setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
1426       setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Legal);
1427       setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
1428       setOperationAction(ISD::STORE,              VT, Custom);
1429     }
1430 
1431     if (HasInt256) {
1432       setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
1433 
1434       // Custom legalize 2x32 to get a little better code.
1435       setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
1436       setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);
1437 
1438       for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1439                        MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1440         setOperationAction(ISD::MGATHER,  VT, Custom);
1441     }
1442   }
1443 
1444   // This block controls legalization of the mask vector sizes that are
1445   // available with AVX512. 512-bit vectors are in a separate block controlled
1446   // by useAVX512Regs.
1447   if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1448     addRegisterClass(MVT::v1i1,   &X86::VK1RegClass);
1449     addRegisterClass(MVT::v2i1,   &X86::VK2RegClass);
1450     addRegisterClass(MVT::v4i1,   &X86::VK4RegClass);
1451     addRegisterClass(MVT::v8i1,   &X86::VK8RegClass);
1452     addRegisterClass(MVT::v16i1,  &X86::VK16RegClass);
1453 
1454     setOperationAction(ISD::SELECT,             MVT::v1i1, Custom);
1455     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
1456     setOperationAction(ISD::BUILD_VECTOR,       MVT::v1i1, Custom);
1457 
1458     setOperationPromotedToType(ISD::FP_TO_SINT,        MVT::v8i1,  MVT::v8i32);
1459     setOperationPromotedToType(ISD::FP_TO_UINT,        MVT::v8i1,  MVT::v8i32);
1460     setOperationPromotedToType(ISD::FP_TO_SINT,        MVT::v4i1,  MVT::v4i32);
1461     setOperationPromotedToType(ISD::FP_TO_UINT,        MVT::v4i1,  MVT::v4i32);
1462     setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1,  MVT::v8i32);
1463     setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1,  MVT::v8i32);
1464     setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1,  MVT::v4i32);
1465     setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1,  MVT::v4i32);
1466     setOperationAction(ISD::FP_TO_SINT,                MVT::v2i1,  Custom);
1467     setOperationAction(ISD::FP_TO_UINT,                MVT::v2i1,  Custom);
1468     setOperationAction(ISD::STRICT_FP_TO_SINT,         MVT::v2i1,  Custom);
1469     setOperationAction(ISD::STRICT_FP_TO_UINT,         MVT::v2i1,  Custom);
1470 
1471     // There is no byte sized k-register load or store without AVX512DQ.
1472     if (!Subtarget.hasDQI()) {
1473       setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
1474       setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
1475       setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
1476       setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
1477 
1478       setOperationAction(ISD::STORE, MVT::v1i1, Custom);
1479       setOperationAction(ISD::STORE, MVT::v2i1, Custom);
1480       setOperationAction(ISD::STORE, MVT::v4i1, Custom);
1481       setOperationAction(ISD::STORE, MVT::v8i1, Custom);
1482     }
1483 
1484     // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
1485     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1486       setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
1487       setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1488       setOperationAction(ISD::ANY_EXTEND,  VT, Custom);
1489     }
1490 
1491     for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 })
1492       setOperationAction(ISD::VSELECT,          VT, Expand);
1493 
1494     for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1495       setOperationAction(ISD::SETCC,            VT, Custom);
1496       setOperationAction(ISD::STRICT_FSETCC,    VT, Custom);
1497       setOperationAction(ISD::STRICT_FSETCCS,   VT, Custom);
1498       setOperationAction(ISD::SELECT,           VT, Custom);
1499       setOperationAction(ISD::TRUNCATE,         VT, Custom);
1500 
1501       setOperationAction(ISD::BUILD_VECTOR,     VT, Custom);
1502       setOperationAction(ISD::CONCAT_VECTORS,   VT, Custom);
1503       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1504       setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1505       setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1506       setOperationAction(ISD::VECTOR_SHUFFLE,   VT,  Custom);
1507     }
1508 
1509     for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1510       setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1511   }
1512 
1513   // This block controls legalization for 512-bit operations with 32/64 bit
1514   // elements. 512-bits can be disabled based on prefer-vector-width and
1515   // required-vector-width function attributes.
1516   if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
1517     bool HasBWI = Subtarget.hasBWI();
1518 
1519     addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1520     addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1521     addRegisterClass(MVT::v8i64,  &X86::VR512RegClass);
1522     addRegisterClass(MVT::v8f64,  &X86::VR512RegClass);
1523     addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1524     addRegisterClass(MVT::v64i8,  &X86::VR512RegClass);
1525 
1526     for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1527       setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8,  Legal);
1528       setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1529       setLoadExtAction(ExtType, MVT::v8i64,  MVT::v8i8,   Legal);
1530       setLoadExtAction(ExtType, MVT::v8i64,  MVT::v8i16,  Legal);
1531       setLoadExtAction(ExtType, MVT::v8i64,  MVT::v8i32,  Legal);
1532       if (HasBWI)
1533         setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1534     }
1535 
1536     for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1537       setOperationAction(ISD::FNEG,  VT, Custom);
1538       setOperationAction(ISD::FABS,  VT, Custom);
1539       setOperationAction(ISD::FMA,   VT, Legal);
1540       setOperationAction(ISD::STRICT_FMA, VT, Legal);
1541       setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1542     }
1543 
1544     for (MVT VT : { MVT::v16i1, MVT::v16i8, MVT::v16i16 }) {
1545       setOperationPromotedToType(ISD::FP_TO_SINT       , VT, MVT::v16i32);
1546       setOperationPromotedToType(ISD::FP_TO_UINT       , VT, MVT::v16i32);
1547       setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, VT, MVT::v16i32);
1548       setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, VT, MVT::v16i32);
1549     }
1550     setOperationAction(ISD::FP_TO_SINT,        MVT::v16i32, Legal);
1551     setOperationAction(ISD::FP_TO_UINT,        MVT::v16i32, Legal);
1552     setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v16i32, Legal);
1553     setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v16i32, Legal);
1554     setOperationAction(ISD::SINT_TO_FP,        MVT::v16i32, Legal);
1555     setOperationAction(ISD::UINT_TO_FP,        MVT::v16i32, Legal);
1556     setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i32, Legal);
1557     setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i32, Legal);
1558 
1559     setOperationAction(ISD::STRICT_FADD,      MVT::v16f32, Legal);
1560     setOperationAction(ISD::STRICT_FADD,      MVT::v8f64,  Legal);
1561     setOperationAction(ISD::STRICT_FSUB,      MVT::v16f32, Legal);
1562     setOperationAction(ISD::STRICT_FSUB,      MVT::v8f64,  Legal);
1563     setOperationAction(ISD::STRICT_FMUL,      MVT::v16f32, Legal);
1564     setOperationAction(ISD::STRICT_FMUL,      MVT::v8f64,  Legal);
1565     setOperationAction(ISD::STRICT_FDIV,      MVT::v16f32, Legal);
1566     setOperationAction(ISD::STRICT_FDIV,      MVT::v8f64,  Legal);
1567     setOperationAction(ISD::STRICT_FSQRT,     MVT::v16f32, Legal);
1568     setOperationAction(ISD::STRICT_FSQRT,     MVT::v8f64,  Legal);
1569     setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64,  Legal);
1570     setOperationAction(ISD::STRICT_FP_ROUND,  MVT::v8f32,  Legal);
1571 
1572     setTruncStoreAction(MVT::v8i64,   MVT::v8i8,   Legal);
1573     setTruncStoreAction(MVT::v8i64,   MVT::v8i16,  Legal);
1574     setTruncStoreAction(MVT::v8i64,   MVT::v8i32,  Legal);
1575     setTruncStoreAction(MVT::v16i32,  MVT::v16i8,  Legal);
1576     setTruncStoreAction(MVT::v16i32,  MVT::v16i16, Legal);
1577     if (HasBWI)
1578       setTruncStoreAction(MVT::v32i16,  MVT::v32i8, Legal);
1579 
1580     // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1581     // to 512-bit rather than use the AVX2 instructions so that we can use
1582     // k-masks.
1583     if (!Subtarget.hasVLX()) {
1584       for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1585            MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1586         setOperationAction(ISD::MLOAD,  VT, Custom);
1587         setOperationAction(ISD::MSTORE, VT, Custom);
1588       }
1589     }
1590 
1591     setOperationAction(ISD::TRUNCATE,    MVT::v8i32,  Legal);
1592     setOperationAction(ISD::TRUNCATE,    MVT::v16i16, Legal);
1593     setOperationAction(ISD::TRUNCATE,    MVT::v32i8,  HasBWI ? Legal : Custom);
1594     setOperationAction(ISD::TRUNCATE,    MVT::v16i64, Custom);
1595     setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
1596     setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1597     setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64,  Custom);
1598     setOperationAction(ISD::ANY_EXTEND,  MVT::v32i16, Custom);
1599     setOperationAction(ISD::ANY_EXTEND,  MVT::v16i32, Custom);
1600     setOperationAction(ISD::ANY_EXTEND,  MVT::v8i64,  Custom);
1601     setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
1602     setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1603     setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64,  Custom);
1604 
1605     if (HasBWI) {
1606       // Extends from v64i1 masks to 512-bit vectors.
1607       setOperationAction(ISD::SIGN_EXTEND,        MVT::v64i8, Custom);
1608       setOperationAction(ISD::ZERO_EXTEND,        MVT::v64i8, Custom);
1609       setOperationAction(ISD::ANY_EXTEND,         MVT::v64i8, Custom);
1610     }
1611 
1612     for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1613       setOperationAction(ISD::FFLOOR,            VT, Legal);
1614       setOperationAction(ISD::STRICT_FFLOOR,     VT, Legal);
1615       setOperationAction(ISD::FCEIL,             VT, Legal);
1616       setOperationAction(ISD::STRICT_FCEIL,      VT, Legal);
1617       setOperationAction(ISD::FTRUNC,            VT, Legal);
1618       setOperationAction(ISD::STRICT_FTRUNC,     VT, Legal);
1619       setOperationAction(ISD::FRINT,             VT, Legal);
1620       setOperationAction(ISD::STRICT_FRINT,      VT, Legal);
1621       setOperationAction(ISD::FNEARBYINT,        VT, Legal);
1622       setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
1623       setOperationAction(ISD::FROUNDEVEN,        VT, Legal);
1624       setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);
1625 
1626       setOperationAction(ISD::FROUND,            VT, Custom);
1627     }
1628 
1629     for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
1630       setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
1631       setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
1632     }
1633 
1634     setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
1635     setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
1636     setOperationAction(ISD::ADD, MVT::v64i8,  HasBWI ? Legal : Custom);
1637     setOperationAction(ISD::SUB, MVT::v64i8,  HasBWI ? Legal : Custom);
1638 
1639     setOperationAction(ISD::MUL, MVT::v8i64,  Custom);
1640     setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1641     setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
1642     setOperationAction(ISD::MUL, MVT::v64i8,  Custom);
1643 
1644     setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
1645     setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
1646     setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
1647     setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
1648     setOperationAction(ISD::MULHS, MVT::v64i8,  Custom);
1649     setOperationAction(ISD::MULHU, MVT::v64i8,  Custom);
1650 
1651     setOperationAction(ISD::SMULO, MVT::v64i8, Custom);
1652     setOperationAction(ISD::UMULO, MVT::v64i8, Custom);
1653 
1654     setOperationAction(ISD::BITREVERSE, MVT::v64i8,  Custom);
1655 
1656     for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1657       setOperationAction(ISD::SRL,              VT, Custom);
1658       setOperationAction(ISD::SHL,              VT, Custom);
1659       setOperationAction(ISD::SRA,              VT, Custom);
1660       setOperationAction(ISD::SETCC,            VT, Custom);
1661 
1662       // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1663       // setcc all the way to isel and prefer SETGT in some isel patterns.
1664       setCondCodeAction(ISD::SETLT, VT, Custom);
1665       setCondCodeAction(ISD::SETLE, VT, Custom);
1666     }
1667     for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1668       setOperationAction(ISD::SMAX,             VT, Legal);
1669       setOperationAction(ISD::UMAX,             VT, Legal);
1670       setOperationAction(ISD::SMIN,             VT, Legal);
1671       setOperationAction(ISD::UMIN,             VT, Legal);
1672       setOperationAction(ISD::ABS,              VT, Legal);
1673       setOperationAction(ISD::CTPOP,            VT, Custom);
1674       setOperationAction(ISD::ROTL,             VT, Custom);
1675       setOperationAction(ISD::ROTR,             VT, Custom);
1676       setOperationAction(ISD::STRICT_FSETCC,    VT, Custom);
1677       setOperationAction(ISD::STRICT_FSETCCS,   VT, Custom);
1678     }
1679 
1680     for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1681       setOperationAction(ISD::ABS,     VT, HasBWI ? Legal : Custom);
1682       setOperationAction(ISD::CTPOP,   VT, Subtarget.hasBITALG() ? Legal : Custom);
1683       setOperationAction(ISD::CTLZ,    VT, Custom);
1684       setOperationAction(ISD::SMAX,    VT, HasBWI ? Legal : Custom);
1685       setOperationAction(ISD::UMAX,    VT, HasBWI ? Legal : Custom);
1686       setOperationAction(ISD::SMIN,    VT, HasBWI ? Legal : Custom);
1687       setOperationAction(ISD::UMIN,    VT, HasBWI ? Legal : Custom);
1688       setOperationAction(ISD::UADDSAT, VT, HasBWI ? Legal : Custom);
1689       setOperationAction(ISD::SADDSAT, VT, HasBWI ? Legal : Custom);
1690       setOperationAction(ISD::USUBSAT, VT, HasBWI ? Legal : Custom);
1691       setOperationAction(ISD::SSUBSAT, VT, HasBWI ? Legal : Custom);
1692     }
1693 
1694     if (Subtarget.hasDQI()) {
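      // AVX512DQ adds direct 64-bit integer <-> FP conversions (VCVT*QQ*) and
      // a native 64-bit element multiply (VPMULLQ).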
1695       setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);
1696       setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
1697       setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i64, Legal);
1698       setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i64, Legal);
1699       setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
1700       setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);
1701       setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i64, Legal);
1702       setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i64, Legal);
1703 
1704       setOperationAction(ISD::MUL,        MVT::v8i64, Legal);
1705     }
1706 
1707     if (Subtarget.hasCDI()) {
1708       // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
1709       for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
1710         setOperationAction(ISD::CTLZ,            VT, Legal);
1711       }
1712     } // Subtarget.hasCDI()
1713 
1714     if (Subtarget.hasVPOPCNTDQ()) {
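      // AVX512VPOPCNTDQ provides native population count for 32/64-bit
      // elements.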
1715       for (auto VT : { MVT::v16i32, MVT::v8i64 })
1716         setOperationAction(ISD::CTPOP, VT, Legal);
1717     }
1718 
1719     // Extract subvector is special because the value type
1720     // (result) is 256-bit but the source is 512-bit wide.
1721     // 128-bit was made Legal under AVX1.
1722     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1723                      MVT::v8f32, MVT::v4f64 })
1724       setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1725 
1726     for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
1727                      MVT::v16f32, MVT::v8f64 }) {
1728       setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
1729       setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Legal);
1730       setOperationAction(ISD::SELECT,             VT, Custom);
1731       setOperationAction(ISD::VSELECT,            VT, Custom);
1732       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
1733       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1734       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
1735       setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
1736       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
1737     }
1738 
1739     for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
1740       setOperationAction(ISD::MLOAD,               VT, Legal);
1741       setOperationAction(ISD::MSTORE,              VT, Legal);
1742       setOperationAction(ISD::MGATHER,             VT, Custom);
1743       setOperationAction(ISD::MSCATTER,            VT, Custom);
1744     }
1745     if (HasBWI) {
1746       for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1747         setOperationAction(ISD::MLOAD,        VT, Legal);
1748         setOperationAction(ISD::MSTORE,       VT, Legal);
1749       }
1750     } else {
1751       setOperationAction(ISD::STORE, MVT::v32i16, Custom);
1752       setOperationAction(ISD::STORE, MVT::v64i8,  Custom);
1753     }
1754 
1755     if (Subtarget.hasVBMI2()) {
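      // AVX512VBMI2 adds the concat-and-shift instructions (VPSHLD/VPSHRD and
      // their variable forms), which back the funnel-shift lowerings below.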
1756       for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64,
1757                        MVT::v16i16, MVT::v8i32, MVT::v4i64,
1758                        MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1759         setOperationAction(ISD::FSHL, VT, Custom);
1760         setOperationAction(ISD::FSHR, VT, Custom);
1761       }
1762 
1763       setOperationAction(ISD::ROTL, MVT::v32i16, Custom);
1764       setOperationAction(ISD::ROTR, MVT::v8i16,  Custom);
1765       setOperationAction(ISD::ROTR, MVT::v16i16, Custom);
1766       setOperationAction(ISD::ROTR, MVT::v32i16, Custom);
1767     }
1768   }// useAVX512Regs
1769 
1770   // This block controls legalization for operations that don't have
1771   // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
1772   // narrower widths.
1773   if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1774     // These operations are handled on non-VLX by artificially widening in
1775     // isel patterns.
1776 
1777     setOperationAction(ISD::FP_TO_UINT, MVT::v8i32,
1778                        Subtarget.hasVLX() ? Legal : Custom);
1779     setOperationAction(ISD::FP_TO_UINT, MVT::v4i32,
1780                        Subtarget.hasVLX() ? Legal : Custom);
1781     setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32,
1782                        Subtarget.hasVLX() ? Legal : Custom);
1783     setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32,
1784                        Subtarget.hasVLX() ? Legal : Custom);
1785     setOperationAction(ISD::STRICT_FP_TO_UINT,  MVT::v2i32, Custom);
1786     setOperationAction(ISD::UINT_TO_FP, MVT::v8i32,
1787                        Subtarget.hasVLX() ? Legal : Custom);
1788     setOperationAction(ISD::UINT_TO_FP, MVT::v4i32,
1789                        Subtarget.hasVLX() ? Legal : Custom);
1790     setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32,
1791                        Subtarget.hasVLX() ? Legal : Custom);
1792     setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32,
1793                        Subtarget.hasVLX() ? Legal : Custom);
1794 
1795     if (Subtarget.hasDQI()) {
1796       // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
1797       // v2f32 UINT_TO_FP is already custom under SSE2.
1798       assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) &&
1799              isOperationCustom(ISD::STRICT_UINT_TO_FP, MVT::v2f32) &&
1800              "Unexpected operation action!");
1801       // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
1802       setOperationAction(ISD::FP_TO_SINT,        MVT::v2f32, Custom);
1803       setOperationAction(ISD::FP_TO_UINT,        MVT::v2f32, Custom);
1804       setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f32, Custom);
1805       setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f32, Custom);
1806     }
1807 
1808     for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1809       setOperationAction(ISD::SMAX, VT, Legal);
1810       setOperationAction(ISD::UMAX, VT, Legal);
1811       setOperationAction(ISD::SMIN, VT, Legal);
1812       setOperationAction(ISD::UMIN, VT, Legal);
1813       setOperationAction(ISD::ABS,  VT, Legal);
1814     }
1815 
1816     for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
1817       setOperationAction(ISD::ROTL,     VT, Custom);
1818       setOperationAction(ISD::ROTR,     VT, Custom);
1819     }
1820 
1821     // Custom legalize 2x32 to get a little better code.
1822     setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom);
1823     setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom);
1824 
1825     for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1826                      MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1827       setOperationAction(ISD::MSCATTER, VT, Custom);
1828 
1829     if (Subtarget.hasDQI()) {
1830       for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1831         setOperationAction(ISD::SINT_TO_FP, VT,
1832                            Subtarget.hasVLX() ? Legal : Custom);
1833         setOperationAction(ISD::UINT_TO_FP, VT,
1834                            Subtarget.hasVLX() ? Legal : Custom);
1835         setOperationAction(ISD::STRICT_SINT_TO_FP, VT,
1836                            Subtarget.hasVLX() ? Legal : Custom);
1837         setOperationAction(ISD::STRICT_UINT_TO_FP, VT,
1838                            Subtarget.hasVLX() ? Legal : Custom);
1839         setOperationAction(ISD::FP_TO_SINT, VT,
1840                            Subtarget.hasVLX() ? Legal : Custom);
1841         setOperationAction(ISD::FP_TO_UINT, VT,
1842                            Subtarget.hasVLX() ? Legal : Custom);
1843         setOperationAction(ISD::STRICT_FP_TO_SINT, VT,
1844                            Subtarget.hasVLX() ? Legal : Custom);
1845         setOperationAction(ISD::STRICT_FP_TO_UINT, VT,
1846                            Subtarget.hasVLX() ? Legal : Custom);
1847         setOperationAction(ISD::MUL,               VT, Legal);
1848       }
1849     }
1850 
1851     if (Subtarget.hasCDI()) {
1852       for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
1853         setOperationAction(ISD::CTLZ,            VT, Legal);
1854       }
1855     } // Subtarget.hasCDI()
1856 
1857     if (Subtarget.hasVPOPCNTDQ()) {
1858       for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
1859         setOperationAction(ISD::CTPOP, VT, Legal);
1860     }
1861   }
1862 
1863   // This block controls legalization of v32i1/v64i1, which are available with
1864   // AVX512BW. 512-bit v32i16 and v64i8 vector legalization is controlled with
1865   // useBWIRegs.
1866   if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
1867     addRegisterClass(MVT::v32i1,  &X86::VK32RegClass);
1868     addRegisterClass(MVT::v64i1,  &X86::VK64RegClass);
1869 
1870     for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
1871       setOperationAction(ISD::VSELECT,            VT, Expand);
1872       setOperationAction(ISD::TRUNCATE,           VT, Custom);
1873       setOperationAction(ISD::SETCC,              VT, Custom);
1874       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1875       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
1876       setOperationAction(ISD::SELECT,             VT, Custom);
1877       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
1878       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
1879       setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
1880       setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Custom);
1881     }
1882 
1883     for (auto VT : { MVT::v16i1, MVT::v32i1 })
1884       setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1885 
1886     // Extends from v32i1 masks to 256-bit vectors.
1887     setOperationAction(ISD::SIGN_EXTEND,        MVT::v32i8, Custom);
1888     setOperationAction(ISD::ZERO_EXTEND,        MVT::v32i8, Custom);
1889     setOperationAction(ISD::ANY_EXTEND,         MVT::v32i8, Custom);
1890 
1891     for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
1892       setOperationAction(ISD::MLOAD,  VT, Subtarget.hasVLX() ? Legal : Custom);
1893       setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
1894     }
1895 
1896     // These operations are handled on non-VLX by artificially widening in
1897     // isel patterns.
1898     // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
1899 
1900     if (Subtarget.hasBITALG()) {
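      // AVX512BITALG provides native byte/word population count
      // (VPOPCNTB/VPOPCNTW).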
1901       for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
1902         setOperationAction(ISD::CTPOP, VT, Legal);
1903     }
1904   }
1905 
1906   if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
1907     setTruncStoreAction(MVT::v4i64, MVT::v4i8,  Legal);
1908     setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
1909     setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
1910     setTruncStoreAction(MVT::v8i32, MVT::v8i8,  Legal);
1911     setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
1912 
1913     setTruncStoreAction(MVT::v2i64, MVT::v2i8,  Legal);
1914     setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
1915     setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
1916     setTruncStoreAction(MVT::v4i32, MVT::v4i8,  Legal);
1917     setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
1918 
1919     if (Subtarget.hasBWI()) {
1920       setTruncStoreAction(MVT::v16i16,  MVT::v16i8, Legal);
1921       setTruncStoreAction(MVT::v8i16,   MVT::v8i8,  Legal);
1922     }
1923 
1924     setOperationAction(ISD::TRUNCATE, MVT::v16i32, Custom);
1925     setOperationAction(ISD::TRUNCATE, MVT::v8i64, Custom);
1926     setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
1927   }
1928 
1929   if (Subtarget.hasAMXTILE()) {
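    // AMX tile data (x86amx) lives in the TMM0-TMM7 tile registers.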
1930     addRegisterClass(MVT::x86amx, &X86::TILERegClass);
1931   }
1932 
1933   // We want to custom lower some of our intrinsics.
1934   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1935   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
1936   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
1937   if (!Subtarget.is64Bit()) {
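    // Some chained intrinsics produce an i64 result, which needs custom
    // legalization when i64 is not a legal scalar type (32-bit mode).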
1938     setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
1939   }
1940 
1941   // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
1942   // handle type legalization for these operations here.
1943   //
1944   // FIXME: We really should do custom legalization for addition and
1945   // subtraction on x86-32 once PR3203 is fixed.  We really can't do much better
1946   // than generic legalization for 64-bit multiplication-with-overflow, though.
1947   for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
1948     if (VT == MVT::i64 && !Subtarget.is64Bit())
1949       continue;
1950     // Add/Sub/Mul with overflow operations are custom lowered.
1951     setOperationAction(ISD::SADDO, VT, Custom);
1952     setOperationAction(ISD::UADDO, VT, Custom);
1953     setOperationAction(ISD::SSUBO, VT, Custom);
1954     setOperationAction(ISD::USUBO, VT, Custom);
1955     setOperationAction(ISD::SMULO, VT, Custom);
1956     setOperationAction(ISD::UMULO, VT, Custom);
1957 
1958     // Support carry in as value rather than glue.
1959     setOperationAction(ISD::ADDCARRY, VT, Custom);
1960     setOperationAction(ISD::SUBCARRY, VT, Custom);
1961     setOperationAction(ISD::SETCCCARRY, VT, Custom);
1962     setOperationAction(ISD::SADDO_CARRY, VT, Custom);
1963     setOperationAction(ISD::SSUBO_CARRY, VT, Custom);
1964   }
1965 
1966   if (!Subtarget.is64Bit()) {
1967     // These libcalls are not available in 32-bit.
1968     setLibcallName(RTLIB::SHL_I128, nullptr);
1969     setLibcallName(RTLIB::SRL_I128, nullptr);
1970     setLibcallName(RTLIB::SRA_I128, nullptr);
1971     setLibcallName(RTLIB::MUL_I128, nullptr);
1972   }
1973 
1974   // Combine sin / cos into _sincos_stret if it is available.
1975   if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
1976       getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
1977     setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1978     setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1979   }
1980 
1981   if (Subtarget.isTargetWin64()) {
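    // On Win64 the 128-bit divide/remainder libcalls take their arguments by
    // reference, so the calls are set up via custom lowering.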
1982     setOperationAction(ISD::SDIV, MVT::i128, Custom);
1983     setOperationAction(ISD::UDIV, MVT::i128, Custom);
1984     setOperationAction(ISD::SREM, MVT::i128, Custom);
1985     setOperationAction(ISD::UREM, MVT::i128, Custom);
1986   }
1987 
1988   // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
1989   // is. We should promote the value to 64-bits to solve this.
1990   // This is what the CRT headers do - `fmodf` is an inline header
1991   // function casting to f64 and calling `fmod`.
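  // (Roughly: float fmodf(float x, float y) { return (float)fmod(x, y); }.)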
1992   if (Subtarget.is32Bit() &&
1993       (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
1994     for (ISD::NodeType Op :
1995          {ISD::FCEIL,  ISD::STRICT_FCEIL,
1996           ISD::FCOS,   ISD::STRICT_FCOS,
1997           ISD::FEXP,   ISD::STRICT_FEXP,
1998           ISD::FFLOOR, ISD::STRICT_FFLOOR,
1999           ISD::FREM,   ISD::STRICT_FREM,
2000           ISD::FLOG,   ISD::STRICT_FLOG,
2001           ISD::FLOG10, ISD::STRICT_FLOG10,
2002           ISD::FPOW,   ISD::STRICT_FPOW,
2003           ISD::FSIN,   ISD::STRICT_FSIN})
2004       if (isOperationExpand(Op, MVT::f32))
2005         setOperationAction(Op, MVT::f32, Promote);
2006 
2007   // We have target-specific dag combine patterns for the following nodes:
2008   setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
2009   setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
2010   setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
2011   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
2012   setTargetDAGCombine(ISD::CONCAT_VECTORS);
2013   setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
2014   setTargetDAGCombine(ISD::EXTRACT_SUBVECTOR);
2015   setTargetDAGCombine(ISD::BITCAST);
2016   setTargetDAGCombine(ISD::VSELECT);
2017   setTargetDAGCombine(ISD::SELECT);
2018   setTargetDAGCombine(ISD::SHL);
2019   setTargetDAGCombine(ISD::SRA);
2020   setTargetDAGCombine(ISD::SRL);
2021   setTargetDAGCombine(ISD::OR);
2022   setTargetDAGCombine(ISD::AND);
2023   setTargetDAGCombine(ISD::ADD);
2024   setTargetDAGCombine(ISD::FADD);
2025   setTargetDAGCombine(ISD::FSUB);
2026   setTargetDAGCombine(ISD::FNEG);
2027   setTargetDAGCombine(ISD::FMA);
2028   setTargetDAGCombine(ISD::STRICT_FMA);
2029   setTargetDAGCombine(ISD::FMINNUM);
2030   setTargetDAGCombine(ISD::FMAXNUM);
2031   setTargetDAGCombine(ISD::SUB);
2032   setTargetDAGCombine(ISD::LOAD);
2033   setTargetDAGCombine(ISD::MLOAD);
2034   setTargetDAGCombine(ISD::STORE);
2035   setTargetDAGCombine(ISD::MSTORE);
2036   setTargetDAGCombine(ISD::TRUNCATE);
2037   setTargetDAGCombine(ISD::ZERO_EXTEND);
2038   setTargetDAGCombine(ISD::ANY_EXTEND);
2039   setTargetDAGCombine(ISD::SIGN_EXTEND);
2040   setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
2041   setTargetDAGCombine(ISD::ANY_EXTEND_VECTOR_INREG);
2042   setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG);
2043   setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG);
2044   setTargetDAGCombine(ISD::SINT_TO_FP);
2045   setTargetDAGCombine(ISD::UINT_TO_FP);
2046   setTargetDAGCombine(ISD::STRICT_SINT_TO_FP);
2047   setTargetDAGCombine(ISD::STRICT_UINT_TO_FP);
2048   setTargetDAGCombine(ISD::SETCC);
2049   setTargetDAGCombine(ISD::MUL);
2050   setTargetDAGCombine(ISD::XOR);
2051   setTargetDAGCombine(ISD::MSCATTER);
2052   setTargetDAGCombine(ISD::MGATHER);
2053   setTargetDAGCombine(ISD::FP16_TO_FP);
2054   setTargetDAGCombine(ISD::FP_EXTEND);
2055   setTargetDAGCombine(ISD::STRICT_FP_EXTEND);
2056   setTargetDAGCombine(ISD::FP_ROUND);
2057 
2058   computeRegisterProperties(Subtarget.getRegisterInfo());
2059 
2060   MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
2061   MaxStoresPerMemsetOptSize = 8;
2062   MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
2063   MaxStoresPerMemcpyOptSize = 4;
2064   MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
2065   MaxStoresPerMemmoveOptSize = 4;
2066 
2067   // TODO: These control memcmp expansion in CGP and could be raised higher, but
2068   // that needs to be benchmarked and balanced with the potential use of vector
2069   // load/store types (PR33329, PR33914).
2070   MaxLoadsPerMemcmp = 2;
2071   MaxLoadsPerMemcmpOptSize = 2;
2072 
2073   // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).
2074   setPrefLoopAlignment(Align(1ULL << ExperimentalPrefLoopAlignment));
2075 
2076   // An out-of-order CPU can speculatively execute past a predictable branch,
2077   // but a conditional move could be stalled by an expensive earlier operation.
2078   PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
2079   EnableExtLdPromotion = true;
2080   setPrefFunctionAlignment(Align(16));
2081 
2082   verifyIntrinsicTables();
2083 
2084   // Default to having -disable-strictnode-mutation on
2085   IsStrictFPEnabled = true;
2086 }
2087 
2088 // This has so far only been implemented for 64-bit MachO.
2089 bool X86TargetLowering::useLoadStackGuardNode() const {
2090   return Subtarget.isTargetMachO() && Subtarget.is64Bit();
2091 }
2092 
2093 bool X86TargetLowering::useStackGuardXorFP() const {
2094   // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
2095   return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
2096 }
2097 
2098 SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
2099                                                const SDLoc &DL) const {
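  // Emit the target XOR-with-frame-pointer pseudo; it is expanded later into
  // an XOR of Val with the actual frame register.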
2100   EVT PtrTy = getPointerTy(DAG.getDataLayout());
2101   unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
2102   MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
2103   return SDValue(Node, 0);
2104 }
2105 
2106 TargetLoweringBase::LegalizeTypeAction
2107 X86TargetLowering::getPreferredVectorAction(MVT VT) const {
2108   if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
2109       !Subtarget.hasBWI())
2110     return TypeSplitVector;
2111 
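  // Widen all other fixed vectors; single-element and vXi1 vectors fall
  // through to the generic handling.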
2112   if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2113       VT.getVectorElementType() != MVT::i1)
2114     return TypeWidenVector;
2115 
2116   return TargetLoweringBase::getPreferredVectorAction(VT);
2117 }
2118 
2119 static std::pair<MVT, unsigned>
2120 handleMaskRegisterForCallingConv(unsigned NumElts, CallingConv::ID CC,
2121                                  const X86Subtarget &Subtarget) {
2122   // v2i1/v4i1/v8i1/v16i1 all pass in xmm registers unless the calling
2123   // convention is one that uses k registers.
2124   if (NumElts == 2)
2125     return {MVT::v2i64, 1};
2126   if (NumElts == 4)
2127     return {MVT::v4i32, 1};
2128   if (NumElts == 8 && CC != CallingConv::X86_RegCall &&
2129       CC != CallingConv::Intel_OCL_BI)
2130     return {MVT::v8i16, 1};
2131   if (NumElts == 16 && CC != CallingConv::X86_RegCall &&
2132       CC != CallingConv::Intel_OCL_BI)
2133     return {MVT::v16i8, 1};
2134   // v32i1 passes in ymm unless we have BWI and the calling convention is
2135   // regcall.
2136   if (NumElts == 32 && (!Subtarget.hasBWI() || CC != CallingConv::X86_RegCall))
2137     return {MVT::v32i8, 1};
2138   // Split v64i1 vectors if we don't have v64i8 available.
2139   if (NumElts == 64 && Subtarget.hasBWI() && CC != CallingConv::X86_RegCall) {
2140     if (Subtarget.useAVX512Regs())
2141       return {MVT::v64i8, 1};
2142     return {MVT::v32i8, 2};
2143   }
2144 
2145   // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
2146   if (!isPowerOf2_32(NumElts) || (NumElts == 64 && !Subtarget.hasBWI()) ||
2147       NumElts > 64)
2148     return {MVT::i8, NumElts};
2149 
2150   return {MVT::INVALID_SIMPLE_VALUE_TYPE, 0};
2151 }
2152 
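// Returns the register type used to pass vXi1 mask vectors under AVX512 for
// the given calling convention; all other types use the generic
// TargetLowering behavior.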
2153 MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
2154                                                      CallingConv::ID CC,
2155                                                      EVT VT) const {
2156   if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
2157       Subtarget.hasAVX512()) {
2158     unsigned NumElts = VT.getVectorNumElements();
2159 
2160     MVT RegisterVT;
2161     unsigned NumRegisters;
2162     std::tie(RegisterVT, NumRegisters) =
2163         handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
2164     if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
2165       return RegisterVT;
2166   }
2167 
2168   return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
2169 }
2170 
2171 unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
2172                                                           CallingConv::ID CC,
2173                                                           EVT VT) const {
2174   if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
2175       Subtarget.hasAVX512()) {
2176     unsigned NumElts = VT.getVectorNumElements();
2177 
2178     MVT RegisterVT;
2179     unsigned NumRegisters;
2180     std::tie(RegisterVT, NumRegisters) =
2181         handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
2182     if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
2183       return NumRegisters;
2184   }
2185 
2186   return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
2187 }
2188 
2189 unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv(
2190     LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
2191     unsigned &NumIntermediates, MVT &RegisterVT) const {
2192   // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
2193   if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
2194       Subtarget.hasAVX512() &&
2195       (!isPowerOf2_32(VT.getVectorNumElements()) ||
2196        (VT.getVectorNumElements() == 64 && !Subtarget.hasBWI()) ||
2197        VT.getVectorNumElements() > 64)) {
2198     RegisterVT = MVT::i8;
2199     IntermediateVT = MVT::i1;
2200     NumIntermediates = VT.getVectorNumElements();
2201     return NumIntermediates;
2202   }
2203 
2204   // Split v64i1 vectors if we don't have v64i8 available.
2205   if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
2206       CC != CallingConv::X86_RegCall) {
2207     RegisterVT = MVT::v32i8;
2208     IntermediateVT = MVT::v32i1;
2209     NumIntermediates = 2;
2210     return 2;
2211   }
2212 
2213   return TargetLowering::getVectorTypeBreakdownForCallingConv(Context, CC, VT, IntermediateVT,
2214                                               NumIntermediates, RegisterVT);
2215 }
2216 
2217 EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
2218                                           LLVMContext& Context,
2219                                           EVT VT) const {
2220   if (!VT.isVector())
2221     return MVT::i8;
2222 
2223   if (Subtarget.hasAVX512()) {
2224     // Figure out what this type will be legalized to.
2225     EVT LegalVT = VT;
2226     while (getTypeAction(Context, LegalVT) != TypeLegal)
2227       LegalVT = getTypeToTransformTo(Context, LegalVT);
2228 
2229     // If we got a 512-bit vector then we'll definitely have a vXi1 compare.
2230     if (LegalVT.getSimpleVT().is512BitVector())
2231       return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());
2232 
2233     if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) {
2234       // If we legalized to less than a 512-bit vector, then we will use a vXi1
2235       // compare for vXi32/vXi64 for sure. If we have BWI we will also support
2236       // vXi16/vXi8.
2237       MVT EltVT = LegalVT.getSimpleVT().getVectorElementType();
2238       if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32)
2239         return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());
2240     }
2241   }
2242 
2243   return VT.changeVectorElementTypeToInteger();
2244 }
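
// For example (a sketch that only considers the features named above): a
// v4i32 compare produces v4i1 with AVX512VL, but v4i32 (a sign-bit mask
// vector) on plain AVX2/SSE; a v8i16 compare needs both BWI and VLX to
// produce v8i1, otherwise it stays a v8i16 mask.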

/// Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
static void getMaxByValAlign(Type *Ty, Align &MaxAlign) {
  if (MaxAlign == 16)
    return;
  if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
    if (VTy->getPrimitiveSizeInBits().getFixedSize() == 128)
      MaxAlign = Align(16);
  } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
    Align EltAlign;
    getMaxByValAlign(ATy->getElementType(), EltAlign);
    if (EltAlign > MaxAlign)
      MaxAlign = EltAlign;
  } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
    for (auto *EltTy : STy->elements()) {
      Align EltAlign;
      getMaxByValAlign(EltTy, EltAlign);
      if (EltAlign > MaxAlign)
        MaxAlign = EltAlign;
      if (MaxAlign == 16)
        break;
    }
  }
}

/// Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area. For X86, aggregates
/// that contain SSE vectors are placed at 16-byte boundaries while the rest
/// are at 4-byte boundaries.
unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
                                                  const DataLayout &DL) const {
  if (Subtarget.is64Bit()) {
    // Max of 8 and alignment of type.
    Align TyAlign = DL.getABITypeAlign(Ty);
    if (TyAlign > 8)
      return TyAlign.value();
    return 8;
  }

  Align Alignment(4);
  if (Subtarget.hasSSE1())
    getMaxByValAlign(Ty, Alignment);
  return Alignment.value();
}
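
// Rough examples (32-bit target with SSE enabled; a sketch rather than an
// ABI statement): a byval struct containing a <4 x float> member is placed
// at a 16-byte boundary, while a byval struct of plain i32 fields stays at
// the default 4-byte boundary. On 64-bit targets the result is simply
// max(8, ABI alignment of the type).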

/// It returns EVT::Other if the type should be determined using generic
/// target-independent logic.
/// For vector ops we check that the overall size isn't larger than our
/// preferred vector width.
EVT X86TargetLowering::getOptimalMemOpType(
    const MemOp &Op, const AttributeList &FuncAttributes) const {
  if (!FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) {
    if (Op.size() >= 16 &&
        (!Subtarget.isUnalignedMem16Slow() || Op.isAligned(Align(16)))) {
      // FIXME: Check if unaligned 64-byte accesses are slow.
      if (Op.size() >= 64 && Subtarget.hasAVX512() &&
          (Subtarget.getPreferVectorWidth() >= 512)) {
        return Subtarget.hasBWI() ? MVT::v64i8 : MVT::v16i32;
      }
      // FIXME: Check if unaligned 32-byte accesses are slow.
      if (Op.size() >= 32 && Subtarget.hasAVX() &&
          (Subtarget.getPreferVectorWidth() >= 256)) {
        // Although this isn't a well-supported type for AVX1, we'll let
        // legalization and shuffle lowering produce the optimal codegen. If we
        // choose an optimal type with a vector element larger than a byte,
        // getMemsetStores() may create an intermediate splat (using an integer
        // multiply) before we splat as a vector.
        return MVT::v32i8;
      }
      if (Subtarget.hasSSE2() && (Subtarget.getPreferVectorWidth() >= 128))
        return MVT::v16i8;
      // TODO: Can SSE1 handle a byte vector?
      // If we have SSE1 registers we should be able to use them.
      if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) &&
          (Subtarget.getPreferVectorWidth() >= 128))
        return MVT::v4f32;
    } else if (((Op.isMemcpy() && !Op.isMemcpyStrSrc()) || Op.isZeroMemset()) &&
               Op.size() >= 8 && !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
      // Do not use f64 to lower memcpy if source is string constant. It's
      // better to use i32 to avoid the loads.
      // Also, do not use f64 to lower memset unless this is a memset of zeros.
      // The gymnastics of splatting a byte value into an XMM register and then
      // only using 8-byte stores (because this is a CPU with slow unaligned
      // 16-byte accesses) makes that a loser.
      return MVT::f64;
    }
  }
  // This is a compromise. If we reach here, unaligned accesses may be slow on
  // this target. However, creating smaller, aligned accesses could be even
  // slower and would certainly be a lot more code.
  if (Subtarget.is64Bit() && Op.size() >= 8)
    return MVT::i64;
  return MVT::i32;
}
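
// A few illustrative outcomes (assuming no NoImplicitFloat and fast unaligned
// 16-byte accesses): a 64-byte memset on an AVX-512BW target with a 512-bit
// preferred width is lowered with v64i8 stores; the same memset on an AVX2
// target uses v32i8; a plain SSE2 target uses v16i8; and an 8-byte copy on a
// 64-bit target that takes none of the vector paths falls back to i64.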

bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
  if (VT == MVT::f32)
    return X86ScalarSSEf32;
  if (VT == MVT::f64)
    return X86ScalarSSEf64;
  return true;
}

bool X86TargetLowering::allowsMisalignedMemoryAccesses(
    EVT VT, unsigned, Align Alignment, MachineMemOperand::Flags Flags,
    bool *Fast) const {
  if (Fast) {
    switch (VT.getSizeInBits()) {
    default:
      // 8-byte and under are always assumed to be fast.
      *Fast = true;
      break;
    case 128:
      *Fast = !Subtarget.isUnalignedMem16Slow();
      break;
    case 256:
      *Fast = !Subtarget.isUnalignedMem32Slow();
      break;
    // TODO: What about AVX-512 (512-bit) accesses?
    }
  }
  // NonTemporal vector memory ops must be aligned.
  if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
    // NT loads can only be vector aligned, so if it's less aligned than the
    // minimum vector size (which we can split the vector down to), we might as
    // well use a regular unaligned vector load.
    // We don't have any NT loads pre-SSE41.
    if (!!(Flags & MachineMemOperand::MOLoad))
      return (Alignment < 16 || !Subtarget.hasSSE41());
    return false;
  }
  // Misaligned accesses of any size are always allowed.
  return true;
}

/// Return the entry encoding for a jump table in the
/// current function.  The returned value is a member of the
/// MachineJumpTableInfo::JTEntryKind enum.
unsigned X86TargetLowering::getJumpTableEncoding() const {
  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
  // symbol.
  if (isPositionIndependent() && Subtarget.isPICStyleGOT())
    return MachineJumpTableInfo::EK_Custom32;

  // Otherwise, use the normal jump table encoding heuristics.
  return TargetLowering::getJumpTableEncoding();
}

bool X86TargetLowering::useSoftFloat() const {
  return Subtarget.useSoftFloat();
}

void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
                                              ArgListTy &Args) const {

  // Only relabel X86-32 for C / Stdcall CCs.
  if (Subtarget.is64Bit())
    return;
  if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
    return;
  unsigned ParamRegs = 0;
  if (auto *M = MF->getFunction().getParent())
    ParamRegs = M->getNumberRegisterParameters();

  // Mark the first N integer arguments as being passed in registers.
  for (auto &Arg : Args) {
    Type *T = Arg.Ty;
    if (T->isIntOrPtrTy())
      if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
        unsigned numRegs = 1;
        if (MF->getDataLayout().getTypeAllocSize(T) > 4)
          numRegs = 2;
        if (ParamRegs < numRegs)
          return;
        ParamRegs -= numRegs;
        Arg.IsInReg = true;
      }
  }
}

const MCExpr *
X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
                                             const MachineBasicBlock *MBB,
                                             unsigned uid, MCContext &Ctx) const {
  assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
  // entries.
  return MCSymbolRefExpr::create(MBB->getSymbol(),
                                 MCSymbolRefExpr::VK_GOTOFF, Ctx);
}
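
// As a concrete sketch of the encoding above: in 32-bit GOT PIC mode each
// jump-table entry is emitted as something like
//   .long .LBB0_3@GOTOFF
// (label name assumed for illustration), i.e. a 32-bit offset from the GOT
// base rather than an absolute address, so the table itself stays
// position-independent.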

/// Returns relocation base for the given PIC jumptable.
SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
                                                    SelectionDAG &DAG) const {
  if (!Subtarget.is64Bit())
    // This doesn't have SDLoc associated with it, but is not really the
    // same as a Register.
    return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
                       getPointerTy(DAG.getDataLayout()));
  return Table;
}

/// This returns the relocation base for the given PIC jumptable,
/// the same as getPICJumpTableRelocBase, but as an MCExpr.
const MCExpr *X86TargetLowering::
getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
                             MCContext &Ctx) const {
  // X86-64 uses RIP relative addressing based on the jump table label.
  if (Subtarget.isPICStyleRIPRel())
    return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);

  // Otherwise, the reference is relative to the PIC base.
  return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
}

std::pair<const TargetRegisterClass *, uint8_t>
X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
                                           MVT VT) const {
  const TargetRegisterClass *RRC = nullptr;
  uint8_t Cost = 1;
  switch (VT.SimpleTy) {
  default:
    return TargetLowering::findRepresentativeClass(TRI, VT);
  case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
    RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
    break;
  case MVT::x86mmx:
    RRC = &X86::VR64RegClass;
    break;
  case MVT::f32: case MVT::f64:
  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
  case MVT::v4f32: case MVT::v2f64:
  case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
  case MVT::v8f32: case MVT::v4f64:
  case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
  case MVT::v16f32: case MVT::v8f64:
    RRC = &X86::VR128XRegClass;
    break;
  }
  return std::make_pair(RRC, Cost);
}

unsigned X86TargetLowering::getAddressSpace() const {
  if (Subtarget.is64Bit())
    return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
  return 256;
}

static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
  return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
         (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
}

static Constant* SegmentOffset(IRBuilderBase &IRB,
                               int Offset, unsigned AddressSpace) {
  return ConstantExpr::getIntToPtr(
      ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
      Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
}
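
// In IR terms, the constant built above is roughly (offset and address space
// shown only as an example):
//   inttoptr (i32 40 to i8* addrspace(257)*)
// i.e. a pointer into the %fs (address space 257) or %gs (address space 256)
// segment at the given byte offset, which later folds into a %fs:0x28-style
// memory operand.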

Value *X86TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const {
  // glibc, bionic, and Fuchsia have a special slot for the stack guard in
  // tcbhead_t; use it instead of the usual global variable (see
  // sysdeps/{i386,x86_64}/nptl/tls.h)
  if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
    if (Subtarget.isTargetFuchsia()) {
      // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
      return SegmentOffset(IRB, 0x10, getAddressSpace());
    } else {
      unsigned AddressSpace = getAddressSpace();
      Module *M = IRB.GetInsertBlock()->getParent()->getParent();
      // Some users may customize the base register and offset.
      int Offset = M->getStackProtectorGuardOffset();
      // If -stack-protector-guard-offset was not given, the default is
      // %fs:0x28 on x86-64 (%gs:0x28 under the Kernel code model) and
      // %gs:0x14 on i386.
      if (Offset == INT_MAX)
        Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;

      StringRef GuardReg = M->getStackProtectorGuardReg();
      if (GuardReg == "fs")
        AddressSpace = X86AS::FS;
      else if (GuardReg == "gs")
        AddressSpace = X86AS::GS;
      return SegmentOffset(IRB, Offset, AddressSpace);
    }
  }
  return TargetLowering::getIRStackGuard(IRB);
}
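
// For reference (as of the version this was written against), the override
// path above is driven by the module-level stack-protector-guard settings,
// typically populated from clang's -mstack-protector-guard-reg= and
// -mstack-protector-guard-offset= options; without them the TLS-slot
// defaults listed in the comment are used.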

void X86TargetLowering::insertSSPDeclarations(Module &M) const {
  // The MSVC CRT provides functionality for stack protection.
  if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
      Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
    // The MSVC CRT has a global variable holding the security cookie.
    M.getOrInsertGlobal("__security_cookie",
                        Type::getInt8PtrTy(M.getContext()));

    // The MSVC CRT has a function to validate the security cookie.
    FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
        "__security_check_cookie", Type::getVoidTy(M.getContext()),
        Type::getInt8PtrTy(M.getContext()));
    if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
      F->setCallingConv(CallingConv::X86_FastCall);
      F->addAttribute(1, Attribute::AttrKind::InReg);
    }
    return;
  }

  StringRef GuardMode = M.getStackProtectorGuard();

  // glibc, bionic, and Fuchsia have a special slot for the stack guard.
  if ((GuardMode == "tls" || GuardMode.empty()) &&
      hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
    return;
  TargetLowering::insertSSPDeclarations(M);
}

Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
  // The MSVC CRT has a global variable holding the security cookie.
  if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
      Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
    return M.getGlobalVariable("__security_cookie");
  }
  return TargetLowering::getSDagStackGuard(M);
}

Function *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
  // The MSVC CRT has a function to validate the security cookie.
  if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
      Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
    return M.getFunction("__security_check_cookie");
  }
  return TargetLowering::getSSPStackGuardCheck(M);
}

Value *
X86TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
  if (Subtarget.getTargetTriple().isOSContiki())
    return getDefaultSafeStackPointerLocation(IRB, false);

  // Android provides a fixed TLS slot for the SafeStack pointer. See the
  // definition of TLS_SLOT_SAFESTACK in
  // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
  if (Subtarget.isTargetAndroid()) {
    // %fs:0x48 on x86-64 (%gs:0x48 under the Kernel code model);
    // %gs:0x24 on i386.
    int Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
    return SegmentOffset(IRB, Offset, getAddressSpace());
  }

  // Fuchsia is similar.
  if (Subtarget.isTargetFuchsia()) {
    // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
    return SegmentOffset(IRB, 0x18, getAddressSpace());
  }

  return TargetLowering::getSafeStackPointerLocation(IRB);
}

//===----------------------------------------------------------------------===//
//               Return Value Calling Convention Implementation
//===----------------------------------------------------------------------===//

bool X86TargetLowering::CanLowerReturn(
    CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
    const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
  return CCInfo.CheckReturn(Outs, RetCC_X86);
}

const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
  static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
  return ScratchRegs;
}

/// Lowers mask values (v*i1) to the local register values.
/// \returns DAG node after lowering to register type
static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
                               const SDLoc &Dl, SelectionDAG &DAG) {
  EVT ValVT = ValArg.getValueType();

  if (ValVT == MVT::v1i1)
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Dl, ValLoc, ValArg,
                       DAG.getIntPtrConstant(0, Dl));

  if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
      (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
    // Two stage lowering might be required
    // bitcast:   v8i1 -> i8 / v16i1 -> i16
    // anyextend: i8   -> i32 / i16   -> i32
    EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
    SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
    if (ValLoc == MVT::i32)
      ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
    return ValToCopy;
  }

  if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
      (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
    // One stage lowering is required
    // bitcast:   v32i1 -> i32 / v64i1 -> i64
    return DAG.getBitcast(ValLoc, ValArg);
  }

  return DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValArg);
}
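
// A quick sketch of the two-stage case above for a v8i1 value placed in an
// i32 location:
//   t1: i8  = bitcast t0 (v8i1)
//   t2: i32 = any_extend t1
// whereas v32i1 in i32 (or v64i1 in i64) needs only the single bitcast.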

/// Breaks a v64i1 value into two registers and adds the new node to the DAG.
static void Passv64i1ArgInRegs(
    const SDLoc &Dl, SelectionDAG &DAG, SDValue &Arg,
    SmallVectorImpl<std::pair<Register, SDValue>> &RegsToPass, CCValAssign &VA,
    CCValAssign &NextVA, const X86Subtarget &Subtarget) {
  assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
  assert(Subtarget.is32Bit() && "Expecting 32 bit target");
  assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
  assert(VA.isRegLoc() && NextVA.isRegLoc() &&
         "The value should reside in two registers");

  // Before splitting the value we cast it to i64.
  Arg = DAG.getBitcast(MVT::i64, Arg);

  // Split the value into two i32 halves.
  SDValue Lo, Hi;
  Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
                   DAG.getConstant(0, Dl, MVT::i32));
  Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
                   DAG.getConstant(1, Dl, MVT::i32));

  // Attach the two i32 halves to their corresponding registers.
  RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
  RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
}
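
// For example (register names assumed for illustration only): a v64i1 value
// on a 32-bit AVX512BW target might be assigned the locations EAX and ECX,
// in which case the low 32 mask bits travel in EAX and the high 32 bits in
// ECX; getv64i1Argument later in this file performs the inverse reassembly
// on the receiving side.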

SDValue
X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                               bool isVarArg,
                               const SmallVectorImpl<ISD::OutputArg> &Outs,
                               const SmallVectorImpl<SDValue> &OutVals,
                               const SDLoc &dl, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();

  // In some cases we need to disable registers from the default CSR list.
  // For example, when they are used for argument passing.
  bool ShouldDisableCalleeSavedRegister =
      CallConv == CallingConv::X86_RegCall ||
      MF.getFunction().hasFnAttribute("no_caller_saved_registers");

  if (CallConv == CallingConv::X86_INTR && !Outs.empty())
    report_fatal_error("X86 interrupts may not return any value");

  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
  CCInfo.AnalyzeReturn(Outs, RetCC_X86);

  SmallVector<std::pair<Register, SDValue>, 4> RetVals;
  for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
       ++I, ++OutsIndex) {
    CCValAssign &VA = RVLocs[I];
    assert(VA.isRegLoc() && "Can only return in registers!");

    // Add the register to the CalleeSaveDisableRegs list.
    if (ShouldDisableCalleeSavedRegister)
      MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());

    SDValue ValToCopy = OutVals[OutsIndex];
    EVT ValVT = ValToCopy.getValueType();

    // Promote values to the appropriate types.
    if (VA.getLocInfo() == CCValAssign::SExt)
      ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
    else if (VA.getLocInfo() == CCValAssign::ZExt)
      ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
    else if (VA.getLocInfo() == CCValAssign::AExt) {
      if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
        ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
      else
        ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
    } else if (VA.getLocInfo() == CCValAssign::BCvt)
      ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);

    assert(VA.getLocInfo() != CCValAssign::FPExt &&
           "Unexpected FP-extend for return value.");

    // Report an error if we have attempted to return a value via an XMM
    // register and SSE was disabled.
    if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
      errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
      VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
    } else if (!Subtarget.hasSSE2() &&
               X86::FR64XRegClass.contains(VA.getLocReg()) &&
               ValVT == MVT::f64) {
      // When returning a double via an XMM register, report an error if SSE2 is
      // not enabled.
      errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
      VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
    }

    // Returns in ST0/ST1 are handled specially: these are pushed as operands to
    // the RET instruction and handled by the FP Stackifier.
    if (VA.getLocReg() == X86::FP0 ||
        VA.getLocReg() == X86::FP1) {
      // If this is a copy from an xmm register to ST(0), use an FPExtend to
      // change the value to the FP stack register class.
      if (isScalarFPTypeInSSEReg(VA.getValVT()))
        ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
      RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
      // Don't emit a copytoreg.
      continue;
    }

    // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
    // which is returned in RAX / RDX.
    if (Subtarget.is64Bit()) {
      if (ValVT == MVT::x86mmx) {
        if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
          ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
          ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
                                  ValToCopy);
          // If we don't have SSE2 available, convert to v4f32 so the generated
          // register is legal.
          if (!Subtarget.hasSSE2())
            ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
        }
      }
    }

    if (VA.needsCustom()) {
      assert(VA.getValVT() == MVT::v64i1 &&
             "Currently the only custom case is when we split v64i1 to 2 regs");

      Passv64i1ArgInRegs(dl, DAG, ValToCopy, RetVals, VA, RVLocs[++I],
                         Subtarget);

      // Add the second register to the CalleeSaveDisableRegs list.
      if (ShouldDisableCalleeSavedRegister)
        MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
    } else {
      RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
    }
  }

  SDValue Flag;
  SmallVector<SDValue, 6> RetOps;
  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
  // Operand #1 = Bytes To Pop
  RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
                                         MVT::i32));

  // Copy the result values into the output registers.
  for (auto &RetVal : RetVals) {
    if (RetVal.first == X86::FP0 || RetVal.first == X86::FP1) {
      RetOps.push_back(RetVal.second);
      continue; // Don't emit a copytoreg.
    }

    Chain = DAG.getCopyToReg(Chain, dl, RetVal.first, RetVal.second, Flag);
    Flag = Chain.getValue(1);
    RetOps.push_back(
        DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
  }

  // Swift calling convention does not require we copy the sret argument
  // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.

  // All x86 ABIs require that for returning structs by value we copy
  // the sret argument into %rax/%eax (depending on ABI) for the return.
  // We saved the argument into a virtual register in the entry block,
  // so now we copy the value out and into %rax/%eax.
  //
  // Checking Function.hasStructRetAttr() here is insufficient because the IR
  // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
  // false, then an sret argument may be implicitly inserted in the SelDAG. In
  // either case FuncInfo->setSRetReturnReg() will have been called.
  if (Register SRetReg = FuncInfo->getSRetReturnReg()) {
    // When we have both sret and another return value, we should use the
    // original Chain stored in RetOps[0], instead of the current Chain updated
    // in the above loop. If we only have sret, RetOps[0] equals to Chain.

    // For the case of sret and another return value, we have
    //   Chain_0 at the function entry
    //   Chain_1 = getCopyToReg(Chain_0) in the above loop
    // If we use Chain_1 in getCopyFromReg, we will have
    //   Val = getCopyFromReg(Chain_1)
    //   Chain_2 = getCopyToReg(Chain_1, Val) from below

    // getCopyToReg(Chain_0) will be glued together with
    // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
    // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
    //   Data dependency from Unit B to Unit A due to usage of Val in
    //     getCopyToReg(Chain_1, Val)
    //   Chain dependency from Unit A to Unit B

    // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
    SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
                                     getPointerTy(MF.getDataLayout()));

    Register RetValReg
        = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
          X86::RAX : X86::EAX;
    Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
    Flag = Chain.getValue(1);

    // RAX/EAX now acts like a return value.
    RetOps.push_back(
        DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));

    // Add the returned register to the CalleeSaveDisableRegs list.
    if (ShouldDisableCalleeSavedRegister)
      MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
  }

  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
  const MCPhysReg *I =
      TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
  if (I) {
    for (; *I; ++I) {
      if (X86::GR64RegClass.contains(*I))
        RetOps.push_back(DAG.getRegister(*I, MVT::i64));
      else
        llvm_unreachable("Unexpected register class in CSRsViaCopy!");
    }
  }

  RetOps[0] = Chain;  // Update chain.

  // Add the flag if we have it.
  if (Flag.getNode())
    RetOps.push_back(Flag);

  X86ISD::NodeType opcode = X86ISD::RET_FLAG;
  if (CallConv == CallingConv::X86_INTR)
    opcode = X86ISD::IRET;
  return DAG.getNode(opcode, dl, MVT::Other, RetOps);
}

bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
  if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
    return false;

  SDValue TCChain = Chain;
  SDNode *Copy = *N->use_begin();
  if (Copy->getOpcode() == ISD::CopyToReg) {
    // If the copy has a glue operand, we conservatively assume it isn't safe to
    // perform a tail call.
    if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
      return false;
    TCChain = Copy->getOperand(0);
  } else if (Copy->getOpcode() != ISD::FP_EXTEND)
    return false;

  bool HasRet = false;
  for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
       UI != UE; ++UI) {
    if (UI->getOpcode() != X86ISD::RET_FLAG)
      return false;
    // If we are returning more than one value, we can definitely
    // not make a tail call (see PR19530).
    if (UI->getNumOperands() > 4)
      return false;
    if (UI->getNumOperands() == 4 &&
        UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
      return false;
    HasRet = true;
  }

  if (!HasRet)
    return false;

  Chain = TCChain;
  return true;
}

EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
                                           ISD::NodeType ExtendKind) const {
  MVT ReturnMVT = MVT::i32;

  bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
  if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
    // The ABI does not require i1, i8 or i16 to be extended.
    //
    // On Darwin, there is code in the wild relying on Clang's old behaviour of
    // always extending i8/i16 return values, so keep doing that for now.
    // (PR26665).
    ReturnMVT = MVT::i8;
  }

  EVT MinVT = getRegisterType(Context, ReturnMVT);
  return VT.bitsLT(MinVT) ? MinVT : VT;
}

/// Reads two 32 bit registers and creates a 64 bit mask value.
/// \param VA The current 32 bit value that needs to be assigned.
/// \param NextVA The next 32 bit value that needs to be assigned.
/// \param Root The parent DAG node.
/// \param [in,out] InFlag Represents the SDValue in the parent DAG node for
///                        glue purposes. In case the DAG is already using a
///                        physical register instead of a virtual one, we
///                        should glue our new SDValue to the InFlag SDValue.
/// \return a new SDValue of size 64 bits.
static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
                                SDValue &Root, SelectionDAG &DAG,
                                const SDLoc &Dl, const X86Subtarget &Subtarget,
                                SDValue *InFlag = nullptr) {
  assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
  assert(Subtarget.is32Bit() && "Expecting 32 bit target");
  assert(VA.getValVT() == MVT::v64i1 &&
         "Expecting first location of 64 bit width type");
  assert(NextVA.getValVT() == VA.getValVT() &&
         "The locations should have the same type");
  assert(VA.isRegLoc() && NextVA.isRegLoc() &&
         "The values should reside in two registers");

  SDValue Lo, Hi;
  SDValue ArgValueLo, ArgValueHi;

  MachineFunction &MF = DAG.getMachineFunction();
  const TargetRegisterClass *RC = &X86::GR32RegClass;

  // Read a 32 bit value from the registers.
  if (nullptr == InFlag) {
    // When no physical register is present,
    // create an intermediate virtual register.
    Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
    ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
    Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
    ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
  } else {
    // When a physical register is available read the value from it and glue
    // the reads together.
    ArgValueLo =
      DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
    *InFlag = ArgValueLo.getValue(2);
    ArgValueHi =
      DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
    *InFlag = ArgValueHi.getValue(2);
  }

  // Convert the low i32 half into a v32i1 vector.
  Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);

  // Convert the high i32 half into a v32i1 vector.
  Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);

  // Concatenate the two values together.
  return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
}

/// The function will lower a register of various sizes (8/16/32/64)
/// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1).
/// \returns a DAG node containing the operand after lowering to mask type.
static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
                               const EVT &ValLoc, const SDLoc &Dl,
                               SelectionDAG &DAG) {
  SDValue ValReturned = ValArg;

  if (ValVT == MVT::v1i1)
    return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned);

  if (ValVT == MVT::v64i1) {
    // On 32-bit targets this case is handled by getv64i1Argument.
    assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
    // On 64-bit targets there is no need to truncate the value, only bitcast.
  } else {
    MVT maskLen;
    switch (ValVT.getSimpleVT().SimpleTy) {
    case MVT::v8i1:
      maskLen = MVT::i8;
      break;
    case MVT::v16i1:
      maskLen = MVT::i16;
      break;
    case MVT::v32i1:
      maskLen = MVT::i32;
      break;
    default:
      llvm_unreachable("Expecting a vector of i1 types");
    }

    ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
  }
  return DAG.getBitcast(ValVT, ValReturned);
}
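
// Sketch of the common cases above: an i32 location carrying a v8i1 value is
// truncated to i8 and then bitcast to v8i1, while a v64i1 value arriving in
// an i64 location (64-bit targets only) is recovered with a single bitcast.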

/// Lower the result values of a call into the
/// appropriate copies out of appropriate physical registers.
///
SDValue X86TargetLowering::LowerCallResult(
    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
    uint32_t *RegMask) const {

  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
  // Assign locations to each value returned by this call.
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                 *DAG.getContext());
  CCInfo.AnalyzeCallResult(Ins, RetCC_X86);

  // Copy all of the result registers out of their specified physreg.
  for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
       ++I, ++InsIndex) {
    CCValAssign &VA = RVLocs[I];
    EVT CopyVT = VA.getLocVT();

    // In some calling conventions we need to remove the used registers
    // from the register mask.
    if (RegMask) {
      for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
           SubRegs.isValid(); ++SubRegs)
        RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
    }

    // Report an error if there was an attempt to return FP values via XMM
    // registers.
    if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
      errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
      if (VA.getLocReg() == X86::XMM1)
        VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
      else
        VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
    } else if (!Subtarget.hasSSE2() &&
               X86::FR64XRegClass.contains(VA.getLocReg()) &&
               CopyVT == MVT::f64) {
      errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
      if (VA.getLocReg() == X86::XMM1)
        VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
      else
        VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
    }

    // If we prefer to use the value in xmm registers, copy it out as f80 and
    // use a truncate to move it from fp stack reg to xmm reg.
    bool RoundAfterCopy = false;
    if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
        isScalarFPTypeInSSEReg(VA.getValVT())) {
      if (!Subtarget.hasX87())
        report_fatal_error("X87 register return with X87 disabled");
      CopyVT = MVT::f80;
      RoundAfterCopy = (CopyVT != VA.getLocVT());
    }

    SDValue Val;
    if (VA.needsCustom()) {
      assert(VA.getValVT() == MVT::v64i1 &&
             "Currently the only custom case is when we split v64i1 to 2 regs");
      Val =
          getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
    } else {
      Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
                  .getValue(1);
      Val = Chain.getValue(0);
      InFlag = Chain.getValue(2);
    }

    if (RoundAfterCopy)
      Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
                        // This truncation won't change the value.
                        DAG.getIntPtrConstant(1, dl));

    if (VA.isExtInLoc()) {
      if (VA.getValVT().isVector() &&
          VA.getValVT().getScalarType() == MVT::i1 &&
          ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
           (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
        // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
        Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
      } else
        Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
    }

    if (VA.getLocInfo() == CCValAssign::BCvt)
      Val = DAG.getBitcast(VA.getValVT(), Val);

    InVals.push_back(Val);
  }

  return Chain;
}

//===----------------------------------------------------------------------===//
//                C & StdCall & Fast Calling Convention implementation
//===----------------------------------------------------------------------===//
//  The StdCall calling convention is the standard for many Windows API
//  routines. It differs from the C calling convention only a little: the
//  callee cleans up the stack instead of the caller, and symbols are also
//  decorated in some fancy way :) It doesn't support any vector arguments.
//  For info on the fast calling convention see the Fast Calling Convention
//  (tail call) implementation in LowerX86_32FastCCCallTo.

/// CallIsStructReturn - Determines whether a call uses struct return
/// semantics.
enum StructReturnType {
  NotStructReturn,
  RegStructReturn,
  StackStructReturn
};
static StructReturnType
callIsStructReturn(ArrayRef<ISD::OutputArg> Outs, bool IsMCU) {
  if (Outs.empty())
    return NotStructReturn;

  const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
  if (!Flags.isSRet())
    return NotStructReturn;
  if (Flags.isInReg() || IsMCU)
    return RegStructReturn;
  return StackStructReturn;
}

/// Determines whether a function uses struct return semantics.
static StructReturnType
argsAreStructReturn(ArrayRef<ISD::InputArg> Ins, bool IsMCU) {
  if (Ins.empty())
    return NotStructReturn;

  const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
  if (!Flags.isSRet())
    return NotStructReturn;
  if (Flags.isInReg() || IsMCU)
    return RegStructReturn;
  return StackStructReturn;
}

/// Make a copy of an aggregate at the address specified by "Src" to the
/// address "Dst" with size and alignment information specified by the
/// parameter attribute. The copy will be passed as a byval function
/// parameter.
static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
                                         SDValue Chain, ISD::ArgFlagsTy Flags,
                                         SelectionDAG &DAG, const SDLoc &dl) {
  SDValue SizeNode = DAG.getIntPtrConstant(Flags.getByValSize(), dl);

  return DAG.getMemcpy(
      Chain, dl, Dst, Src, SizeNode, Flags.getNonZeroByValAlign(),
      /*isVolatile*/ false, /*AlwaysInline=*/true,
      /*isTailCall*/ false, MachinePointerInfo(), MachinePointerInfo());
}

/// Return true if the calling convention is one that we can guarantee TCO for.
static bool canGuaranteeTCO(CallingConv::ID CC) {
  return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
          CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
          CC == CallingConv::HHVM || CC == CallingConv::Tail ||
          CC == CallingConv::SwiftTail);
}

/// Return true if we might ever do TCO for calls with this calling convention.
static bool mayTailCallThisCC(CallingConv::ID CC) {
  switch (CC) {
  // C calling conventions:
  case CallingConv::C:
  case CallingConv::Win64:
  case CallingConv::X86_64_SysV:
  // Callee pop conventions:
  case CallingConv::X86_ThisCall:
  case CallingConv::X86_StdCall:
  case CallingConv::X86_VectorCall:
  case CallingConv::X86_FastCall:
  // Swift:
  case CallingConv::Swift:
    return true;
  default:
    return canGuaranteeTCO(CC);
  }
}

/// Return true if the function is being made into a tailcall target by
/// changing its ABI.
static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
  return (GuaranteedTailCallOpt && canGuaranteeTCO(CC)) ||
         CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
}
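
// Illustration of the distinction above: with swifttailcc or tailcc the ABI
// itself is changed so tail calls can always be guaranteed, while for
// fastcc/GHC/RegCall/HiPE/HHVM the guarantee additionally requires
// -tailcallopt (GuaranteedTailCallOpt) to be in effect.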

bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
  if (!CI->isTailCall())
    return false;

  CallingConv::ID CalleeCC = CI->getCallingConv();
  if (!mayTailCallThisCC(CalleeCC))
    return false;

  return true;
}

SDValue
X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
                                    const SmallVectorImpl<ISD::InputArg> &Ins,
                                    const SDLoc &dl, SelectionDAG &DAG,
                                    const CCValAssign &VA,
                                    MachineFrameInfo &MFI, unsigned i) const {
  // Create the nodes corresponding to a load from this parameter slot.
  ISD::ArgFlagsTy Flags = Ins[i].Flags;
  bool AlwaysUseMutable = shouldGuaranteeTCO(
      CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
  bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
  EVT ValVT;
  MVT PtrVT = getPointerTy(DAG.getDataLayout());

  // If the value is passed by pointer, we have its address instead of the
  // value itself. There is no need to extend if the mask value and its
  // location share the same size.
  bool ExtendedInMem =
      VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
      VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();

  if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
    ValVT = VA.getLocVT();
  else
    ValVT = VA.getValVT();

  // FIXME: For now, all byval parameter objects are marked mutable. This can be
  // changed with more analysis.
  // In case of tail call optimization, mark all arguments mutable, since they
  // could be overwritten by the lowering of arguments in case of a tail call.
  if (Flags.isByVal()) {
    unsigned Bytes = Flags.getByValSize();
    if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.

    // FIXME: For now, all byval parameter objects are marked as aliasing. This
    // can be improved with deeper analysis.
    int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable,
                                   /*isAliased=*/true);
    return DAG.getFrameIndex(FI, PtrVT);
  }

  EVT ArgVT = Ins[i].ArgVT;

  // If this is a vector that has been split into multiple parts, and the
  // scalar size of the parts doesn't match the vector element size, then we
  // can't elide the copy. The parts will have padding between them instead of
  // being packed like a vector.
  bool ScalarizedAndExtendedVector =
      ArgVT.isVector() && !VA.getLocVT().isVector() &&
      VA.getLocVT().getSizeInBits() != ArgVT.getScalarSizeInBits();

  // This is an argument in memory. We might be able to perform copy elision.
  // If the argument is passed directly in memory without any extension, then
  // we can perform copy elision. Large vector types, for example, may be
  // passed indirectly by pointer.
  if (Flags.isCopyElisionCandidate() &&
      VA.getLocInfo() != CCValAssign::Indirect && !ExtendedInMem &&
      !ScalarizedAndExtendedVector) {
    SDValue PartAddr;
    if (Ins[i].PartOffset == 0) {
      // If this is a one-part value or the first part of a multi-part value,
      // create a stack object for the entire argument value type and return a
      // load from our portion of it. This assumes that if the first part of an
      // argument is in memory, the rest will also be in memory.
      int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
                                     /*IsImmutable=*/false);
      PartAddr = DAG.getFrameIndex(FI, PtrVT);
      return DAG.getLoad(
          ValVT, dl, Chain, PartAddr,
          MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
    } else {
      // This is not the first piece of an argument in memory. See if there is
      // already a fixed stack object including this offset. If so, assume it
      // was created by the PartOffset == 0 branch above and create a load from
      // the appropriate offset into it.
      int64_t PartBegin = VA.getLocMemOffset();
      int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
      int FI = MFI.getObjectIndexBegin();
      for (; MFI.isFixedObjectIndex(FI); ++FI) {
        int64_t ObjBegin = MFI.getObjectOffset(FI);
        int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
        if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
          break;
      }
      if (MFI.isFixedObjectIndex(FI)) {
        SDValue Addr =
            DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
                        DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
        return DAG.getLoad(
            ValVT, dl, Chain, Addr,
            MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
                                              Ins[i].PartOffset));
      }
    }
  }

  int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
                                 VA.getLocMemOffset(), isImmutable);

  // Set SExt or ZExt flag.
  if (VA.getLocInfo() == CCValAssign::ZExt) {
    MFI.setObjectZExt(FI, true);
  } else if (VA.getLocInfo() == CCValAssign::SExt) {
    MFI.setObjectSExt(FI, true);
  }

  SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
  SDValue Val = DAG.getLoad(
      ValVT, dl, Chain, FIN,
      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
  return ExtendedInMem
             ? (VA.getValVT().isVector()
                    ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
                    : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
             : Val;
}

// FIXME: Get this from tablegen.
static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
                                                const X86Subtarget &Subtarget) {
  assert(Subtarget.is64Bit());

  if (Subtarget.isCallingConvWin64(CallConv)) {
    static const MCPhysReg GPR64ArgRegsWin64[] = {
      X86::RCX, X86::RDX, X86::R8,  X86::R9
    };
    return makeArrayRef(std::begin(GPR64ArgRegsWin64),
                        std::end(GPR64ArgRegsWin64));
  }

  static const MCPhysReg GPR64ArgRegs64Bit[] = {
    X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
  };
  return makeArrayRef(std::begin(GPR64ArgRegs64Bit),
                      std::end(GPR64ArgRegs64Bit));
}

// FIXME: Get this from tablegen.
static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
                                                CallingConv::ID CallConv,
                                                const X86Subtarget &Subtarget) {
  assert(Subtarget.is64Bit());
  if (Subtarget.isCallingConvWin64(CallConv)) {
    // The XMM registers which might contain var arg parameters are shadowed
    // in their paired GPR.  So we only need to save the GPR to their home
    // slots.
    // TODO: __vectorcall will change this.
    return None;
  }

  bool isSoftFloat = Subtarget.useSoftFloat();
  if (isSoftFloat || !Subtarget.hasSSE1())
    // Kernel mode asks for SSE to be disabled, so there are no XMM argument
    // registers.
    return None;

  static const MCPhysReg XMMArgRegs64Bit[] = {
    X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
    X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
  };
  return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
}

#ifndef NDEBUG
static bool isSortedByValueNo(ArrayRef<CCValAssign> ArgLocs) {
  return llvm::is_sorted(
      ArgLocs, [](const CCValAssign &A, const CCValAssign &B) -> bool {
        return A.getValNo() < B.getValNo();
      });
}
#endif
3403 
3404 namespace {
3405 /// This is a helper class for lowering variable arguments parameters.
3406 class VarArgsLoweringHelper {
3407 public:
VarArgsLoweringHelper(X86MachineFunctionInfo * FuncInfo,const SDLoc & Loc,SelectionDAG & DAG,const X86Subtarget & Subtarget,CallingConv::ID CallConv,CCState & CCInfo)3408   VarArgsLoweringHelper(X86MachineFunctionInfo *FuncInfo, const SDLoc &Loc,
3409                         SelectionDAG &DAG, const X86Subtarget &Subtarget,
3410                         CallingConv::ID CallConv, CCState &CCInfo)
3411       : FuncInfo(FuncInfo), DL(Loc), DAG(DAG), Subtarget(Subtarget),
3412         TheMachineFunction(DAG.getMachineFunction()),
3413         TheFunction(TheMachineFunction.getFunction()),
3414         FrameInfo(TheMachineFunction.getFrameInfo()),
3415         FrameLowering(*Subtarget.getFrameLowering()),
3416         TargLowering(DAG.getTargetLoweringInfo()), CallConv(CallConv),
3417         CCInfo(CCInfo) {}
3418 
3419   // Lower variable argument (vararg) parameters.
3420   void lowerVarArgsParameters(SDValue &Chain, unsigned StackSize);
3421 
3422 private:
3423   void createVarArgAreaAndStoreRegisters(SDValue &Chain, unsigned StackSize);
3424 
3425   void forwardMustTailParameters(SDValue &Chain);
3426 
3427   bool is64Bit() const { return Subtarget.is64Bit(); }
3428   bool isWin64() const { return Subtarget.isCallingConvWin64(CallConv); }
3429 
3430   X86MachineFunctionInfo *FuncInfo;
3431   const SDLoc &DL;
3432   SelectionDAG &DAG;
3433   const X86Subtarget &Subtarget;
3434   MachineFunction &TheMachineFunction;
3435   const Function &TheFunction;
3436   MachineFrameInfo &FrameInfo;
3437   const TargetFrameLowering &FrameLowering;
3438   const TargetLowering &TargLowering;
3439   CallingConv::ID CallConv;
3440   CCState &CCInfo;
3441 };
3442 } // namespace
3443 
3444 void VarArgsLoweringHelper::createVarArgAreaAndStoreRegisters(
3445     SDValue &Chain, unsigned StackSize) {
3446   // If the function takes a variable number of arguments, make a frame index
3447   // for the start of the first vararg value, for the expansion of
3448   // llvm.va_start. We can skip this if there are no va_start calls.
3449   if (is64Bit() || (CallConv != CallingConv::X86_FastCall &&
3450                     CallConv != CallingConv::X86_ThisCall)) {
3451     FuncInfo->setVarArgsFrameIndex(
3452         FrameInfo.CreateFixedObject(1, StackSize, true));
3453   }
3454 
3455   // 64-bit calling conventions support varargs and register parameters, so we
3456   // have to do extra work to spill them in the prologue.
3457   if (is64Bit()) {
3458     // Find the first unallocated GPR and XMM argument registers.
3459     ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
3460     ArrayRef<MCPhysReg> ArgXMMs =
3461         get64BitArgumentXMMs(TheMachineFunction, CallConv, Subtarget);
3462     unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
3463     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
3464 
3465     assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
3466            "SSE register cannot be used when SSE is disabled!");
3467 
3468     if (isWin64()) {
3469       // Get to the caller-allocated home save location.  Add 8 to account
3470       // for the return address.
3471       int HomeOffset = FrameLowering.getOffsetOfLocalArea() + 8;
3472       FuncInfo->setRegSaveFrameIndex(
3473           FrameInfo.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
3474       // Fixup to set vararg frame on shadow area (4 x i64).
3475       if (NumIntRegs < 4)
3476         FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
3477     } else {
3478       // For X86-64, if there are vararg parameters that are passed via
3479       // registers, then we must store them to their spots on the stack so
3480       // they may be loaded by dereferencing the result of va_next.
3481       FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
3482       FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
3483       FuncInfo->setRegSaveFrameIndex(FrameInfo.CreateStackObject(
3484           ArgGPRs.size() * 8 + ArgXMMs.size() * 16, Align(16), false));
3485     }
3486 
3487     SmallVector<SDValue, 6>
3488         LiveGPRs; // list of SDValues for GPR registers holding live input values
3489     SmallVector<SDValue, 8> LiveXMMRegs; // list of SDValues for XMM registers
3490                                          // holding live input values
3491     SDValue ALVal; // if applicable, keeps the SDValue for the %al register
3492 
3493     // Gather all the live in physical registers.
3494     for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
3495       Register GPR = TheMachineFunction.addLiveIn(Reg, &X86::GR64RegClass);
3496       LiveGPRs.push_back(DAG.getCopyFromReg(Chain, DL, GPR, MVT::i64));
3497     }
3498     const auto &AvailableXmms = ArgXMMs.slice(NumXMMRegs);
3499     if (!AvailableXmms.empty()) {
3500       Register AL = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
3501       ALVal = DAG.getCopyFromReg(Chain, DL, AL, MVT::i8);
3502       for (MCPhysReg Reg : AvailableXmms) {
3503         // FastRegisterAllocator spills virtual registers at basic
3504         // block boundaries. That leads to uses of XMM registers
3505         // outside of the check for %al. Pass physical registers to
3506         // VASTART_SAVE_XMM_REGS to avoid unnecessary spilling.
3507         TheMachineFunction.getRegInfo().addLiveIn(Reg);
3508         LiveXMMRegs.push_back(DAG.getRegister(Reg, MVT::v4f32));
3509       }
3510     }
3511 
3512     // Store the integer parameter registers.
3513     SmallVector<SDValue, 8> MemOps;
3514     SDValue RSFIN =
3515         DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
3516                           TargLowering.getPointerTy(DAG.getDataLayout()));
3517     unsigned Offset = FuncInfo->getVarArgsGPOffset();
3518     for (SDValue Val : LiveGPRs) {
3519       SDValue FIN = DAG.getNode(ISD::ADD, DL,
3520                                 TargLowering.getPointerTy(DAG.getDataLayout()),
3521                                 RSFIN, DAG.getIntPtrConstant(Offset, DL));
3522       SDValue Store =
3523           DAG.getStore(Val.getValue(1), DL, Val, FIN,
3524                        MachinePointerInfo::getFixedStack(
3525                            DAG.getMachineFunction(),
3526                            FuncInfo->getRegSaveFrameIndex(), Offset));
3527       MemOps.push_back(Store);
3528       Offset += 8;
3529     }
3530 
3531     // Now store the XMM (fp + vector) parameter registers.
3532     if (!LiveXMMRegs.empty()) {
3533       SmallVector<SDValue, 12> SaveXMMOps;
3534       SaveXMMOps.push_back(Chain);
3535       SaveXMMOps.push_back(ALVal);
3536       SaveXMMOps.push_back(RSFIN);
3537       SaveXMMOps.push_back(
3538           DAG.getTargetConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32));
3539       llvm::append_range(SaveXMMOps, LiveXMMRegs);
3540       MachineMemOperand *StoreMMO =
3541           DAG.getMachineFunction().getMachineMemOperand(
3542               MachinePointerInfo::getFixedStack(
3543                   DAG.getMachineFunction(), FuncInfo->getRegSaveFrameIndex(),
3544                   Offset),
3545               MachineMemOperand::MOStore, 128, Align(16));
3546       MemOps.push_back(DAG.getMemIntrinsicNode(X86ISD::VASTART_SAVE_XMM_REGS,
3547                                                DL, DAG.getVTList(MVT::Other),
3548                                                SaveXMMOps, MVT::i8, StoreMMO));
3549     }
3550 
3551     if (!MemOps.empty())
3552       Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
3553   }
3554 }
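// For illustration (a sketch of the SysV x86-64 register save area that the
// code above creates; the sizes come from the ABI, they are not recomputed
// here): the stack object is 6*8 + 8*16 = 176 bytes, laid out as
//   [ RDI RSI RDX RCX R8 R9 | XMM0 ... XMM7 ]
// and va_arg walks it using the gp_offset (0..48) and fp_offset (48..176)
// fields of va_list, which correspond to the VarArgsGPOffset/VarArgsFPOffset
// values set above.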
3555 
3556 void VarArgsLoweringHelper::forwardMustTailParameters(SDValue &Chain) {
3557   // Find the largest legal vector type.
3558   MVT VecVT = MVT::Other;
3559   // FIXME: Only some x86_32 calling conventions support AVX512.
3560   if (Subtarget.useAVX512Regs() &&
3561       (is64Bit() || (CallConv == CallingConv::X86_VectorCall ||
3562                      CallConv == CallingConv::Intel_OCL_BI)))
3563     VecVT = MVT::v16f32;
3564   else if (Subtarget.hasAVX())
3565     VecVT = MVT::v8f32;
3566   else if (Subtarget.hasSSE2())
3567     VecVT = MVT::v4f32;
3568 
3569   // We forward some GPRs and some vector types.
3570   SmallVector<MVT, 2> RegParmTypes;
3571   MVT IntVT = is64Bit() ? MVT::i64 : MVT::i32;
3572   RegParmTypes.push_back(IntVT);
3573   if (VecVT != MVT::Other)
3574     RegParmTypes.push_back(VecVT);
3575 
3576   // Compute the set of forwarded registers. The rest are scratch.
3577   SmallVectorImpl<ForwardedRegister> &Forwards =
3578       FuncInfo->getForwardedMustTailRegParms();
3579   CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
3580 
3581   // Forward AL for SysV x86_64 targets, since it is used for varargs.
3582   if (is64Bit() && !isWin64() && !CCInfo.isAllocated(X86::AL)) {
3583     Register ALVReg = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
3584     Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
3585   }
3586 
3587   // Copy all forwards from physical to virtual registers.
3588   for (ForwardedRegister &FR : Forwards) {
3589     // FIXME: Can we use a less constrained schedule?
3590     SDValue RegVal = DAG.getCopyFromReg(Chain, DL, FR.VReg, FR.VT);
3591     FR.VReg = TheMachineFunction.getRegInfo().createVirtualRegister(
3592         TargLowering.getRegClassFor(FR.VT));
3593     Chain = DAG.getCopyToReg(Chain, DL, FR.VReg, RegVal);
3594   }
3595 }
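// For illustration (hypothetical IR, not taken from this file): a vararg
// forwarding thunk such as
//   define i32 @thunk(i8* %this, ...) {
//     %r = musttail call i32 (i8*, ...) @impl(i8* %this, ...)
//     ret i32 %r
//   }
// never calls va_start, so the registers that might hold variadic arguments
// (the argument GPRs, the vector registers and AL on SysV x86-64) are copied
// into virtual registers here and handed back to the musttail call site
// unchanged.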
3596 
3597 void VarArgsLoweringHelper::lowerVarArgsParameters(SDValue &Chain,
3598                                                    unsigned StackSize) {
3599   // Set FrameIndex to the 0xAAAAAAA value to mark the unset state.
3600   // If necessary, it will be set to the correct value later.
3601   FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
3602   FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
3603 
3604   if (FrameInfo.hasVAStart())
3605     createVarArgAreaAndStoreRegisters(Chain, StackSize);
3606 
3607   if (FrameInfo.hasMustTailInVarArgFunc())
3608     forwardMustTailParameters(Chain);
3609 }
3610 
3611 SDValue X86TargetLowering::LowerFormalArguments(
3612     SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
3613     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3614     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3615   MachineFunction &MF = DAG.getMachineFunction();
3616   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
3617 
3618   const Function &F = MF.getFunction();
3619   if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() &&
3620       F.getName() == "main")
3621     FuncInfo->setForceFramePointer(true);
3622 
3623   MachineFrameInfo &MFI = MF.getFrameInfo();
3624   bool Is64Bit = Subtarget.is64Bit();
3625   bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
3626 
3627   assert(
3628       !(IsVarArg && canGuaranteeTCO(CallConv)) &&
3629       "Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
3630 
3631   // Assign locations to all of the incoming arguments.
3632   SmallVector<CCValAssign, 16> ArgLocs;
3633   CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
3634 
3635   // Allocate shadow area for Win64.
3636   if (IsWin64)
3637     CCInfo.AllocateStack(32, Align(8));
3638 
3639   CCInfo.AnalyzeArguments(Ins, CC_X86);
3640 
3641   // In the vectorcall calling convention, a second pass is required for the
3642   // HVA types.
3643   if (CallingConv::X86_VectorCall == CallConv) {
3644     CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
3645   }
3646 
3647   // The next loop assumes that the locations are in the same order as the
3648   // input arguments.
3649   assert(isSortedByValueNo(ArgLocs) &&
3650          "Argument Location list must be sorted before lowering");
3651 
3652   SDValue ArgValue;
3653   for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
3654        ++I, ++InsIndex) {
3655     assert(InsIndex < Ins.size() && "Invalid Ins index");
3656     CCValAssign &VA = ArgLocs[I];
3657 
3658     if (VA.isRegLoc()) {
3659       EVT RegVT = VA.getLocVT();
3660       if (VA.needsCustom()) {
3661         assert(
3662             VA.getValVT() == MVT::v64i1 &&
3663             "Currently the only custom case is when we split v64i1 to 2 regs");
3664 
3665         // In the regcall calling convention, v64i1 values that are compiled
3666         // for a 32-bit arch are split up into two registers.
3667         ArgValue =
3668             getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
3669       } else {
3670         const TargetRegisterClass *RC;
3671         if (RegVT == MVT::i8)
3672           RC = &X86::GR8RegClass;
3673         else if (RegVT == MVT::i16)
3674           RC = &X86::GR16RegClass;
3675         else if (RegVT == MVT::i32)
3676           RC = &X86::GR32RegClass;
3677         else if (Is64Bit && RegVT == MVT::i64)
3678           RC = &X86::GR64RegClass;
3679         else if (RegVT == MVT::f32)
3680           RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
3681         else if (RegVT == MVT::f64)
3682           RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
3683         else if (RegVT == MVT::f80)
3684           RC = &X86::RFP80RegClass;
3685         else if (RegVT == MVT::f128)
3686           RC = &X86::VR128RegClass;
3687         else if (RegVT.is512BitVector())
3688           RC = &X86::VR512RegClass;
3689         else if (RegVT.is256BitVector())
3690           RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
3691         else if (RegVT.is128BitVector())
3692           RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
3693         else if (RegVT == MVT::x86mmx)
3694           RC = &X86::VR64RegClass;
3695         else if (RegVT == MVT::v1i1)
3696           RC = &X86::VK1RegClass;
3697         else if (RegVT == MVT::v8i1)
3698           RC = &X86::VK8RegClass;
3699         else if (RegVT == MVT::v16i1)
3700           RC = &X86::VK16RegClass;
3701         else if (RegVT == MVT::v32i1)
3702           RC = &X86::VK32RegClass;
3703         else if (RegVT == MVT::v64i1)
3704           RC = &X86::VK64RegClass;
3705         else
3706           llvm_unreachable("Unknown argument type!");
3707 
3708         Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
3709         ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
3710       }
3711 
3712       // If this is an 8 or 16-bit value, it is really passed promoted to 32
3713       // bits.  Insert an assert[sz]ext to capture this, then truncate to the
3714       // right size.
3715       if (VA.getLocInfo() == CCValAssign::SExt)
3716         ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
3717                                DAG.getValueType(VA.getValVT()));
3718       else if (VA.getLocInfo() == CCValAssign::ZExt)
3719         ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
3720                                DAG.getValueType(VA.getValVT()));
3721       else if (VA.getLocInfo() == CCValAssign::BCvt)
3722         ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
3723 
3724       if (VA.isExtInLoc()) {
3725         // Handle MMX values passed in XMM regs.
3726         if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
3727           ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
3728         else if (VA.getValVT().isVector() &&
3729                  VA.getValVT().getScalarType() == MVT::i1 &&
3730                  ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
3731                   (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
3732           // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
3733           ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
3734         } else
3735           ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
3736       }
3737     } else {
3738       assert(VA.isMemLoc());
3739       ArgValue =
3740           LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
3741     }
3742 
3743     // If value is passed via pointer - do a load.
3744     if (VA.getLocInfo() == CCValAssign::Indirect && !Ins[I].Flags.isByVal())
3745       ArgValue =
3746           DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
3747 
3748     InVals.push_back(ArgValue);
3749   }
3750 
3751   for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
3752     if (Ins[I].Flags.isSwiftAsync()) {
3753       auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
3754       if (Subtarget.is64Bit())
3755         X86FI->setHasSwiftAsyncContext(true);
3756       else {
3757         int FI = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
3758         X86FI->setSwiftAsyncContextFrameIdx(FI);
3759         SDValue St = DAG.getStore(DAG.getEntryNode(), dl, InVals[I],
3760                                   DAG.getFrameIndex(FI, MVT::i32),
3761                                   MachinePointerInfo::getFixedStack(MF, FI));
3762         Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, St, Chain);
3763       }
3764     }
3765 
3766     // The Swift calling convention does not require us to copy the sret
3767     // argument into %rax/%eax for the return; we don't set SRetReturnReg for Swift.
3768     if (CallConv == CallingConv::Swift || CallConv == CallingConv::SwiftTail)
3769       continue;
3770 
3771     // All x86 ABIs require that for returning structs by value we copy the
3772     // sret argument into %rax/%eax (depending on ABI) for the return. Save
3773     // the argument into a virtual register so that we can access it from the
3774     // return points.
3775     if (Ins[I].Flags.isSRet()) {
3776       Register Reg = FuncInfo->getSRetReturnReg();
3777       if (!Reg) {
3778         MVT PtrTy = getPointerTy(DAG.getDataLayout());
3779         Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
3780         FuncInfo->setSRetReturnReg(Reg);
3781       }
3782       SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
3783       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
3784       break;
3785     }
3786   }
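// For illustration (hypothetical IR, not taken from this file): for a
// function such as
//   define void @make(%struct.Big* sret(%struct.Big) %out, i32 %v)
// the incoming %out pointer is stashed in the SRetReturnReg virtual register
// created above, so that the return lowering can place it back in %rax
// (%eax on 32-bit) as the x86 ABIs require.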
3787 
3788   unsigned StackSize = CCInfo.getNextStackOffset();
3789   // Align stack specially for tail calls.
3790   if (shouldGuaranteeTCO(CallConv,
3791                          MF.getTarget().Options.GuaranteedTailCallOpt))
3792     StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
3793 
3794   if (IsVarArg)
3795     VarArgsLoweringHelper(FuncInfo, dl, DAG, Subtarget, CallConv, CCInfo)
3796         .lowerVarArgsParameters(Chain, StackSize);
3797 
3798   // Some CCs need callee pop.
3799   if (X86::isCalleePop(CallConv, Is64Bit, IsVarArg,
3800                        MF.getTarget().Options.GuaranteedTailCallOpt)) {
3801     FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
3802   } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
3803     // X86 interrupts must pop the error code (and the alignment padding) if
3804     // present.
3805     FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
3806   } else {
3807     FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
3808     // If this is an sret function, the return should pop the hidden pointer.
3809     if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3810         !Subtarget.getTargetTriple().isOSMSVCRT() &&
3811         argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
3812       FuncInfo->setBytesToPopOnReturn(4);
3813   }
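// For illustration (an assumption about the common 32-bit stdcall case, not
// code taken from this file): for
//   define x86_stdcallcc void @f(i32 %a, i32 %b)
// X86::isCalleePop returns true, BytesToPopOnReturn becomes 8, and the
// epilogue later emits `ret 8` instead of a plain `ret`.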
3814 
3815   if (!Is64Bit) {
3816     // RegSaveFrameIndex is X86-64 only.
3817     FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
3818   }
3819 
3820   FuncInfo->setArgumentStackSize(StackSize);
3821 
3822   if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
3823     EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn());
3824     if (Personality == EHPersonality::CoreCLR) {
3825       assert(Is64Bit);
3826       // TODO: Add a mechanism to frame lowering that will allow us to indicate
3827       // that we'd prefer this slot be allocated towards the bottom of the frame
3828       // (i.e. near the stack pointer after allocating the frame).  Every
3829       // funclet needs a copy of this slot in its (mostly empty) frame, and the
3830       // offset from the bottom of this and each funclet's frame must be the
3831       // same, so the size of funclets' (mostly empty) frames is dictated by
3832       // how far this slot is from the bottom (since they allocate just enough
3833       // space to accommodate holding this slot at the correct offset).
3834       int PSPSymFI = MFI.CreateStackObject(8, Align(8), /*isSpillSlot=*/false);
3835       EHInfo->PSPSymFrameIdx = PSPSymFI;
3836     }
3837   }
3838 
3839   if (CallConv == CallingConv::X86_RegCall ||
3840       F.hasFnAttribute("no_caller_saved_registers")) {
3841     MachineRegisterInfo &MRI = MF.getRegInfo();
3842     for (std::pair<Register, Register> Pair : MRI.liveins())
3843       MRI.disableCalleeSavedRegister(Pair.first);
3844   }
3845 
3846   return Chain;
3847 }
3848 
3849 SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
3850                                             SDValue Arg, const SDLoc &dl,
3851                                             SelectionDAG &DAG,
3852                                             const CCValAssign &VA,
3853                                             ISD::ArgFlagsTy Flags,
3854                                             bool isByVal) const {
3855   unsigned LocMemOffset = VA.getLocMemOffset();
3856   SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
3857   PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3858                        StackPtr, PtrOff);
3859   if (isByVal)
3860     return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
3861 
3862   return DAG.getStore(
3863       Chain, dl, Arg, PtrOff,
3864       MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
3865 }
3866 
3867 /// Emit a load of the return address if tail call
3868 /// optimization is performed and it is required.
3869 SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
3870     SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
3871     bool Is64Bit, int FPDiff, const SDLoc &dl) const {
3872   // Adjust the Return address stack slot.
3873   EVT VT = getPointerTy(DAG.getDataLayout());
3874   OutRetAddr = getReturnAddressFrameIndex(DAG);
3875 
3876   // Load the "old" Return address.
3877   OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
3878   return SDValue(OutRetAddr.getNode(), 1);
3879 }
3880 
3881 /// Emit a store of the return address if tail call
3882 /// optimization is performed and it is required (FPDiff!=0).
3883 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
3884                                         SDValue Chain, SDValue RetAddrFrIdx,
3885                                         EVT PtrVT, unsigned SlotSize,
3886                                         int FPDiff, const SDLoc &dl) {
3887   // Store the return address to the appropriate stack slot.
3888   if (!FPDiff) return Chain;
3889   // Calculate the new stack slot for the return address.
3890   int NewReturnAddrFI =
3891     MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
3892                                          false);
3893   SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
3894   Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
3895                        MachinePointerInfo::getFixedStack(
3896                            DAG.getMachineFunction(), NewReturnAddrFI));
3897   return Chain;
3898 }
3899 
3900 /// Returns a vector_shuffle mask for a movs{s|d} or movd
3901 /// operation of the specified width.
3902 static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
3903                        SDValue V2) {
3904   unsigned NumElems = VT.getVectorNumElements();
3905   SmallVector<int, 8> Mask;
3906   Mask.push_back(NumElems);
3907   for (unsigned i = 1; i != NumElems; ++i)
3908     Mask.push_back(i);
3909   return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
3910 }
3911 
3912 SDValue
3913 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
3914                              SmallVectorImpl<SDValue> &InVals) const {
3915   SelectionDAG &DAG                     = CLI.DAG;
3916   SDLoc &dl                             = CLI.DL;
3917   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
3918   SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
3919   SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
3920   SDValue Chain                         = CLI.Chain;
3921   SDValue Callee                        = CLI.Callee;
3922   CallingConv::ID CallConv              = CLI.CallConv;
3923   bool &isTailCall                      = CLI.IsTailCall;
3924   bool isVarArg                         = CLI.IsVarArg;
3925   const auto *CB                        = CLI.CB;
3926 
3927   MachineFunction &MF = DAG.getMachineFunction();
3928   bool Is64Bit        = Subtarget.is64Bit();
3929   bool IsWin64        = Subtarget.isCallingConvWin64(CallConv);
3930   StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
3931   bool IsSibcall      = false;
3932   bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt ||
3933       CallConv == CallingConv::Tail || CallConv == CallingConv::SwiftTail;
3934   X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
3935   bool HasNCSR = (CB && isa<CallInst>(CB) &&
3936                   CB->hasFnAttr("no_caller_saved_registers"));
3937   bool HasNoCfCheck = (CB && CB->doesNoCfCheck());
3938   bool IsIndirectCall = (CB && isa<CallInst>(CB) && CB->isIndirectCall());
3939   const Module *M = MF.getMMI().getModule();
3940   Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
3941 
3942   MachineFunction::CallSiteInfo CSInfo;
3943   if (CallConv == CallingConv::X86_INTR)
3944     report_fatal_error("X86 interrupts may not be called directly");
3945 
3946   bool IsMustTail = CLI.CB && CLI.CB->isMustTailCall();
3947   if (Subtarget.isPICStyleGOT() && !IsGuaranteeTCO && !IsMustTail) {
3948     // If we are using a GOT, disable tail calls to external symbols with
3949     // default visibility. Tail calling such a symbol requires using a GOT
3950     // relocation, which forces early binding of the symbol. This breaks code
3951     // that requires lazy function symbol resolution. Using musttail or
3952     // GuaranteedTailCallOpt will override this.
3953     GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3954     if (!G || (!G->getGlobal()->hasLocalLinkage() &&
3955                G->getGlobal()->hasDefaultVisibility()))
3956       isTailCall = false;
3957   }
3958 
3959 
3960   if (isTailCall && !IsMustTail) {
3961     // Check if it's really possible to do a tail call.
3962     isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
3963                     isVarArg, SR != NotStructReturn,
3964                     MF.getFunction().hasStructRetAttr(), CLI.RetTy,
3965                     Outs, OutVals, Ins, DAG);
3966 
3967     // Sibcalls are automatically detected tailcalls which do not require
3968     // ABI changes.
3969     if (!IsGuaranteeTCO && isTailCall)
3970       IsSibcall = true;
3971 
3972     if (isTailCall)
3973       ++NumTailCalls;
3974   }
3975 
3976   if (IsMustTail && !isTailCall)
3977     report_fatal_error("failed to perform tail call elimination on a call "
3978                        "site marked musttail");
3979 
3980   assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
3981          "Var args not supported with calling convention fastcc, ghc or hipe");
3982 
3983   // Analyze operands of the call, assigning locations to each operand.
3984   SmallVector<CCValAssign, 16> ArgLocs;
3985   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
3986 
3987   // Allocate shadow area for Win64.
3988   if (IsWin64)
3989     CCInfo.AllocateStack(32, Align(8));
3990 
3991   CCInfo.AnalyzeArguments(Outs, CC_X86);
3992 
3993   // In the vectorcall calling convention, a second pass is required for the
3994   // HVA types.
3995   if (CallingConv::X86_VectorCall == CallConv) {
3996     CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
3997   }
3998 
3999   // Get a count of how many bytes are to be pushed on the stack.
4000   unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
4001   if (IsSibcall)
4002     // This is a sibcall. The memory operands are already available in the
4003     // caller's incoming argument area (its own caller's stack).
4004     NumBytes = 0;
4005   else if (IsGuaranteeTCO && canGuaranteeTCO(CallConv))
4006     NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
4007 
4008   int FPDiff = 0;
4009   if (isTailCall &&
4010       shouldGuaranteeTCO(CallConv,
4011                          MF.getTarget().Options.GuaranteedTailCallOpt)) {
4012     // Lower arguments at fp - stackoffset + fpdiff.
4013     unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
4014 
4015     FPDiff = NumBytesCallerPushed - NumBytes;
4016 
4017     // Set the delta of movement of the returnaddr stackslot.
4018     // But only update it if the new delta is smaller than the previous delta.
4019     if (FPDiff < X86Info->getTCReturnAddrDelta())
4020       X86Info->setTCReturnAddrDelta(FPDiff);
4021   }
4022 
4023   unsigned NumBytesToPush = NumBytes;
4024   unsigned NumBytesToPop = NumBytes;
4025 
4026   // If we have an inalloca argument, all stack space has already been allocated
4027   // for us and is right at the top of the stack. We don't support multiple
4028   // arguments passed in memory when using inalloca.
4029   if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
4030     NumBytesToPush = 0;
4031     if (!ArgLocs.back().isMemLoc())
4032       report_fatal_error("cannot use inalloca attribute on a register "
4033                          "parameter");
4034     if (ArgLocs.back().getLocMemOffset() != 0)
4035       report_fatal_error("any parameter with the inalloca attribute must be "
4036                          "the only memory argument");
4037   } else if (CLI.IsPreallocated) {
4038     assert(ArgLocs.back().isMemLoc() &&
4039            "cannot use preallocated attribute on a register "
4040            "parameter");
4041     SmallVector<size_t, 4> PreallocatedOffsets;
4042     for (size_t i = 0; i < CLI.OutVals.size(); ++i) {
4043       if (CLI.CB->paramHasAttr(i, Attribute::Preallocated)) {
4044         PreallocatedOffsets.push_back(ArgLocs[i].getLocMemOffset());
4045       }
4046     }
4047     auto *MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
4048     size_t PreallocatedId = MFI->getPreallocatedIdForCallSite(CLI.CB);
4049     MFI->setPreallocatedStackSize(PreallocatedId, NumBytes);
4050     MFI->setPreallocatedArgOffsets(PreallocatedId, PreallocatedOffsets);
4051     NumBytesToPush = 0;
4052   }
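// For illustration (an assumption about the typical 32-bit MSVC use of
// inalloca, not code taken from this file): when an argument has a
// non-trivial copy constructor, e.g.
//   void f(NonTrivial x);  f(NonTrivial());
// clang allocates the whole outgoing argument block with `alloca inalloca`
// and constructs the temporary directly in it before the call, so by the
// time we get here the bytes are already on the stack and NumBytesToPush is
// forced to 0 above.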
4053 
4054   if (!IsSibcall && !IsMustTail)
4055     Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
4056                                  NumBytes - NumBytesToPush, dl);
4057 
4058   SDValue RetAddrFrIdx;
4059   // Load return address for tail calls.
4060   if (isTailCall && FPDiff)
4061     Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
4062                                     Is64Bit, FPDiff, dl);
4063 
4064   SmallVector<std::pair<Register, SDValue>, 8> RegsToPass;
4065   SmallVector<SDValue, 8> MemOpChains;
4066   SDValue StackPtr;
4067 
4068   // The next loop assumes that the locations are in the same order as the
4069   // input arguments.
4070   assert(isSortedByValueNo(ArgLocs) &&
4071          "Argument Location list must be sorted before lowering");
4072 
4073   // Walk the register/memloc assignments, inserting copies/loads. In the case
4074   // of tail call optimization, arguments are handled later.
4075   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4076   for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
4077        ++I, ++OutIndex) {
4078     assert(OutIndex < Outs.size() && "Invalid Out index");
4079     // Skip inalloca/preallocated arguments, they have already been written.
4080     ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
4081     if (Flags.isInAlloca() || Flags.isPreallocated())
4082       continue;
4083 
4084     CCValAssign &VA = ArgLocs[I];
4085     EVT RegVT = VA.getLocVT();
4086     SDValue Arg = OutVals[OutIndex];
4087     bool isByVal = Flags.isByVal();
4088 
4089     // Promote the value if needed.
4090     switch (VA.getLocInfo()) {
4091     default: llvm_unreachable("Unknown loc info!");
4092     case CCValAssign::Full: break;
4093     case CCValAssign::SExt:
4094       Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
4095       break;
4096     case CCValAssign::ZExt:
4097       Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
4098       break;
4099     case CCValAssign::AExt:
4100       if (Arg.getValueType().isVector() &&
4101           Arg.getValueType().getVectorElementType() == MVT::i1)
4102         Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
4103       else if (RegVT.is128BitVector()) {
4104         // Special case: passing MMX values in XMM registers.
4105         Arg = DAG.getBitcast(MVT::i64, Arg);
4106         Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
4107         Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
4108       } else
4109         Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
4110       break;
4111     case CCValAssign::BCvt:
4112       Arg = DAG.getBitcast(RegVT, Arg);
4113       break;
4114     case CCValAssign::Indirect: {
4115       if (isByVal) {
4116         // Memcpy the argument to a temporary stack slot to prevent
4117         // the caller from seeing any modifications the callee may make
4118         // as guaranteed by the `byval` attribute.
4119         int FrameIdx = MF.getFrameInfo().CreateStackObject(
4120             Flags.getByValSize(),
4121             std::max(Align(16), Flags.getNonZeroByValAlign()), false);
4122         SDValue StackSlot =
4123             DAG.getFrameIndex(FrameIdx, getPointerTy(DAG.getDataLayout()));
4124         Chain =
4125             CreateCopyOfByValArgument(Arg, StackSlot, Chain, Flags, DAG, dl);
4126         // From now on treat this as a regular pointer
4127         Arg = StackSlot;
4128         isByVal = false;
4129       } else {
4130         // Store the argument.
4131         SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
4132         int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
4133         Chain = DAG.getStore(
4134             Chain, dl, Arg, SpillSlot,
4135             MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
4136         Arg = SpillSlot;
4137       }
4138       break;
4139     }
4140     }
4141 
4142     if (VA.needsCustom()) {
4143       assert(VA.getValVT() == MVT::v64i1 &&
4144              "Currently the only custom case is when we split v64i1 to 2 regs");
4145       // Split v64i1 value into two registers
4146       Passv64i1ArgInRegs(dl, DAG, Arg, RegsToPass, VA, ArgLocs[++I], Subtarget);
4147     } else if (VA.isRegLoc()) {
4148       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
4149       const TargetOptions &Options = DAG.getTarget().Options;
4150       if (Options.EmitCallSiteInfo)
4151         CSInfo.emplace_back(VA.getLocReg(), I);
4152       if (isVarArg && IsWin64) {
4153         // Win64 ABI requires argument XMM reg to be copied to the corresponding
4154         // shadow reg if callee is a varargs function.
4155         Register ShadowReg;
4156         switch (VA.getLocReg()) {
4157         case X86::XMM0: ShadowReg = X86::RCX; break;
4158         case X86::XMM1: ShadowReg = X86::RDX; break;
4159         case X86::XMM2: ShadowReg = X86::R8; break;
4160         case X86::XMM3: ShadowReg = X86::R9; break;
4161         }
4162         if (ShadowReg)
4163           RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
4164       }
4165     } else if (!IsSibcall && (!isTailCall || isByVal)) {
4166       assert(VA.isMemLoc());
4167       if (!StackPtr.getNode())
4168         StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
4169                                       getPointerTy(DAG.getDataLayout()));
4170       MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
4171                                              dl, DAG, VA, Flags, isByVal));
4172     }
4173   }
4174 
4175   if (!MemOpChains.empty())
4176     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
4177 
4178   if (Subtarget.isPICStyleGOT()) {
4179     // ELF / PIC requires the GOT pointer to be in the EBX register before
4180     // function calls via the PLT (except for regcall).
4181     if (!isTailCall) {
4182       // An indirect call with the RegCall calling convention may use up all
4183       // the general-purpose registers, so it is not suitable to bind the EBX
4184       // register to the GOT address; just let the register allocator handle it.
4185       if (CallConv != CallingConv::X86_RegCall)
4186         RegsToPass.push_back(std::make_pair(
4187           Register(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
4188                                           getPointerTy(DAG.getDataLayout()))));
4189     } else {
4190       // If we are tail calling and generating PIC/GOT style code load the
4191       // address of the callee into ECX. The value in ecx is used as target of
4192       // the tail jump. This is done to circumvent the ebx/callee-saved problem
4193       // for tail calls on PIC/GOT architectures. Normally we would just put the
4194       // address of GOT into ebx and then call target@PLT. But for tail calls
4195       // ebx would be restored (since ebx is callee saved) before jumping to the
4196       // target@PLT.
4197 
4198       // Note: The actual moving to ECX is done further down.
4199       GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
4200       if (G && !G->getGlobal()->hasLocalLinkage() &&
4201           G->getGlobal()->hasDefaultVisibility())
4202         Callee = LowerGlobalAddress(Callee, DAG);
4203       else if (isa<ExternalSymbolSDNode>(Callee))
4204         Callee = LowerExternalSymbol(Callee, DAG);
4205     }
4206   }
4207 
4208   if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
4209     // From the AMD64 ABI document:
4210     // For calls that may call functions that use varargs or stdargs
4211     // (prototype-less calls or calls to functions containing ellipsis (...) in
4212     // the declaration) %al is used as a hidden argument to specify the number
4213     // of SSE registers used. The contents of %al do not need to match exactly
4214     // the number of registers, but must be an upper bound on the number of SSE
4215     // registers used and must be in the range 0 - 8 inclusive.
4216 
4217     // Count the number of XMM registers allocated.
4218     static const MCPhysReg XMMArgRegs[] = {
4219       X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
4220       X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
4221     };
4222     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
4223     assert((Subtarget.hasSSE1() || !NumXMMRegs)
4224            && "SSE registers cannot be used when SSE is disabled");
4225     RegsToPass.push_back(std::make_pair(Register(X86::AL),
4226                                         DAG.getConstant(NumXMMRegs, dl,
4227                                                         MVT::i8)));
4228   }
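// For illustration (an assumption about a typical SysV x86-64 vararg call,
// not code taken from this file): for
//   printf("%f", 1.0);
// the double is passed in XMM0, so CCInfo reports one allocated XMM register
// and the code above copies the constant 1 into AL before the call; any value
// between 1 and 8 would also satisfy the "upper bound" rule quoted above.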
4229 
4230   if (isVarArg && IsMustTail) {
4231     const auto &Forwards = X86Info->getForwardedMustTailRegParms();
4232     for (const auto &F : Forwards) {
4233       SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
4234       RegsToPass.push_back(std::make_pair(F.PReg, Val));
4235     }
4236   }
4237 
4238   // For tail calls lower the arguments to the 'real' stack slots.  Sibcalls
4239   // don't need this because the eligibility check rejects calls that require
4240   // shuffling arguments passed in memory.
4241   if (!IsSibcall && isTailCall) {
4242     // Force all the incoming stack arguments to be loaded from the stack
4243     // before any new outgoing arguments are stored to the stack, because the
4244     // outgoing stack slots may alias the incoming argument stack slots, and
4245     // the alias isn't otherwise explicit. This is slightly more conservative
4246     // than necessary, because it means that each store effectively depends
4247     // on every argument instead of just those arguments it would clobber.
4248     SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
4249 
4250     SmallVector<SDValue, 8> MemOpChains2;
4251     SDValue FIN;
4252     int FI = 0;
4253     for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
4254          ++I, ++OutsIndex) {
4255       CCValAssign &VA = ArgLocs[I];
4256 
4257       if (VA.isRegLoc()) {
4258         if (VA.needsCustom()) {
4259           assert((CallConv == CallingConv::X86_RegCall) &&
4260                  "Expecting custom case only in regcall calling convention");
4261           // This means that we are in a special case where one argument was
4262           // passed in two register locations - skip the next location.
4263           ++I;
4264         }
4265 
4266         continue;
4267       }
4268 
4269       assert(VA.isMemLoc());
4270       SDValue Arg = OutVals[OutsIndex];
4271       ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
4272       // Skip inalloca/preallocated arguments.  They don't require any work.
4273       if (Flags.isInAlloca() || Flags.isPreallocated())
4274         continue;
4275       // Create frame index.
4276       int32_t Offset = VA.getLocMemOffset()+FPDiff;
4277       uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
4278       FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
4279       FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
4280 
4281       if (Flags.isByVal()) {
4282         // Copy relative to framepointer.
4283         SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
4284         if (!StackPtr.getNode())
4285           StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
4286                                         getPointerTy(DAG.getDataLayout()));
4287         Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
4288                              StackPtr, Source);
4289 
4290         MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
4291                                                          ArgChain,
4292                                                          Flags, DAG, dl));
4293       } else {
4294         // Store relative to framepointer.
4295         MemOpChains2.push_back(DAG.getStore(
4296             ArgChain, dl, Arg, FIN,
4297             MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
4298       }
4299     }
4300 
4301     if (!MemOpChains2.empty())
4302       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
4303 
4304     // Store the return address to the appropriate stack slot.
4305     Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
4306                                      getPointerTy(DAG.getDataLayout()),
4307                                      RegInfo->getSlotSize(), FPDiff, dl);
4308   }
4309 
4310   // Build a sequence of copy-to-reg nodes chained together with token chain
4311   // and flag operands which copy the outgoing args into registers.
4312   SDValue InFlag;
4313   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
4314     Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
4315                              RegsToPass[i].second, InFlag);
4316     InFlag = Chain.getValue(1);
4317   }
4318 
4319   if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
4320     assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
4321     // In the 64-bit large code model, we have to make all calls
4322     // through a register, since the call instruction's 32-bit
4323     // pc-relative offset may not be large enough to hold the whole
4324     // address.
4325   } else if (Callee->getOpcode() == ISD::GlobalAddress ||
4326              Callee->getOpcode() == ISD::ExternalSymbol) {
4327     // Lower direct calls to global addresses and external symbols. Setting
4328     // ForCall to true here has the effect of removing WrapperRIP when possible
4329     // to allow direct calls to be selected without first materializing the
4330     // address into a register.
4331     Callee = LowerGlobalOrExternal(Callee, DAG, /*ForCall=*/true);
4332   } else if (Subtarget.isTarget64BitILP32() &&
4333              Callee->getValueType(0) == MVT::i32) {
4334     // Zero-extend the 32-bit Callee address to 64 bits, per the x32 ABI.
4335     Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
4336   }
4337 
4338   // Returns a chain & a flag for retval copy to use.
4339   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
4340   SmallVector<SDValue, 8> Ops;
4341 
4342   if (!IsSibcall && isTailCall && !IsMustTail) {
4343     Chain = DAG.getCALLSEQ_END(Chain,
4344                                DAG.getIntPtrConstant(NumBytesToPop, dl, true),
4345                                DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
4346     InFlag = Chain.getValue(1);
4347   }
4348 
4349   Ops.push_back(Chain);
4350   Ops.push_back(Callee);
4351 
4352   if (isTailCall)
4353     Ops.push_back(DAG.getTargetConstant(FPDiff, dl, MVT::i32));
4354 
4355   // Add argument registers to the end of the list so that they are known live
4356   // into the call.
4357   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
4358     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
4359                                   RegsToPass[i].second.getValueType()));
4360 
4361   // Add a register mask operand representing the call-preserved registers.
4362   const uint32_t *Mask = [&]() {
4363     auto AdaptedCC = CallConv;
4364     // If HasNCSR is asserted (attribute NoCallerSavedRegisters exists),
4365     // use X86_INTR calling convention because it has the same CSR mask
4366     // (same preserved registers).
4367     if (HasNCSR)
4368       AdaptedCC = (CallingConv::ID)CallingConv::X86_INTR;
4369     // If NoCalleeSavedRegisters is requested, then use GHC since it happens
4370     // to use the CSR_NoRegs_RegMask.
4371     if (CB && CB->hasFnAttr("no_callee_saved_registers"))
4372       AdaptedCC = (CallingConv::ID)CallingConv::GHC;
4373     return RegInfo->getCallPreservedMask(MF, AdaptedCC);
4374   }();
4375   assert(Mask && "Missing call preserved mask for calling convention");
4376 
4377   // If this is an invoke in a 32-bit function using a funclet-based
4378   // personality, assume the function clobbers all registers. If an exception
4379   // is thrown, the runtime will not restore CSRs.
4380   // FIXME: Model this more precisely so that we can register allocate across
4381   // the normal edge and spill and fill across the exceptional edge.
4382   if (!Is64Bit && CLI.CB && isa<InvokeInst>(CLI.CB)) {
4383     const Function &CallerFn = MF.getFunction();
4384     EHPersonality Pers =
4385         CallerFn.hasPersonalityFn()
4386             ? classifyEHPersonality(CallerFn.getPersonalityFn())
4387             : EHPersonality::Unknown;
4388     if (isFuncletEHPersonality(Pers))
4389       Mask = RegInfo->getNoPreservedMask();
4390   }
4391 
4392   // Define a new register mask from the existing mask.
4393   uint32_t *RegMask = nullptr;
4394 
4395   // In some calling conventions we need to remove the used physical registers
4396   // from the reg mask.
4397   if (CallConv == CallingConv::X86_RegCall || HasNCSR) {
4398     const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
4399 
4400     // Allocate a new Reg Mask and copy Mask.
4401     RegMask = MF.allocateRegMask();
4402     unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs());
4403     memcpy(RegMask, Mask, sizeof(RegMask[0]) * RegMaskSize);
4404 
4405     // Make sure all sub registers of the argument registers are reset
4406     // in the RegMask.
4407     for (auto const &RegPair : RegsToPass)
4408       for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true);
4409            SubRegs.isValid(); ++SubRegs)
4410         RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
4411 
4412     // Create the RegMask Operand according to our updated mask.
4413     Ops.push_back(DAG.getRegisterMask(RegMask));
4414   } else {
4415     // Create the RegMask Operand according to the static mask.
4416     Ops.push_back(DAG.getRegisterMask(Mask));
4417   }
4418 
4419   if (InFlag.getNode())
4420     Ops.push_back(InFlag);
4421 
4422   if (isTailCall) {
4423     // We used to do:
4424     //// If this is the first return lowered for this function, add the regs
4425     //// to the liveout set for the function.
4426     // This isn't right, although it's probably harmless on x86; liveouts
4427     // should be computed from returns not tail calls.  Consider a void
4428     // function making a tail call to a function returning int.
4429     MF.getFrameInfo().setHasTailCall();
4430     SDValue Ret = DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
4431     DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
4432     return Ret;
4433   }
4434 
4435   if (HasNoCfCheck && IsCFProtectionSupported && IsIndirectCall) {
4436     Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops);
4437   } else if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
4438     // Calls with a "clang.arc.attachedcall" bundle are special. They should be
4439     // expanded to the call, directly followed by a special marker sequence and
4440     // a call to an ObjC library function. Use the CALL_RVMARKER to do that.
4441     assert(!isTailCall &&
4442            "tail calls cannot be marked with clang.arc.attachedcall");
4443     assert(Is64Bit && "clang.arc.attachedcall is only supported in 64bit mode");
4444 
4445     // Add a target constant to select the ObjC runtime call just before the
4446     // call target. RuntimeCallType == 0 selects objc_retainAutoreleasedReturnValue,
4447     // RuntimeCallType == 1 selects objc_unsafeClaimAutoreleasedReturnValue when
4448     // expanding the pseudo.
4449     unsigned RuntimeCallType =
4450         objcarc::hasAttachedCallOpBundle(CLI.CB, true) ? 0 : 1;
4451     Ops.insert(Ops.begin() + 1,
4452                DAG.getTargetConstant(RuntimeCallType, dl, MVT::i32));
4453     Chain = DAG.getNode(X86ISD::CALL_RVMARKER, dl, NodeTys, Ops);
4454   } else {
4455     Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
4456   }
4457 
4458   InFlag = Chain.getValue(1);
4459   DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
4460   DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
4461 
4462   // Save heapallocsite metadata.
4463   if (CLI.CB)
4464     if (MDNode *HeapAlloc = CLI.CB->getMetadata("heapallocsite"))
4465       DAG.addHeapAllocSite(Chain.getNode(), HeapAlloc);
4466 
4467   // Create the CALLSEQ_END node.
4468   unsigned NumBytesForCalleeToPop;
4469   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
4470                        DAG.getTarget().Options.GuaranteedTailCallOpt))
4471     NumBytesForCalleeToPop = NumBytes;    // Callee pops everything
4472   else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
4473            !Subtarget.getTargetTriple().isOSMSVCRT() &&
4474            SR == StackStructReturn)
4475     // If this is a call to a struct-return function, the callee
4476     // pops the hidden struct pointer, so we have to push it back.
4477     // This is common for Darwin/X86, Linux & Mingw32 targets.
4478     // For MSVC Win32 targets, the caller pops the hidden struct pointer.
4479     NumBytesForCalleeToPop = 4;
4480   else
4481     NumBytesForCalleeToPop = 0;  // Callee pops nothing.
4482 
4483   // Returns a flag for retval copy to use.
4484   if (!IsSibcall) {
4485     Chain = DAG.getCALLSEQ_END(Chain,
4486                                DAG.getIntPtrConstant(NumBytesToPop, dl, true),
4487                                DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
4488                                                      true),
4489                                InFlag, dl);
4490     InFlag = Chain.getValue(1);
4491   }
4492 
4493   // Handle result values, copying them out of physregs into vregs that we
4494   // return.
4495   return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
4496                          InVals, RegMask);
4497 }
4498 
4499 //===----------------------------------------------------------------------===//
4500 //                Fast Calling Convention (tail call) implementation
4501 //===----------------------------------------------------------------------===//
4502 
4503 //  Like stdcall, the callee cleans up the arguments, except that ECX is
4504 //  reserved for storing the address of the tail-called function. Only 2
4505 //  registers are free for argument passing (inreg). Tail call optimization is
4506 //  performed provided:
4507 //                * tailcallopt is enabled
4508 //                * caller/callee are fastcc
4509 //  On the X86_64 architecture with GOT-style position-independent code, only
4510 //  local (within-module) calls are supported at the moment.
4511 //  To keep the stack aligned according to the platform ABI, the function
4512 //  GetAlignedArgumentStackSize ensures that the argument delta is always a
4513 //  multiple of the stack alignment. (Dynamic linkers need this - Darwin's dyld for example.)
4514 //  If a tail-called function (callee) has more arguments than the caller, the
4515 //  caller needs to make sure that there is room to move the RETADDR to. This is
4516 //  achieved by reserving an area the size of the argument delta right after the
4517 //  original RETADDR, but before the saved framepointer or the spilled registers
4518 //  e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
4519 //  stack layout:
4520 //    arg1
4521 //    arg2
4522 //    RETADDR
4523 //    [ new RETADDR
4524 //      move area ]
4525 //    (possible EBP)
4526 //    ESI
4527 //    EDI
4528 //    local1 ..
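//  For illustration (a worked example under the tailcallopt + fastcc
//  assumption, not taken from a real test case): if the caller was entered
//  with 8 bytes of stack arguments and the tail callee needs 24 bytes, then
//  FPDiff = 8 - 24 = -16, so a 16-byte "move area" is reserved and the return
//  address is re-stored 16 bytes lower (see EmitTailCallStoreRetAddr above)
//  before the outgoing arguments are written to their final slots.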
4529 
4530 /// Align the stack size, e.g. to 16n + 12, to satisfy a 16-byte alignment
4531 /// requirement.
4532 unsigned
4533 X86TargetLowering::GetAlignedArgumentStackSize(const unsigned StackSize,
4534                                                SelectionDAG &DAG) const {
4535   const Align StackAlignment = Subtarget.getFrameLowering()->getStackAlign();
4536   const uint64_t SlotSize = Subtarget.getRegisterInfo()->getSlotSize();
4537   assert(StackSize % SlotSize == 0 &&
4538          "StackSize must be a multiple of SlotSize");
4539   return alignTo(StackSize + SlotSize, StackAlignment) - SlotSize;
4540 }
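// For illustration (a worked example under the 32-bit assumption of
// SlotSize == 4 and StackAlignment == 16): StackSize == 16 yields
// alignTo(16 + 4, 16) - 4 == 32 - 4 == 28, i.e. the "16n + 12" shape named
// above, so the argument area plus the 4-byte return address slot stays a
// multiple of the 16-byte stack alignment.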
4541 
4542 /// Return true if the given stack call argument is already available in the
4543 /// same position (relatively) of the caller's incoming argument stack.
4544 static
4545 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
4546                          MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
4547                          const X86InstrInfo *TII, const CCValAssign &VA) {
4548   unsigned Bytes = Arg.getValueSizeInBits() / 8;
4549 
4550   for (;;) {
4551     // Look through nodes that don't alter the bits of the incoming value.
4552     unsigned Op = Arg.getOpcode();
4553     if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
4554       Arg = Arg.getOperand(0);
4555       continue;
4556     }
4557     if (Op == ISD::TRUNCATE) {
4558       const SDValue &TruncInput = Arg.getOperand(0);
4559       if (TruncInput.getOpcode() == ISD::AssertZext &&
4560           cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
4561               Arg.getValueType()) {
4562         Arg = TruncInput.getOperand(0);
4563         continue;
4564       }
4565     }
4566     break;
4567   }
4568 
4569   int FI = INT_MAX;
4570   if (Arg.getOpcode() == ISD::CopyFromReg) {
4571     Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
4572     if (!VR.isVirtual())
4573       return false;
4574     MachineInstr *Def = MRI->getVRegDef(VR);
4575     if (!Def)
4576       return false;
4577     if (!Flags.isByVal()) {
4578       if (!TII->isLoadFromStackSlot(*Def, FI))
4579         return false;
4580     } else {
4581       unsigned Opcode = Def->getOpcode();
4582       if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
4583            Opcode == X86::LEA64_32r) &&
4584           Def->getOperand(1).isFI()) {
4585         FI = Def->getOperand(1).getIndex();
4586         Bytes = Flags.getByValSize();
4587       } else
4588         return false;
4589     }
4590   } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
4591     if (Flags.isByVal())
4592       // ByVal argument is passed in as a pointer but it's now being
4593       // dereferenced. e.g.
4594       // define @foo(%struct.X* %A) {
4595       //   tail call @bar(%struct.X* byval %A)
4596       // }
4597       return false;
4598     SDValue Ptr = Ld->getBasePtr();
4599     FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
4600     if (!FINode)
4601       return false;
4602     FI = FINode->getIndex();
4603   } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
4604     FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
4605     FI = FINode->getIndex();
4606     Bytes = Flags.getByValSize();
4607   } else
4608     return false;
4609 
4610   assert(FI != INT_MAX);
4611   if (!MFI.isFixedObjectIndex(FI))
4612     return false;
4613 
4614   if (Offset != MFI.getObjectOffset(FI))
4615     return false;
4616 
4617   // If this is not byval, check that the argument stack object is immutable.
4618   // inalloca and argument copy elision can create mutable argument stack
4619   // objects. Byval objects can be mutated, but a byval call intends to pass the
4620   // mutated memory.
4621   if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
4622     return false;
4623 
4624   if (VA.getLocVT().getFixedSizeInBits() >
4625       Arg.getValueSizeInBits().getFixedSize()) {
4626     // If the argument location is wider than the argument type, check that any
4627     // extension flags match.
4628     if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
4629         Flags.isSExt() != MFI.isObjectSExt(FI)) {
4630       return false;
4631     }
4632   }
4633 
4634   return Bytes == MFI.getObjectSize(FI);
4635 }
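// For illustration (hypothetical IR, assuming caller and callee share the
// same calling convention and stack layout): in
//   define i32 @caller(i32 %a, i32 %b) {
//     %r = tail call i32 @bar(i32 %a, i32 %b)
//     ret i32 %r
//   }
// any stack-passed %a/%b is simply reloaded from the caller's own fixed
// incoming-argument slot at the same offset, so MatchingStackOffset returns
// true and the argument does not need to be stored again for the tail call.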
4636 
4637 /// Check whether the call is eligible for tail call optimization. Targets
4638 /// that want to do tail call optimization should implement this function.
4639 bool X86TargetLowering::IsEligibleForTailCallOptimization(
4640     SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
4641     bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,
4642     const SmallVectorImpl<ISD::OutputArg> &Outs,
4643     const SmallVectorImpl<SDValue> &OutVals,
4644     const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
4645   if (!mayTailCallThisCC(CalleeCC))
4646     return false;
4647 
4648   // If -tailcallopt is specified, make fastcc functions tail-callable.
4649   MachineFunction &MF = DAG.getMachineFunction();
4650   const Function &CallerF = MF.getFunction();
4651 
4652   // If the function return type is x86_fp80 and the callee return type is not,
4653   // then the FP_EXTEND of the call result is not a nop. It's not safe to
4654   // perform a tailcall optimization here.
4655   if (CallerF.getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
4656     return false;
4657 
4658   CallingConv::ID CallerCC = CallerF.getCallingConv();
4659   bool CCMatch = CallerCC == CalleeCC;
4660   bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
4661   bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
4662   bool IsGuaranteeTCO = DAG.getTarget().Options.GuaranteedTailCallOpt ||
4663       CalleeCC == CallingConv::Tail || CalleeCC == CallingConv::SwiftTail;
4664 
4665   // Win64 functions have extra shadow space for argument homing. Don't do the
4666   // sibcall if the caller and callee have mismatched expectations for this
4667   // space.
4668   if (IsCalleeWin64 != IsCallerWin64)
4669     return false;
4670 
4671   if (IsGuaranteeTCO) {
4672     if (canGuaranteeTCO(CalleeCC) && CCMatch)
4673       return true;
4674     return false;
4675   }
4676 
4677   // Look for obvious safe cases to perform tail call optimization that do not
4678   // require ABI changes. This is what gcc calls sibcall.
4679 
4680   // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
4681   // emit a special epilogue.
4682   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4683   if (RegInfo->hasStackRealignment(MF))
4684     return false;
4685 
4686   // Also avoid sibcall optimization if either caller or callee uses struct
4687   // return semantics.
4688   if (isCalleeStructRet || isCallerStructRet)
4689     return false;
4690 
4691   // Do not sibcall optimize vararg calls unless all arguments are passed via
4692   // registers.
4693   LLVMContext &C = *DAG.getContext();
4694   if (isVarArg && !Outs.empty()) {
4695     // Optimizing for varargs on Win64 is unlikely to be safe without
4696     // additional testing.
4697     if (IsCalleeWin64 || IsCallerWin64)
4698       return false;
4699 
4700     SmallVector<CCValAssign, 16> ArgLocs;
4701     CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4702 
4703     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4704     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
4705       if (!ArgLocs[i].isRegLoc())
4706         return false;
4707   }
4708 
4709   // If the call result is in ST0 / ST1, it needs to be popped off the x87
4710   // stack.  Therefore, if it's not used by the call it is not safe to optimize
4711   // this into a sibcall.
4712   bool Unused = false;
4713   for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
4714     if (!Ins[i].Used) {
4715       Unused = true;
4716       break;
4717     }
4718   }
4719   if (Unused) {
4720     SmallVector<CCValAssign, 16> RVLocs;
4721     CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
4722     CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
4723     for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
4724       CCValAssign &VA = RVLocs[i];
4725       if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
4726         return false;
4727     }
4728   }
4729 
4730   // Check that the call results are passed in the same way.
4731   if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
4732                                   RetCC_X86, RetCC_X86))
4733     return false;
4734   // The callee has to preserve all registers the caller needs to preserve.
4735   const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
4736   const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
4737   if (!CCMatch) {
4738     const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
4739     if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4740       return false;
4741   }
4742 
4743   unsigned StackArgsSize = 0;
4744 
4745   // If the callee takes no arguments then go on to check the results of the
4746   // call.
4747   if (!Outs.empty()) {
4748     // Check if stack adjustment is needed. For now, do not do this if any
4749     // argument is passed on the stack.
4750     SmallVector<CCValAssign, 16> ArgLocs;
4751     CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4752 
4753     // Allocate shadow area for Win64
4754     if (IsCalleeWin64)
4755       CCInfo.AllocateStack(32, Align(8));
4756 
4757     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4758     StackArgsSize = CCInfo.getNextStackOffset();
4759 
4760     if (CCInfo.getNextStackOffset()) {
4761       // Check if the arguments are already laid out in the right way as
4762       // the caller's fixed stack objects.
4763       MachineFrameInfo &MFI = MF.getFrameInfo();
4764       const MachineRegisterInfo *MRI = &MF.getRegInfo();
4765       const X86InstrInfo *TII = Subtarget.getInstrInfo();
4766       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4767         CCValAssign &VA = ArgLocs[i];
4768         SDValue Arg = OutVals[i];
4769         ISD::ArgFlagsTy Flags = Outs[i].Flags;
4770         if (VA.getLocInfo() == CCValAssign::Indirect)
4771           return false;
4772         if (!VA.isRegLoc()) {
4773           if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
4774                                    MFI, MRI, TII, VA))
4775             return false;
4776         }
4777       }
4778     }
4779 
4780     bool PositionIndependent = isPositionIndependent();
4781     // If the tailcall address may be in a register, then make sure it's
4782     // possible to register allocate for it. In 32-bit, the call address can
4783     // only target EAX, EDX, or ECX since the tail call must be scheduled after
4784     // callee-saved registers are restored. These happen to be the same
4785     // registers used to pass 'inreg' arguments so watch out for those.
4786     if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
4787                                   !isa<ExternalSymbolSDNode>(Callee)) ||
4788                                  PositionIndependent)) {
4789       unsigned NumInRegs = 0;
4790       // In PIC we need an extra register to formulate the address computation
4791       // for the callee.
4792       unsigned MaxInRegs = PositionIndependent ? 2 : 3;
4793 
4794       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4795         CCValAssign &VA = ArgLocs[i];
4796         if (!VA.isRegLoc())
4797           continue;
4798         Register Reg = VA.getLocReg();
4799         switch (Reg) {
4800         default: break;
4801         case X86::EAX: case X86::EDX: case X86::ECX:
4802           if (++NumInRegs == MaxInRegs)
4803             return false;
4804           break;
4805         }
4806       }
4807     }
4808 
4809     const MachineRegisterInfo &MRI = MF.getRegInfo();
4810     if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
4811       return false;
4812   }
4813 
4814   bool CalleeWillPop =
4815       X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
4816                        MF.getTarget().Options.GuaranteedTailCallOpt);
4817 
4818   if (unsigned BytesToPop =
4819           MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
4820     // If we have bytes to pop, the callee must pop them.
4821     bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
4822     if (!CalleePopMatches)
4823       return false;
4824   } else if (CalleeWillPop && StackArgsSize > 0) {
4825     // If we don't have bytes to pop, make sure the callee doesn't pop any.
4826     return false;
4827   }
4828 
4829   return true;
4830 }
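// For illustration (hypothetical IR): with the checks above, a plain
//   %r = tail call i32 @f(i32 %x)
// in a caller with a matching ABI and no stack-passed argument conflicts is a
// sibcall candidate, whereas a call where either side returns through an sret
// pointer, or whose caller needs dynamic stack realignment, is rejected.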
4831 
4832 FastISel *
4833 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
4834                                   const TargetLibraryInfo *libInfo) const {
4835   return X86::createFastISel(funcInfo, libInfo);
4836 }
4837 
4838 //===----------------------------------------------------------------------===//
4839 //                           Other Lowering Hooks
4840 //===----------------------------------------------------------------------===//
4841 
4842 static bool MayFoldLoad(SDValue Op) {
4843   return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
4844 }
4845 
4846 static bool MayFoldIntoStore(SDValue Op) {
4847   return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
4848 }
4849 
4850 static bool MayFoldIntoZeroExtend(SDValue Op) {
4851   if (Op.hasOneUse()) {
4852     unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
4853     return (ISD::ZERO_EXTEND == Opcode);
4854   }
4855   return false;
4856 }
4857 
4858 static bool isTargetShuffle(unsigned Opcode) {
4859   switch(Opcode) {
4860   default: return false;
4861   case X86ISD::BLENDI:
4862   case X86ISD::PSHUFB:
4863   case X86ISD::PSHUFD:
4864   case X86ISD::PSHUFHW:
4865   case X86ISD::PSHUFLW:
4866   case X86ISD::SHUFP:
4867   case X86ISD::INSERTPS:
4868   case X86ISD::EXTRQI:
4869   case X86ISD::INSERTQI:
4870   case X86ISD::VALIGN:
4871   case X86ISD::PALIGNR:
4872   case X86ISD::VSHLDQ:
4873   case X86ISD::VSRLDQ:
4874   case X86ISD::MOVLHPS:
4875   case X86ISD::MOVHLPS:
4876   case X86ISD::MOVSHDUP:
4877   case X86ISD::MOVSLDUP:
4878   case X86ISD::MOVDDUP:
4879   case X86ISD::MOVSS:
4880   case X86ISD::MOVSD:
4881   case X86ISD::UNPCKL:
4882   case X86ISD::UNPCKH:
4883   case X86ISD::VBROADCAST:
4884   case X86ISD::VPERMILPI:
4885   case X86ISD::VPERMILPV:
4886   case X86ISD::VPERM2X128:
4887   case X86ISD::SHUF128:
4888   case X86ISD::VPERMIL2:
4889   case X86ISD::VPERMI:
4890   case X86ISD::VPPERM:
4891   case X86ISD::VPERMV:
4892   case X86ISD::VPERMV3:
4893   case X86ISD::VZEXT_MOVL:
4894     return true;
4895   }
4896 }
4897 
4898 static bool isTargetShuffleVariableMask(unsigned Opcode) {
4899   switch (Opcode) {
4900   default: return false;
4901   // Target Shuffles.
4902   case X86ISD::PSHUFB:
4903   case X86ISD::VPERMILPV:
4904   case X86ISD::VPERMIL2:
4905   case X86ISD::VPPERM:
4906   case X86ISD::VPERMV:
4907   case X86ISD::VPERMV3:
4908     return true;
4909   // 'Faux' Target Shuffles.
4910   case ISD::OR:
4911   case ISD::AND:
4912   case X86ISD::ANDNP:
4913     return true;
4914   }
4915 }
4916 
4917 static bool isTargetShuffleSplat(SDValue Op) {
4918   unsigned Opcode = Op.getOpcode();
4919   if (Opcode == ISD::EXTRACT_SUBVECTOR)
4920     return isTargetShuffleSplat(Op.getOperand(0));
4921   return Opcode == X86ISD::VBROADCAST || Opcode == X86ISD::VBROADCAST_LOAD;
4922 }
4923 
4924 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
4925   MachineFunction &MF = DAG.getMachineFunction();
4926   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4927   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
4928   int ReturnAddrIndex = FuncInfo->getRAIndex();
4929 
4930   if (ReturnAddrIndex == 0) {
4931     // Set up a frame object for the return address.
4932     unsigned SlotSize = RegInfo->getSlotSize();
4933     ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
4934                                                           -(int64_t)SlotSize,
4935                                                           false);
4936     FuncInfo->setRAIndex(ReturnAddrIndex);
4937   }
4938 
4939   return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
4940 }
4941 
4942 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
4943                                        bool hasSymbolicDisplacement) {
4944   // Offset should fit into 32 bit immediate field.
4945   if (!isInt<32>(Offset))
4946     return false;
4947 
4948   // If we don't have a symbolic displacement - we don't have any extra
4949   // restrictions.
4950   if (!hasSymbolicDisplacement)
4951     return true;
4952 
4953   // FIXME: Some tweaks might be needed for medium code model.
4954   if (M != CodeModel::Small && M != CodeModel::Kernel)
4955     return false;
4956 
4957   // For the small code model we assume that the last object is 16MB below the
4958   // end of the 31-bit boundary. We may also accept pretty large negative
4959   // constants, knowing that all objects are in the positive half of the address space.
4960   if (M == CodeModel::Small && Offset < 16*1024*1024)
4961     return true;
4962 
4963   // For the kernel code model we know that all objects reside in the negative
4964   // half of the 32-bit address space. We may not accept negative offsets, since
4965   // they may be just off, and we may accept pretty large positive ones.
4966   if (M == CodeModel::Kernel && Offset >= 0)
4967     return true;
4968 
4969   return false;
4970 }
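// For illustration: under the small code model any offset below 16MB
// (including large negative ones) is accepted, while e.g. a 0x7F000000
// displacement is not; under the kernel code model non-negative 32-bit
// offsets are accepted and negative ones are rejected.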
4971 
4972 /// Determines whether the callee is required to pop its own arguments.
4973 /// Callee pop is necessary to support tail calls.
4974 bool X86::isCalleePop(CallingConv::ID CallingConv,
4975                       bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
4976   // If GuaranteeTCO is true, we force some calls to be callee pop so that we
4977   // can guarantee TCO.
4978   if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
4979     return true;
4980 
4981   switch (CallingConv) {
4982   default:
4983     return false;
4984   case CallingConv::X86_StdCall:
4985   case CallingConv::X86_FastCall:
4986   case CallingConv::X86_ThisCall:
4987   case CallingConv::X86_VectorCall:
4988     return !is64Bit;
4989   }
4990 }
4991 
4992 /// Return true if the condition is a signed comparison operation.
4993 static bool isX86CCSigned(unsigned X86CC) {
4994   switch (X86CC) {
4995   default:
4996     llvm_unreachable("Invalid integer condition!");
4997   case X86::COND_E:
4998   case X86::COND_NE:
4999   case X86::COND_B:
5000   case X86::COND_A:
5001   case X86::COND_BE:
5002   case X86::COND_AE:
5003     return false;
5004   case X86::COND_G:
5005   case X86::COND_GE:
5006   case X86::COND_L:
5007   case X86::COND_LE:
5008     return true;
5009   }
5010 }
5011 
5012 static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
5013   switch (SetCCOpcode) {
5014   default: llvm_unreachable("Invalid integer condition!");
5015   case ISD::SETEQ:  return X86::COND_E;
5016   case ISD::SETGT:  return X86::COND_G;
5017   case ISD::SETGE:  return X86::COND_GE;
5018   case ISD::SETLT:  return X86::COND_L;
5019   case ISD::SETLE:  return X86::COND_LE;
5020   case ISD::SETNE:  return X86::COND_NE;
5021   case ISD::SETULT: return X86::COND_B;
5022   case ISD::SETUGT: return X86::COND_A;
5023   case ISD::SETULE: return X86::COND_BE;
5024   case ISD::SETUGE: return X86::COND_AE;
5025   }
5026 }
5027 
5028 /// Do a one-to-one translation of an ISD::CondCode to the X86-specific
5029 /// condition code, returning the condition code and the LHS/RHS of the
5030 /// comparison to make.
5031 static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
5032                                bool isFP, SDValue &LHS, SDValue &RHS,
5033                                SelectionDAG &DAG) {
5034   if (!isFP) {
5035     if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
5036       if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
5037         // X > -1   -> X == 0, jump !sign.
5038         RHS = DAG.getConstant(0, DL, RHS.getValueType());
5039         return X86::COND_NS;
5040       }
5041       if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
5042         // X < 0   -> X == 0, jump on sign.
5043         return X86::COND_S;
5044       }
5045       if (SetCCOpcode == ISD::SETGE && RHSC->isNullValue()) {
5046         // X >= 0   -> X == 0, jump on !sign.
5047         return X86::COND_NS;
5048       }
5049       if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
5050         // X < 1   -> X <= 0
5051         RHS = DAG.getConstant(0, DL, RHS.getValueType());
5052         return X86::COND_LE;
5053       }
5054     }
5055 
5056     return TranslateIntegerX86CC(SetCCOpcode);
5057   }
5058 
5059   // First determine if it is required or is profitable to flip the operands.
5060 
5061   // If LHS is a foldable load, but RHS is not, flip the condition.
5062   if (ISD::isNON_EXTLoad(LHS.getNode()) &&
5063       !ISD::isNON_EXTLoad(RHS.getNode())) {
5064     SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
5065     std::swap(LHS, RHS);
5066   }
5067 
5068   switch (SetCCOpcode) {
5069   default: break;
5070   case ISD::SETOLT:
5071   case ISD::SETOLE:
5072   case ISD::SETUGT:
5073   case ISD::SETUGE:
5074     std::swap(LHS, RHS);
5075     break;
5076   }
5077 
5078   // On a floating point condition, the flags are set as follows:
5079   // ZF  PF  CF   op
5080   //  0 | 0 | 0 | X > Y
5081   //  0 | 0 | 1 | X < Y
5082   //  1 | 0 | 0 | X == Y
5083   //  1 | 1 | 1 | unordered
5084   switch (SetCCOpcode) {
5085   default: llvm_unreachable("Condcode should be pre-legalized away");
5086   case ISD::SETUEQ:
5087   case ISD::SETEQ:   return X86::COND_E;
5088   case ISD::SETOLT:              // flipped
5089   case ISD::SETOGT:
5090   case ISD::SETGT:   return X86::COND_A;
5091   case ISD::SETOLE:              // flipped
5092   case ISD::SETOGE:
5093   case ISD::SETGE:   return X86::COND_AE;
5094   case ISD::SETUGT:              // flipped
5095   case ISD::SETULT:
5096   case ISD::SETLT:   return X86::COND_B;
5097   case ISD::SETUGE:              // flipped
5098   case ISD::SETULE:
5099   case ISD::SETLE:   return X86::COND_BE;
5100   case ISD::SETONE:
5101   case ISD::SETNE:   return X86::COND_NE;
5102   case ISD::SETUO:   return X86::COND_P;
5103   case ISD::SETO:    return X86::COND_NP;
5104   case ISD::SETOEQ:
5105   case ISD::SETUNE:  return X86::COND_INVALID;
5106   }
5107 }
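// For illustration: an ordered 'x < y' compare (ISD::SETOLT) is in the swap
// list above, so the operands are exchanged and COND_A is used - the emitted
// code compares (y, x) and tests the unsigned 'above' condition (e.g.
// ucomiss + seta), which is correctly false for unordered inputs because
// unordered sets ZF, PF and CF.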
5108 
5109 /// Is there a floating point cmov for the specific X86 condition code?
5110 /// The current x86 ISA includes the following FP cmov instructions:
5111 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
5112 static bool hasFPCMov(unsigned X86CC) {
5113   switch (X86CC) {
5114   default:
5115     return false;
5116   case X86::COND_B:
5117   case X86::COND_BE:
5118   case X86::COND_E:
5119   case X86::COND_P:
5120   case X86::COND_A:
5121   case X86::COND_AE:
5122   case X86::COND_NE:
5123   case X86::COND_NP:
5124     return true;
5125   }
5126 }
5127 
5128 
5129 bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
5130                                            const CallInst &I,
5131                                            MachineFunction &MF,
5132                                            unsigned Intrinsic) const {
5133   Info.flags = MachineMemOperand::MONone;
5134   Info.offset = 0;
5135 
5136   const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
5137   if (!IntrData) {
5138     switch (Intrinsic) {
5139     case Intrinsic::x86_aesenc128kl:
5140     case Intrinsic::x86_aesdec128kl:
5141       Info.opc = ISD::INTRINSIC_W_CHAIN;
5142       Info.ptrVal = I.getArgOperand(1);
5143       Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
5144       Info.align = Align(1);
5145       Info.flags |= MachineMemOperand::MOLoad;
5146       return true;
5147     case Intrinsic::x86_aesenc256kl:
5148     case Intrinsic::x86_aesdec256kl:
5149       Info.opc = ISD::INTRINSIC_W_CHAIN;
5150       Info.ptrVal = I.getArgOperand(1);
5151       Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
5152       Info.align = Align(1);
5153       Info.flags |= MachineMemOperand::MOLoad;
5154       return true;
5155     case Intrinsic::x86_aesencwide128kl:
5156     case Intrinsic::x86_aesdecwide128kl:
5157       Info.opc = ISD::INTRINSIC_W_CHAIN;
5158       Info.ptrVal = I.getArgOperand(0);
5159       Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
5160       Info.align = Align(1);
5161       Info.flags |= MachineMemOperand::MOLoad;
5162       return true;
5163     case Intrinsic::x86_aesencwide256kl:
5164     case Intrinsic::x86_aesdecwide256kl:
5165       Info.opc = ISD::INTRINSIC_W_CHAIN;
5166       Info.ptrVal = I.getArgOperand(0);
5167       Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
5168       Info.align = Align(1);
5169       Info.flags |= MachineMemOperand::MOLoad;
5170       return true;
5171     }
5172     return false;
5173   }
5174 
5175   switch (IntrData->Type) {
5176   case TRUNCATE_TO_MEM_VI8:
5177   case TRUNCATE_TO_MEM_VI16:
5178   case TRUNCATE_TO_MEM_VI32: {
5179     Info.opc = ISD::INTRINSIC_VOID;
5180     Info.ptrVal = I.getArgOperand(0);
5181     MVT VT  = MVT::getVT(I.getArgOperand(1)->getType());
5182     MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
5183     if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
5184       ScalarVT = MVT::i8;
5185     else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
5186       ScalarVT = MVT::i16;
5187     else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
5188       ScalarVT = MVT::i32;
5189 
5190     Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
5191     Info.align = Align(1);
5192     Info.flags |= MachineMemOperand::MOStore;
5193     break;
5194   }
5195   case GATHER:
5196   case GATHER_AVX2: {
5197     Info.opc = ISD::INTRINSIC_W_CHAIN;
5198     Info.ptrVal = nullptr;
5199     MVT DataVT = MVT::getVT(I.getType());
5200     MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
5201     unsigned NumElts = std::min(DataVT.getVectorNumElements(),
5202                                 IndexVT.getVectorNumElements());
5203     Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
5204     Info.align = Align(1);
5205     Info.flags |= MachineMemOperand::MOLoad;
5206     break;
5207   }
5208   case SCATTER: {
5209     Info.opc = ISD::INTRINSIC_VOID;
5210     Info.ptrVal = nullptr;
5211     MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
5212     MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
5213     unsigned NumElts = std::min(DataVT.getVectorNumElements(),
5214                                 IndexVT.getVectorNumElements());
5215     Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
5216     Info.align = Align(1);
5217     Info.flags |= MachineMemOperand::MOStore;
5218     break;
5219   }
5220   default:
5221     return false;
5222   }
5223 
5224   return true;
5225 }
5226 
5227 /// Returns true if the target can instruction select the
5228 /// specified FP immediate natively. If false, the legalizer will
5229 /// materialize the FP immediate as a load from a constant pool.
5230 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
5231                                      bool ForCodeSize) const {
5232   for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
5233     if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
5234       return true;
5235   }
5236   return false;
5237 }
5238 
5239 bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
5240                                               ISD::LoadExtType ExtTy,
5241                                               EVT NewVT) const {
5242   assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
5243 
5244   // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
5245   // relocations must target a movq or addq instruction: don't let the load shrink.
5246   SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
5247   if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
5248     if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
5249       return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
5250 
5251   // If this is an (1) AVX vector load with (2) multiple uses and (3) all of
5252   // those uses are extracted directly into a store, then the extract + store
5253   // can be store-folded. Therefore, it's probably not worth splitting the load.
5254   EVT VT = Load->getValueType(0);
5255   if ((VT.is256BitVector() || VT.is512BitVector()) && !Load->hasOneUse()) {
5256     for (auto UI = Load->use_begin(), UE = Load->use_end(); UI != UE; ++UI) {
5257       // Skip uses of the chain value. Result 0 of the node is the load value.
5258       if (UI.getUse().getResNo() != 0)
5259         continue;
5260 
5261       // If this use is not an extract + store, it's probably worth splitting.
5262       if (UI->getOpcode() != ISD::EXTRACT_SUBVECTOR || !UI->hasOneUse() ||
5263           UI->use_begin()->getOpcode() != ISD::STORE)
5264         return true;
5265     }
5266     // All non-chain uses are extract + store.
5267     return false;
5268   }
5269 
5270   return true;
5271 }
5272 
5273 /// Returns true if it is beneficial to convert a load of a constant
5274 /// to just the constant itself.
5275 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
5276                                                           Type *Ty) const {
5277   assert(Ty->isIntegerTy());
5278 
5279   unsigned BitSize = Ty->getPrimitiveSizeInBits();
5280   if (BitSize == 0 || BitSize > 64)
5281     return false;
5282   return true;
5283 }
5284 
5285 bool X86TargetLowering::reduceSelectOfFPConstantLoads(EVT CmpOpVT) const {
5286   // If we are using XMM registers in the ABI and the condition of the select is
5287   // a floating-point compare and we have blendv or conditional move, then it is
5288   // cheaper to select instead of doing a cross-register move and creating a
5289   // load that depends on the compare result.
5290   bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128;
5291   return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
5292 }
5293 
5294 bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
5295   // TODO: It might be a win to ease or lift this restriction, but the generic
5296   // folds in DAGCombiner conflict with vector folds for an AVX512 target.
5297   if (VT.isVector() && Subtarget.hasAVX512())
5298     return false;
5299 
5300   return true;
5301 }
5302 
5303 bool X86TargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
5304                                                SDValue C) const {
5305   // TODO: We handle scalars using custom code, but generic combining could make
5306   // that unnecessary.
5307   APInt MulC;
5308   if (!ISD::isConstantSplatVector(C.getNode(), MulC))
5309     return false;
5310 
5311   // Find the type this will be legalized to. Otherwise we might prematurely
5312   // convert this to shl+add/sub and then still have to type legalize those ops.
5313   // Another choice would be to defer the decision for illegal types until
5314   // after type legalization. But constant splat vectors of i64 can't make it
5315   // through type legalization on 32-bit targets so we would need to special
5316   // case vXi64.
5317   while (getTypeAction(Context, VT) != TypeLegal)
5318     VT = getTypeToTransformTo(Context, VT);
5319 
5320   // If vector multiply is legal, assume that's faster than shl + add/sub.
5321   // TODO: Multiply is a complex op with higher latency and lower throughput in
5322   //       most implementations, so this check could be loosened based on type
5323   //       and/or a CPU attribute.
5324   if (isOperationLegal(ISD::MUL, VT))
5325     return false;
5326 
5327   // shl+add, shl+sub, shl+add+neg
5328   return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
5329          (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
5330 }
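// For illustration: a splatted multiply by 17 satisfies
// (MulC - 1).isPowerOf2() and can be decomposed into (X << 4) + X, while a
// multiply by 15 satisfies (MulC + 1).isPowerOf2() and becomes (X << 4) - X.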
5331 
5332 bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
5333                                                 unsigned Index) const {
5334   if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
5335     return false;
5336 
5337   // Mask vectors support all subregister combinations and operations that
5338   // extract half of vector.
5339   if (ResVT.getVectorElementType() == MVT::i1)
5340     return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
5341                           (Index == ResVT.getVectorNumElements()));
5342 
5343   return (Index % ResVT.getVectorNumElements()) == 0;
5344 }
5345 
5346 bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
5347   unsigned Opc = VecOp.getOpcode();
5348 
5349   // Assume target opcodes can't be scalarized.
5350   // TODO - do we have any exceptions?
5351   if (Opc >= ISD::BUILTIN_OP_END)
5352     return false;
5353 
5354   // If the vector op is not supported, try to convert to scalar.
5355   EVT VecVT = VecOp.getValueType();
5356   if (!isOperationLegalOrCustomOrPromote(Opc, VecVT))
5357     return true;
5358 
5359   // If the vector op is supported, but the scalar op is not, the transform may
5360   // not be worthwhile.
5361   EVT ScalarVT = VecVT.getScalarType();
5362   return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
5363 }
5364 
5365 bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT,
5366                                              bool) const {
5367   // TODO: Allow vectors?
5368   if (VT.isVector())
5369     return false;
5370   return VT.isSimple() || !isOperationExpand(Opcode, VT);
5371 }
5372 
5373 bool X86TargetLowering::isCheapToSpeculateCttz() const {
5374   // Speculate cttz only if we can directly use TZCNT.
5375   return Subtarget.hasBMI();
5376 }
5377 
5378 bool X86TargetLowering::isCheapToSpeculateCtlz() const {
5379   // Speculate ctlz only if we can directly use LZCNT.
5380   return Subtarget.hasLZCNT();
5381 }
5382 
5383 bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
5384                                                 const SelectionDAG &DAG,
5385                                                 const MachineMemOperand &MMO) const {
5386   if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
5387       BitcastVT.getVectorElementType() == MVT::i1)
5388     return false;
5389 
5390   if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
5391     return false;
5392 
5393   // If both types are legal vectors, it's always ok to convert them.
5394   if (LoadVT.isVector() && BitcastVT.isVector() &&
5395       isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
5396     return true;
5397 
5398   return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
5399 }
5400 
5401 bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
5402                                          const SelectionDAG &DAG) const {
5403   // Do not merge to float value size (128 bytes) if no implicit
5404   // float attribute is set.
5405   bool NoFloat = DAG.getMachineFunction().getFunction().hasFnAttribute(
5406       Attribute::NoImplicitFloat);
5407 
5408   if (NoFloat) {
5409     unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
5410     return (MemVT.getSizeInBits() <= MaxIntSize);
5411   }
5412   // Make sure we don't merge greater than our preferred vector
5413   // width.
5414   if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
5415     return false;
5416 
5417   return true;
5418 }
5419 
5420 bool X86TargetLowering::isCtlzFast() const {
5421   return Subtarget.hasFastLZCNT();
5422 }
5423 
5424 bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
5425     const Instruction &AndI) const {
5426   return true;
5427 }
5428 
5429 bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
5430   EVT VT = Y.getValueType();
5431 
5432   if (VT.isVector())
5433     return false;
5434 
5435   if (!Subtarget.hasBMI())
5436     return false;
5437 
5438   // There are only 32-bit and 64-bit forms for 'andn'.
5439   if (VT != MVT::i32 && VT != MVT::i64)
5440     return false;
5441 
5442   return !isa<ConstantSDNode>(Y);
5443 }
5444 
5445 bool X86TargetLowering::hasAndNot(SDValue Y) const {
5446   EVT VT = Y.getValueType();
5447 
5448   if (!VT.isVector())
5449     return hasAndNotCompare(Y);
5450 
5451   // Vector.
5452 
5453   if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
5454     return false;
5455 
5456   if (VT == MVT::v4i32)
5457     return true;
5458 
5459   return Subtarget.hasSSE2();
5460 }
5461 
5462 bool X86TargetLowering::hasBitTest(SDValue X, SDValue Y) const {
5463   return X.getValueType().isScalarInteger(); // 'bt'
5464 }
5465 
5466 bool X86TargetLowering::
5467     shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
5468         SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
5469         unsigned OldShiftOpcode, unsigned NewShiftOpcode,
5470         SelectionDAG &DAG) const {
5471   // Does baseline recommend not to perform the fold by default?
5472   if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
5473           X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
5474     return false;
5475   // For scalars this transform is always beneficial.
5476   if (X.getValueType().isScalarInteger())
5477     return true;
5478   // If all the shift amounts are identical, then transform is beneficial even
5479   // with rudimentary SSE2 shifts.
5480   if (DAG.isSplatValue(Y, /*AllowUndefs=*/true))
5481     return true;
5482   // If we have AVX2 with its powerful shift operations, then it's also good.
5483   if (Subtarget.hasAVX2())
5484     return true;
5485   // Pre-AVX2 vector codegen for this pattern is best for variant with 'shl'.
5486   return NewShiftOpcode == ISD::SHL;
5487 }
5488 
5489 bool X86TargetLowering::shouldFoldConstantShiftPairToMask(
5490     const SDNode *N, CombineLevel Level) const {
5491   assert(((N->getOpcode() == ISD::SHL &&
5492            N->getOperand(0).getOpcode() == ISD::SRL) ||
5493           (N->getOpcode() == ISD::SRL &&
5494            N->getOperand(0).getOpcode() == ISD::SHL)) &&
5495          "Expected shift-shift mask");
5496   EVT VT = N->getValueType(0);
5497   if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
5498       (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
5499     // Only fold if the shift values are equal - so it folds to AND.
5500     // TODO - we should fold if either is a non-uniform vector but we don't do
5501     // the fold for non-splats yet.
5502     return N->getOperand(1) == N->getOperand(0).getOperand(1);
5503   }
5504   return TargetLoweringBase::shouldFoldConstantShiftPairToMask(N, Level);
5505 }
5506 
5507 bool X86TargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const {
5508   EVT VT = Y.getValueType();
5509 
5510   // For vectors, we don't have a preference, but we probably want a mask.
5511   if (VT.isVector())
5512     return false;
5513 
5514   // 64-bit shifts on 32-bit targets produce really bad bloated code.
5515   if (VT == MVT::i64 && !Subtarget.is64Bit())
5516     return false;
5517 
5518   return true;
5519 }
5520 
5521 bool X86TargetLowering::shouldExpandShift(SelectionDAG &DAG,
5522                                           SDNode *N) const {
5523   if (DAG.getMachineFunction().getFunction().hasMinSize() &&
5524       !Subtarget.isOSWindows())
5525     return false;
5526   return true;
5527 }
5528 
5529 bool X86TargetLowering::shouldSplatInsEltVarIndex(EVT VT) const {
5530   // Any legal vector type can be splatted more efficiently than
5531   // loading/spilling from memory.
5532   return isTypeLegal(VT);
5533 }
5534 
5535 MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
5536   MVT VT = MVT::getIntegerVT(NumBits);
5537   if (isTypeLegal(VT))
5538     return VT;
5539 
5540   // PMOVMSKB can handle this.
5541   if (NumBits == 128 && isTypeLegal(MVT::v16i8))
5542     return MVT::v16i8;
5543 
5544   // VPMOVMSKB can handle this.
5545   if (NumBits == 256 && isTypeLegal(MVT::v32i8))
5546     return MVT::v32i8;
5547 
5548   // TODO: Allow 64-bit type for 32-bit target.
5549   // TODO: 512-bit types should be allowed, but make sure that those
5550   // cases are handled in combineVectorSizedSetCCEquality().
5551 
5552   return MVT::INVALID_SIMPLE_VALUE_TYPE;
5553 }
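// For illustration (a sketch of why v16i8 is returned for 128 bits, assuming
// SSE2): a 128-bit equality test can be lowered as
//   pcmpeqb %xmm1, %xmm0 ; pmovmskb %xmm0, %eax ; cmpl $0xffff, %eax
// so the whole comparison fits in a handful of instructions.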
5554 
5555 /// Val is the undef sentinel value or equal to the specified value.
5556 static bool isUndefOrEqual(int Val, int CmpVal) {
5557   return ((Val == SM_SentinelUndef) || (Val == CmpVal));
5558 }
5559 
5560 /// Return true if every element in Mask is the undef sentinel value or equal to
5561 /// the specified value.
5562 static bool isUndefOrEqual(ArrayRef<int> Mask, int CmpVal) {
5563   return llvm::all_of(Mask, [CmpVal](int M) {
5564     return (M == SM_SentinelUndef) || (M == CmpVal);
5565   });
5566 }
5567 
5568 /// Val is either the undef or zero sentinel value.
5569 static bool isUndefOrZero(int Val) {
5570   return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
5571 }
5572 
5573 /// Return true if every element in Mask, beginning from position Pos and ending
5574 /// in Pos+Size is the undef sentinel value.
5575 static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
5576   return llvm::all_of(Mask.slice(Pos, Size),
5577                       [](int M) { return M == SM_SentinelUndef; });
5578 }
5579 
5580 /// Return true if the mask creates a vector whose lower half is undefined.
5581 static bool isUndefLowerHalf(ArrayRef<int> Mask) {
5582   unsigned NumElts = Mask.size();
5583   return isUndefInRange(Mask, 0, NumElts / 2);
5584 }
5585 
5586 /// Return true if the mask creates a vector whose upper half is undefined.
5587 static bool isUndefUpperHalf(ArrayRef<int> Mask) {
5588   unsigned NumElts = Mask.size();
5589   return isUndefInRange(Mask, NumElts / 2, NumElts / 2);
5590 }
5591 
5592 /// Return true if Val falls within the specified range [Low, Hi).
5593 static bool isInRange(int Val, int Low, int Hi) {
5594   return (Val >= Low && Val < Hi);
5595 }
5596 
5597 /// Return true if the value of any element in Mask falls within the specified
5598 /// range [Low, Hi).
5599 static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
5600   return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });
5601 }
5602 
5603 /// Return true if the value of any element in Mask is the zero sentinel value.
5604 static bool isAnyZero(ArrayRef<int> Mask) {
5605   return llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
5606 }
5607 
5608 /// Return true if the value of any element in Mask is the zero or undef
5609 /// sentinel values.
5610 static bool isAnyZeroOrUndef(ArrayRef<int> Mask) {
5611   return llvm::any_of(Mask, [](int M) {
5612     return M == SM_SentinelZero || M == SM_SentinelUndef;
5613   });
5614 }
5615 
5616 /// Return true if Val is undef or if its value falls within the
5617 /// specified range [Low, Hi).
5618 static bool isUndefOrInRange(int Val, int Low, int Hi) {
5619   return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
5620 }
5621 
5622 /// Return true if every element in Mask is undef or if its value
5623 /// falls within the specified range [Low, Hi).
5624 static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
5625   return llvm::all_of(
5626       Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); });
5627 }
5628 
5629 /// Return true if Val is undef, zero or if its value falls within the
5630 /// specified range [Low, Hi).
5631 static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
5632   return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
5633 }
5634 
5635 /// Return true if every element in Mask is undef, zero or if its value
5636 /// falls within the specified range [Low, Hi).
5637 static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
5638   return llvm::all_of(
5639       Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); });
5640 }
5641 
5642 /// Return true if every element in Mask, beginning
5643 /// from position Pos and ending in Pos + Size, falls within the specified
5644 /// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
5645 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
5646                                        unsigned Size, int Low, int Step = 1) {
5647   for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
5648     if (!isUndefOrEqual(Mask[i], Low))
5649       return false;
5650   return true;
5651 }
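// For illustration: Mask = {4, -1, 6, 7} with Pos = 0, Size = 4, Low = 4
// returns true (the undef element matches any position in the sequence),
// while Mask = {4, 6, 5, 7} returns false.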
5652 
5653 /// Return true if every element in Mask, beginning
5654 /// from position Pos and ending in Pos+Size, falls within the specified
5655 /// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step), or is undef or zero.
5656 static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
5657                                              unsigned Size, int Low,
5658                                              int Step = 1) {
5659   for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
5660     if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
5661       return false;
5662   return true;
5663 }
5664 
5665 /// Return true if every element in Mask, beginning
5666 /// from position Pos and ending in Pos+Size is undef or is zero.
5667 static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
5668                                  unsigned Size) {
5669   return llvm::all_of(Mask.slice(Pos, Size),
5670                       [](int M) { return isUndefOrZero(M); });
5671 }
5672 
5673 /// Helper function to test whether a shuffle mask could be
5674 /// simplified by widening the elements being shuffled.
5675 ///
5676 /// Appends the mask for wider elements in WidenedMask if valid. Otherwise
5677 /// leaves it in an unspecified state.
5678 ///
5679 /// NOTE: This must handle normal vector shuffle masks and *target* vector
5680 /// shuffle masks. The latter have the special property of a '-2' representing
5681 /// a zero-ed lane of a vector.
5682 static bool canWidenShuffleElements(ArrayRef<int> Mask,
5683                                     SmallVectorImpl<int> &WidenedMask) {
5684   WidenedMask.assign(Mask.size() / 2, 0);
5685   for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
5686     int M0 = Mask[i];
5687     int M1 = Mask[i + 1];
5688 
5689     // If both elements are undef, it's trivial.
5690     if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
5691       WidenedMask[i / 2] = SM_SentinelUndef;
5692       continue;
5693     }
5694 
5695     // Check for an undef mask and a mask value properly aligned to fit with
5696     // a pair of values. If we find such a case, use the non-undef mask's value.
5697     if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
5698       WidenedMask[i / 2] = M1 / 2;
5699       continue;
5700     }
5701     if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
5702       WidenedMask[i / 2] = M0 / 2;
5703       continue;
5704     }
5705 
5706     // When zeroing, we need to spread the zeroing across both lanes to widen.
5707     if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
5708       if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
5709           (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
5710         WidenedMask[i / 2] = SM_SentinelZero;
5711         continue;
5712       }
5713       return false;
5714     }
5715 
5716     // Finally check if the two mask values are adjacent and aligned with
5717     // a pair.
5718     if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
5719       WidenedMask[i / 2] = M0 / 2;
5720       continue;
5721     }
5722 
5723     // Otherwise we can't safely widen the elements used in this shuffle.
5724     return false;
5725   }
5726   assert(WidenedMask.size() == Mask.size() / 2 &&
5727          "Incorrect size of mask after widening the elements!");
5728 
5729   return true;
5730 }
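// For illustration: the 4-element mask {0, 1, 6, 7} widens to the 2-element
// mask {0, 3}, and {-1, 3, SM_SentinelZero, SM_SentinelZero} widens to
// {1, SM_SentinelZero}, but {1, 2, 4, 5} cannot be widened because its first
// pair straddles two wide elements.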
5731 
5732 static bool canWidenShuffleElements(ArrayRef<int> Mask,
5733                                     const APInt &Zeroable,
5734                                     bool V2IsZero,
5735                                     SmallVectorImpl<int> &WidenedMask) {
5736   // Create an alternative mask with info about zeroable elements.
5737   // Here we do not set undef elements as zeroable.
5738   SmallVector<int, 64> ZeroableMask(Mask.begin(), Mask.end());
5739   if (V2IsZero) {
5740     assert(!Zeroable.isNullValue() && "V2's non-undef elements are used?!");
5741     for (int i = 0, Size = Mask.size(); i != Size; ++i)
5742       if (Mask[i] != SM_SentinelUndef && Zeroable[i])
5743         ZeroableMask[i] = SM_SentinelZero;
5744   }
5745   return canWidenShuffleElements(ZeroableMask, WidenedMask);
5746 }
5747 
5748 static bool canWidenShuffleElements(ArrayRef<int> Mask) {
5749   SmallVector<int, 32> WidenedMask;
5750   return canWidenShuffleElements(Mask, WidenedMask);
5751 }
5752 
5753 // Attempt to narrow/widen shuffle mask until it matches the target number of
5754 // elements.
5755 static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts,
5756                                  SmallVectorImpl<int> &ScaledMask) {
5757   unsigned NumSrcElts = Mask.size();
5758   assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
5759          "Illegal shuffle scale factor");
5760 
5761   // Narrowing is guaranteed to work.
5762   if (NumDstElts >= NumSrcElts) {
5763     int Scale = NumDstElts / NumSrcElts;
5764     llvm::narrowShuffleMaskElts(Scale, Mask, ScaledMask);
5765     return true;
5766   }
5767 
5768   // We have to repeat the widening until we reach the target size, but we can
5769   // split out the first widening as it sets up ScaledMask for us.
5770   if (canWidenShuffleElements(Mask, ScaledMask)) {
5771     while (ScaledMask.size() > NumDstElts) {
5772       SmallVector<int, 16> WidenedMask;
5773       if (!canWidenShuffleElements(ScaledMask, WidenedMask))
5774         return false;
5775       ScaledMask = std::move(WidenedMask);
5776     }
5777     return true;
5778   }
5779 
5780   return false;
5781 }
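// For illustration: scaling the 2-element mask {1, 0} up to 4 elements gives
// {2, 3, 0, 1}, and scaling the 4-element mask {0, 1, 6, 7} down to 2
// elements gives {0, 3}; a mask such as {1, 2, 4, 5} cannot be scaled down,
// so the function returns false for it.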
5782 
5783 /// Returns true if Elt is a constant zero or a floating point constant +0.0.
5784 bool X86::isZeroNode(SDValue Elt) {
5785   return isNullConstant(Elt) || isNullFPConstant(Elt);
5786 }
5787 
5788 // Build a vector of constants.
5789 // Use an UNDEF node if MaskElt == -1.
5790 // Split 64-bit constants in 32-bit mode.
5791 static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
5792                               const SDLoc &dl, bool IsMask = false) {
5793 
5794   SmallVector<SDValue, 32>  Ops;
5795   bool Split = false;
5796 
5797   MVT ConstVecVT = VT;
5798   unsigned NumElts = VT.getVectorNumElements();
5799   bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
5800   if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
5801     ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
5802     Split = true;
5803   }
5804 
5805   MVT EltVT = ConstVecVT.getVectorElementType();
5806   for (unsigned i = 0; i < NumElts; ++i) {
5807     bool IsUndef = Values[i] < 0 && IsMask;
5808     SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
5809       DAG.getConstant(Values[i], dl, EltVT);
5810     Ops.push_back(OpNode);
5811     if (Split)
5812       Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
5813                     DAG.getConstant(0, dl, EltVT));
5814   }
5815   SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
5816   if (Split)
5817     ConstsNode = DAG.getBitcast(VT, ConstsNode);
5818   return ConstsNode;
5819 }
5820 
5821 static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
5822                               MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
5823   assert(Bits.size() == Undefs.getBitWidth() &&
5824          "Unequal constant and undef arrays");
5825   SmallVector<SDValue, 32> Ops;
5826   bool Split = false;
5827 
5828   MVT ConstVecVT = VT;
5829   unsigned NumElts = VT.getVectorNumElements();
5830   bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
5831   if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
5832     ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
5833     Split = true;
5834   }
5835 
5836   MVT EltVT = ConstVecVT.getVectorElementType();
5837   for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
5838     if (Undefs[i]) {
5839       Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
5840       continue;
5841     }
5842     const APInt &V = Bits[i];
5843     assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
5844     if (Split) {
5845       Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
5846       Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
5847     } else if (EltVT == MVT::f32) {
5848       APFloat FV(APFloat::IEEEsingle(), V);
5849       Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
5850     } else if (EltVT == MVT::f64) {
5851       APFloat FV(APFloat::IEEEdouble(), V);
5852       Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
5853     } else {
5854       Ops.push_back(DAG.getConstant(V, dl, EltVT));
5855     }
5856   }
5857 
5858   SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
5859   return DAG.getBitcast(VT, ConstsNode);
5860 }
5861 
5862 /// Returns a vector of specified type with all zero elements.
5863 static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
5864                              SelectionDAG &DAG, const SDLoc &dl) {
5865   assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
5866           VT.getVectorElementType() == MVT::i1) &&
5867          "Unexpected vector type");
5868 
5869   // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
5870   // type. This ensures they get CSE'd. But if the integer type is not
5871   // available, use a floating-point +0.0 instead.
5872   SDValue Vec;
5873   if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
5874     Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
5875   } else if (VT.isFloatingPoint()) {
5876     Vec = DAG.getConstantFP(+0.0, dl, VT);
5877   } else if (VT.getVectorElementType() == MVT::i1) {
5878     assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
5879            "Unexpected vector type");
5880     Vec = DAG.getConstant(0, dl, VT);
5881   } else {
5882     unsigned Num32BitElts = VT.getSizeInBits() / 32;
5883     Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
5884   }
5885   return DAG.getBitcast(VT, Vec);
5886 }
5887 
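// Extract a vectorWidth-bit subvector from Vec, starting at the aligned chunk
// that contains element IdxVal.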
5888 static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
5889                                 const SDLoc &dl, unsigned vectorWidth) {
5890   EVT VT = Vec.getValueType();
5891   EVT ElVT = VT.getVectorElementType();
5892   unsigned Factor = VT.getSizeInBits() / vectorWidth;
5893   EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
5894                                   VT.getVectorNumElements() / Factor);
5895 
5896   // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR.
5897   unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
5898   assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
5899 
5900   // This is the index of the first element of the vectorWidth-bit chunk
5901   // we want. Since ElemsPerChunk is a power of 2, we just need to clear the
5901   // low bits.
5902   IdxVal &= ~(ElemsPerChunk - 1);
5903 
5904   // If the input is a buildvector just emit a smaller one.
5905   if (Vec.getOpcode() == ISD::BUILD_VECTOR)
5906     return DAG.getBuildVector(ResultVT, dl,
5907                               Vec->ops().slice(IdxVal, ElemsPerChunk));
5908 
5909   SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
5910   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
5911 }
5912 
5913 /// Generate a DAG to grab 128-bits from a vector > 128 bits.  This
5914 /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
5915 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
5916 /// instructions or a simple subregister reference. Idx is an index in the
5917 /// 128 bits we want.  It need not be aligned to a 128-bit boundary.  That makes
5918 /// lowering EXTRACT_VECTOR_ELT operations easier.
5919 static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
5920                                    SelectionDAG &DAG, const SDLoc &dl) {
5921   assert((Vec.getValueType().is256BitVector() ||
5922           Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
5923   return extractSubVector(Vec, IdxVal, DAG, dl, 128);
5924 }
5925 
5926 /// Generate a DAG to grab 256-bits from a 512-bit vector.
5927 static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
5928                                    SelectionDAG &DAG, const SDLoc &dl) {
5929   assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
5930   return extractSubVector(Vec, IdxVal, DAG, dl, 256);
5931 }
5932 
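// Insert the vectorWidth-bit subvector Vec into Result at the aligned chunk
// that contains element IdxVal.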
5933 static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
5934                                SelectionDAG &DAG, const SDLoc &dl,
5935                                unsigned vectorWidth) {
5936   assert((vectorWidth == 128 || vectorWidth == 256) &&
5937          "Unsupported vector width");
5938   // Inserting an UNDEF subvector leaves Result unchanged.
5939   if (Vec.isUndef())
5940     return Result;
5941   EVT VT = Vec.getValueType();
5942   EVT ElVT = VT.getVectorElementType();
5943   EVT ResultVT = Result.getValueType();
5944 
5945   // Insert the relevant vectorWidth bits.
5946   unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
5947   assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
5948 
5949   // This is the index of the first element of the vectorWidth-bit chunk
5950   // we want. Since ElemsPerChunk is a power of 2, we just need to clear the
5950   // low bits.
5951   IdxVal &= ~(ElemsPerChunk - 1);
5952 
5953   SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
5954   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
5955 }
5956 
5957 /// Generate a DAG to put 128-bits into a vector > 128 bits.  This
5958 /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
5959 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
5960 /// simple superregister reference.  Idx is an index in the 128 bits
5961 /// we want.  It need not be aligned to a 128-bit boundary.  That makes
5962 /// lowering INSERT_VECTOR_ELT operations easier.
5963 static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
5964                                   SelectionDAG &DAG, const SDLoc &dl) {
5965   assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
5966   return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
5967 }
5968 
5969 /// Widen a vector to a larger size with the same scalar type, with the new
5970 /// elements either zero or undef.
5971 static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
5972                               const X86Subtarget &Subtarget, SelectionDAG &DAG,
5973                               const SDLoc &dl) {
5974   assert(Vec.getValueSizeInBits().getFixedSize() < VT.getFixedSizeInBits() &&
5975          Vec.getValueType().getScalarType() == VT.getScalarType() &&
5976          "Unsupported vector widening type");
5977   SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
5978                                 : DAG.getUNDEF(VT);
5979   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, Vec,
5980                      DAG.getIntPtrConstant(0, dl));
5981 }
5982 
5983 /// Widen a vector to a larger size with the same scalar type, with the new
5984 /// elements either zero or undef.
5985 static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
5986                               const X86Subtarget &Subtarget, SelectionDAG &DAG,
5987                               const SDLoc &dl, unsigned WideSizeInBits) {
5988   assert(Vec.getValueSizeInBits() < WideSizeInBits &&
5989          (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&
5990          "Unsupported vector widening type");
5991   unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits();
5992   MVT SVT = Vec.getSimpleValueType().getScalarType();
5993   MVT VT = MVT::getVectorVT(SVT, WideNumElts);
5994   return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
5995 }
5996 
5997 // Helper function to collect subvector ops that are concatenated together,
5998 // either by ISD::CONCAT_VECTORS or a ISD::INSERT_SUBVECTOR series.
5999 // The subvectors in Ops are guaranteed to be the same type.
6000 static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops) {
6001   assert(Ops.empty() && "Expected an empty ops vector");
6002 
6003   if (N->getOpcode() == ISD::CONCAT_VECTORS) {
6004     Ops.append(N->op_begin(), N->op_end());
6005     return true;
6006   }
6007 
6008   if (N->getOpcode() == ISD::INSERT_SUBVECTOR) {
6009     SDValue Src = N->getOperand(0);
6010     SDValue Sub = N->getOperand(1);
6011     const APInt &Idx = N->getConstantOperandAPInt(2);
6012     EVT VT = Src.getValueType();
6013     EVT SubVT = Sub.getValueType();
6014 
6015     // TODO - Handle more general insert_subvector chains.
6016     if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2) &&
6017         Idx == (VT.getVectorNumElements() / 2)) {
6018       // insert_subvector(insert_subvector(undef, x, lo), y, hi)
6019       if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
6020           Src.getOperand(1).getValueType() == SubVT &&
6021           isNullConstant(Src.getOperand(2))) {
6022         Ops.push_back(Src.getOperand(1));
6023         Ops.push_back(Sub);
6024         return true;
6025       }
6026       // insert_subvector(x, extract_subvector(x, lo), hi)
6027       if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
6028           Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {
6029         Ops.append(2, Sub);
6030         return true;
6031       }
6032     }
6033   }
6034 
6035   return false;
6036 }
6037 
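// Split a vector into two equal-sized halves (Lo/Hi subvectors).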
6038 static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG,
6039                                                const SDLoc &dl) {
6040   EVT VT = Op.getValueType();
6041   unsigned NumElems = VT.getVectorNumElements();
6042   unsigned SizeInBits = VT.getSizeInBits();
6043   assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&
6044          "Can't split odd sized vector");
6045 
6046   SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2);
6047   SDValue Hi = extractSubVector(Op, NumElems / 2, DAG, dl, SizeInBits / 2);
6048   return std::make_pair(Lo, Hi);
6049 }
6050 
6051 // Split a unary integer op into 2 half-sized ops.
6052 static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
6053   EVT VT = Op.getValueType();
6054 
6055   // Make sure we only try to split 256/512-bit types to avoid creating
6056   // narrow vectors.
6057   assert((Op.getOperand(0).getValueType().is256BitVector() ||
6058           Op.getOperand(0).getValueType().is512BitVector()) &&
6059          (VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
6060   assert(Op.getOperand(0).getValueType().getVectorNumElements() ==
6061              VT.getVectorNumElements() &&
6062          "Unexpected VTs!");
6063 
6064   SDLoc dl(Op);
6065 
6066   // Extract the Lo/Hi vectors
6067   SDValue Lo, Hi;
6068   std::tie(Lo, Hi) = splitVector(Op.getOperand(0), DAG, dl);
6069 
6070   EVT LoVT, HiVT;
6071   std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
6072   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
6073                      DAG.getNode(Op.getOpcode(), dl, LoVT, Lo),
6074                      DAG.getNode(Op.getOpcode(), dl, HiVT, Hi));
6075 }
6076 
6077 /// Break a binary integer operation into 2 half-sized ops and then
6078 /// concatenate the results back together.
6079 static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG) {
6080   EVT VT = Op.getValueType();
6081 
6082   // Sanity check that all the types match.
6083   assert(Op.getOperand(0).getValueType() == VT &&
6084          Op.getOperand(1).getValueType() == VT && "Unexpected VTs!");
6085   assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
6086 
6087   SDLoc dl(Op);
6088 
6089   // Extract the LHS Lo/Hi vectors
6090   SDValue LHS1, LHS2;
6091   std::tie(LHS1, LHS2) = splitVector(Op.getOperand(0), DAG, dl);
6092 
6093   // Extract the RHS Lo/Hi vectors
6094   SDValue RHS1, RHS2;
6095   std::tie(RHS1, RHS2) = splitVector(Op.getOperand(1), DAG, dl);
6096 
6097   EVT LoVT, HiVT;
6098   std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
6099   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
6100                      DAG.getNode(Op.getOpcode(), dl, LoVT, LHS1, RHS1),
6101                      DAG.getNode(Op.getOpcode(), dl, HiVT, LHS2, RHS2));
6102 }
6103 
6104 // Helper for splitting operands of an operation to legal target size and
6105 // apply a function on each part.
6106 // Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
6107 // 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
6108 // deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
6109 // The argument Builder is a function that will be applied on each split part:
6110 // SDValue Builder(SelectionDAG &G, const SDLoc &DL, ArrayRef<SDValue> Ops)
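// Illustrative sketch (not taken from a specific caller): lowering a wide
// averaging op by emitting X86ISD::AVG on each legal-sized piece might look
// like:
//   SplitOpsAndApply(DAG, Subtarget, DL, VT, {X, Y},
//                    [](SelectionDAG &G, const SDLoc &DL,
//                       ArrayRef<SDValue> Ops) {
//                      return G.getNode(X86ISD::AVG, DL,
//                                       Ops[0].getValueType(), Ops[0], Ops[1]);
//                    });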
6111 template <typename F>
6112 SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
6113                          const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
6114                          F Builder, bool CheckBWI = true) {
6115   assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
6116   unsigned NumSubs = 1;
6117   if ((CheckBWI && Subtarget.useBWIRegs()) ||
6118       (!CheckBWI && Subtarget.useAVX512Regs())) {
6119     if (VT.getSizeInBits() > 512) {
6120       NumSubs = VT.getSizeInBits() / 512;
6121       assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
6122     }
6123   } else if (Subtarget.hasAVX2()) {
6124     if (VT.getSizeInBits() > 256) {
6125       NumSubs = VT.getSizeInBits() / 256;
6126       assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
6127     }
6128   } else {
6129     if (VT.getSizeInBits() > 128) {
6130       NumSubs = VT.getSizeInBits() / 128;
6131       assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
6132     }
6133   }
6134 
6135   if (NumSubs == 1)
6136     return Builder(DAG, DL, Ops);
6137 
6138   SmallVector<SDValue, 4> Subs;
6139   for (unsigned i = 0; i != NumSubs; ++i) {
6140     SmallVector<SDValue, 2> SubOps;
6141     for (SDValue Op : Ops) {
6142       EVT OpVT = Op.getValueType();
6143       unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
6144       unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
6145       SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
6146     }
6147     Subs.push_back(Builder(DAG, DL, SubOps));
6148   }
6149   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
6150 }
6151 
6152 /// Insert i1-subvector to i1-vector.
6153 static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
6154                                 const X86Subtarget &Subtarget) {
6155 
6156   SDLoc dl(Op);
6157   SDValue Vec = Op.getOperand(0);
6158   SDValue SubVec = Op.getOperand(1);
6159   SDValue Idx = Op.getOperand(2);
6160   unsigned IdxVal = Op.getConstantOperandVal(2);
6161 
6162   // Inserting undef is a nop. We can just return the original vector.
6163   if (SubVec.isUndef())
6164     return Vec;
6165 
6166   if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
6167     return Op;
6168 
6169   MVT OpVT = Op.getSimpleValueType();
6170   unsigned NumElems = OpVT.getVectorNumElements();
6171   SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
6172 
6173   // Extend to natively supported kshift.
6174   MVT WideOpVT = OpVT;
6175   if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
6176     WideOpVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
6177 
6178   // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
6179   // if necessary.
6180   if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
6181     // May need to promote to a legal type.
6182     Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6183                      DAG.getConstant(0, dl, WideOpVT),
6184                      SubVec, Idx);
6185     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6186   }
6187 
6188   MVT SubVecVT = SubVec.getSimpleValueType();
6189   unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
6190   assert(IdxVal + SubVecNumElems <= NumElems &&
6191          IdxVal % SubVecVT.getSizeInBits() == 0 &&
6192          "Unexpected index value in INSERT_SUBVECTOR");
6193 
6194   SDValue Undef = DAG.getUNDEF(WideOpVT);
6195 
6196   if (IdxVal == 0) {
6197     // Zero lower bits of the Vec
6198     SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
6199     Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
6200                       ZeroIdx);
6201     Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
6202     Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
6203     // Merge them together, SubVec should be zero extended.
6204     SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6205                          DAG.getConstant(0, dl, WideOpVT),
6206                          SubVec, ZeroIdx);
6207     Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
6208     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6209   }
6210 
6211   SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6212                        Undef, SubVec, ZeroIdx);
6213 
6214   if (Vec.isUndef()) {
6215     assert(IdxVal != 0 && "Unexpected index");
6216     SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6217                          DAG.getTargetConstant(IdxVal, dl, MVT::i8));
6218     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
6219   }
6220 
6221   if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
6222     assert(IdxVal != 0 && "Unexpected index");
6223     NumElems = WideOpVT.getVectorNumElements();
6224     unsigned ShiftLeft = NumElems - SubVecNumElems;
6225     unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
6226     SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6227                          DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
6228     if (ShiftRight != 0)
6229       SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
6230                            DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
6231     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
6232   }
6233 
6234   // Simple case: the subvector goes into the upper part.
6235   if (IdxVal + SubVecNumElems == NumElems) {
6236     SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6237                          DAG.getTargetConstant(IdxVal, dl, MVT::i8));
6238     if (SubVecNumElems * 2 == NumElems) {
6239       // Special case, use legal zero extending insert_subvector. This allows
6240       // isel to optimize when bits are known zero.
6241       Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
6242       Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6243                         DAG.getConstant(0, dl, WideOpVT),
6244                         Vec, ZeroIdx);
6245     } else {
6246       // Otherwise use explicit shifts to zero the bits.
6247       Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6248                         Undef, Vec, ZeroIdx);
6249       NumElems = WideOpVT.getVectorNumElements();
6250       SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
6251       Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
6252       Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
6253     }
6254     Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
6255     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6256   }
6257 
6258   // Inserting into the middle is more complicated.
6259 
6260   NumElems = WideOpVT.getVectorNumElements();
6261 
6262   // Widen the vector if needed.
6263   Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
6264 
6265   unsigned ShiftLeft = NumElems - SubVecNumElems;
6266   unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
6267 
6268   // Do an optimization for the most frequently used types.
6269   if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
6270     APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
6271     Mask0.flipAllBits();
6272     SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
6273     SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
6274     Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
6275     SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6276                          DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
6277     SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
6278                          DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
6279     Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
6280 
6281     // Reduce to original width if needed.
6282     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6283   }
6284 
6285   // Clear the upper bits of the subvector and move it to its insert position.
6286   SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6287                        DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
6288   SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
6289                        DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
6290 
6291   // Isolate the bits below the insertion point.
6292   unsigned LowShift = NumElems - IdxVal;
6293   SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
6294                             DAG.getTargetConstant(LowShift, dl, MVT::i8));
6295   Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
6296                     DAG.getTargetConstant(LowShift, dl, MVT::i8));
6297 
6298   // Isolate the bits after the last inserted bit.
6299   unsigned HighShift = IdxVal + SubVecNumElems;
6300   SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
6301                             DAG.getTargetConstant(HighShift, dl, MVT::i8));
6302   High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
6303                     DAG.getTargetConstant(HighShift, dl, MVT::i8));
6304 
6305   // Now OR all 3 pieces together.
6306   Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
6307   SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);
6308 
6309   // Reduce to original width if needed.
6310   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
6311 }
6312 
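// Concatenate two subvectors of the same type into a vector of twice the width.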
6313 static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG,
6314                                 const SDLoc &dl) {
6315   assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch");
6316   EVT SubVT = V1.getValueType();
6317   EVT SubSVT = SubVT.getScalarType();
6318   unsigned SubNumElts = SubVT.getVectorNumElements();
6319   unsigned SubVectorWidth = SubVT.getSizeInBits();
6320   EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts);
6321   SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth);
6322   return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth);
6323 }
6324 
6325 /// Returns a vector of specified type with all bits set.
6326 /// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
6327 /// Then bitcast to their original type, ensuring they get CSE'd.
6328 static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
6329   assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
6330          "Expected a 128/256/512-bit vector type");
6331 
6332   APInt Ones = APInt::getAllOnesValue(32);
6333   unsigned NumElts = VT.getSizeInBits() / 32;
6334   SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
6335   return DAG.getBitcast(VT, Vec);
6336 }
6337 
6338 // Convert *_EXTEND_VECTOR_INREG to *_EXTEND opcode.
6339 static unsigned getOpcode_EXTEND(unsigned Opcode) {
6340   switch (Opcode) {
6341   case ISD::ANY_EXTEND:
6342   case ISD::ANY_EXTEND_VECTOR_INREG:
6343     return ISD::ANY_EXTEND;
6344   case ISD::ZERO_EXTEND:
6345   case ISD::ZERO_EXTEND_VECTOR_INREG:
6346     return ISD::ZERO_EXTEND;
6347   case ISD::SIGN_EXTEND:
6348   case ISD::SIGN_EXTEND_VECTOR_INREG:
6349     return ISD::SIGN_EXTEND;
6350   }
6351   llvm_unreachable("Unknown opcode");
6352 }
6353 
6354 // Convert *_EXTEND to *_EXTEND_VECTOR_INREG opcode.
6355 static unsigned getOpcode_EXTEND_VECTOR_INREG(unsigned Opcode) {
6356   switch (Opcode) {
6357   case ISD::ANY_EXTEND:
6358   case ISD::ANY_EXTEND_VECTOR_INREG:
6359     return ISD::ANY_EXTEND_VECTOR_INREG;
6360   case ISD::ZERO_EXTEND:
6361   case ISD::ZERO_EXTEND_VECTOR_INREG:
6362     return ISD::ZERO_EXTEND_VECTOR_INREG;
6363   case ISD::SIGN_EXTEND:
6364   case ISD::SIGN_EXTEND_VECTOR_INREG:
6365     return ISD::SIGN_EXTEND_VECTOR_INREG;
6366   }
6367   llvm_unreachable("Unknown opcode");
6368 }
6369 
6370 static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT,
6371                                       SDValue In, SelectionDAG &DAG) {
6372   EVT InVT = In.getValueType();
6373   assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
6374   assert((ISD::ANY_EXTEND == Opcode || ISD::SIGN_EXTEND == Opcode ||
6375           ISD::ZERO_EXTEND == Opcode) &&
6376          "Unknown extension opcode");
6377 
6378   // For 256-bit vectors, we only need the lower (128-bit) input half.
6379   // For 512-bit vectors, we only need the lower input half or quarter.
6380   if (InVT.getSizeInBits() > 128) {
6381     assert(VT.getSizeInBits() == InVT.getSizeInBits() &&
6382            "Expected VTs to be the same size!");
6383     unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
6384     In = extractSubVector(In, 0, DAG, DL,
6385                           std::max(128U, (unsigned)VT.getSizeInBits() / Scale));
6386     InVT = In.getValueType();
6387   }
6388 
6389   if (VT.getVectorNumElements() != InVT.getVectorNumElements())
6390     Opcode = getOpcode_EXTEND_VECTOR_INREG(Opcode);
6391 
6392   return DAG.getNode(Opcode, DL, VT, In);
6393 }
6394 
6395 // Match (xor X, -1) -> X.
6396 // Match extract_subvector(xor X, -1) -> extract_subvector(X).
6397 // Match concat_vectors(xor X, -1, xor Y, -1) -> concat_vectors(X, Y).
6398 static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
6399   V = peekThroughBitcasts(V);
6400   if (V.getOpcode() == ISD::XOR &&
6401       ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()))
6402     return V.getOperand(0);
6403   if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
6404       (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
6405     if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
6406       Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
6407       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), V.getValueType(),
6408                          Not, V.getOperand(1));
6409     }
6410   }
6411   SmallVector<SDValue, 2> CatOps;
6412   if (collectConcatOps(V.getNode(), CatOps)) {
6413     for (SDValue &CatOp : CatOps) {
6414       SDValue NotCat = IsNOT(CatOp, DAG);
6415       if (!NotCat) return SDValue();
6416       CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
6417     }
6418     return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), V.getValueType(), CatOps);
6419   }
6420   return SDValue();
6421 }
6422 
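/// Create a shuffle mask matching the unpcklo/unpckhi semantics: elements are
/// interleaved from the low (Lo) or high half of each 128-bit lane, taken from
/// one input (Unary) or two. For example (derived from the loop below), the
/// binary v8i32 Lo mask is <0, 8, 1, 9, 4, 12, 5, 13>.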
6423 void llvm::createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask,
6424                                    bool Lo, bool Unary) {
6425   assert(VT.getScalarType().isSimple() && (VT.getSizeInBits() % 128) == 0 &&
6426          "Illegal vector type to unpack");
6427   assert(Mask.empty() && "Expected an empty shuffle mask vector");
6428   int NumElts = VT.getVectorNumElements();
6429   int NumEltsInLane = 128 / VT.getScalarSizeInBits();
6430   for (int i = 0; i < NumElts; ++i) {
6431     unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
6432     int Pos = (i % NumEltsInLane) / 2 + LaneStart;
6433     Pos += (Unary ? 0 : NumElts * (i % 2));
6434     Pos += (Lo ? 0 : NumEltsInLane / 2);
6435     Mask.push_back(Pos);
6436   }
6437 }
6438 
6439 /// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
6440 /// imposed by AVX and specific to the unary pattern. Example:
6441 /// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
6442 /// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
6443 void llvm::createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
6444                                    bool Lo) {
6445   assert(Mask.empty() && "Expected an empty shuffle mask vector");
6446   int NumElts = VT.getVectorNumElements();
6447   for (int i = 0; i < NumElts; ++i) {
6448     int Pos = i / 2;
6449     Pos += (Lo ? 0 : NumElts / 2);
6450     Mask.push_back(Pos);
6451   }
6452 }
6453 
6454 /// Returns a vector_shuffle node for an unpackl operation.
6455 static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
6456                           SDValue V1, SDValue V2) {
6457   SmallVector<int, 8> Mask;
6458   createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
6459   return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
6460 }
6461 
6462 /// Returns a vector_shuffle node for an unpackh operation.
6463 static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
6464                           SDValue V1, SDValue V2) {
6465   SmallVector<int, 8> Mask;
6466   createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
6467   return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
6468 }
6469 
6470 /// Return a vector_shuffle of the specified vector of zero or undef vector.
6471 /// This produces a shuffle where the low element of V2 is swizzled into the
6472 /// zero/undef vector, landing at element Idx.
6473 /// This produces a shuffle mask like 4,1,2,3 (idx=0) or  0,1,2,4 (idx=3).
6474 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
6475                                            bool IsZero,
6476                                            const X86Subtarget &Subtarget,
6477                                            SelectionDAG &DAG) {
6478   MVT VT = V2.getSimpleValueType();
6479   SDValue V1 = IsZero
6480     ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
6481   int NumElems = VT.getVectorNumElements();
6482   SmallVector<int, 16> MaskVec(NumElems);
6483   for (int i = 0; i != NumElems; ++i)
6484     // If this is the insertion idx, put the low elt of V2 here.
6485     MaskVec[i] = (i == Idx) ? NumElems : i;
6486   return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
6487 }
6488 
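// Peek through an X86ISD::Wrapper/WrapperRIP around a constant-pool address
// and return the underlying Constant, or null if there isn't one (or it is a
// machine constant-pool entry or has a non-zero offset).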
6489 static const Constant *getTargetConstantFromBasePtr(SDValue Ptr) {
6490   if (Ptr.getOpcode() == X86ISD::Wrapper ||
6491       Ptr.getOpcode() == X86ISD::WrapperRIP)
6492     Ptr = Ptr.getOperand(0);
6493 
6494   auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
6495   if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
6496     return nullptr;
6497 
6498   return CNode->getConstVal();
6499 }
6500 
6501 static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
6502   if (!Load || !ISD::isNormalLoad(Load))
6503     return nullptr;
6504   return getTargetConstantFromBasePtr(Load->getBasePtr());
6505 }
6506 
6507 static const Constant *getTargetConstantFromNode(SDValue Op) {
6508   Op = peekThroughBitcasts(Op);
6509   return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op));
6510 }
6511 
6512 const Constant *
6513 X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {
6514   assert(LD && "Unexpected null LoadSDNode");
6515   return getTargetConstantFromNode(LD);
6516 }
6517 
6518 // Extract raw constant bits from constant pools.
6519 static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
6520                                           APInt &UndefElts,
6521                                           SmallVectorImpl<APInt> &EltBits,
6522                                           bool AllowWholeUndefs = true,
6523                                           bool AllowPartialUndefs = true) {
6524   assert(EltBits.empty() && "Expected an empty EltBits vector");
6525 
6526   Op = peekThroughBitcasts(Op);
6527 
6528   EVT VT = Op.getValueType();
6529   unsigned SizeInBits = VT.getSizeInBits();
6530   assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
6531   unsigned NumElts = SizeInBits / EltSizeInBits;
6532 
6533   // Bitcast a source array of element bits to the target size.
6534   auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
6535     unsigned NumSrcElts = UndefSrcElts.getBitWidth();
6536     unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
6537     assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
6538            "Constant bit sizes don't match");
6539 
6540     // Don't split if we don't allow undef bits.
6541     bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
6542     if (UndefSrcElts.getBoolValue() && !AllowUndefs)
6543       return false;
6544 
6545     // If we're already the right size, don't bother bitcasting.
6546     if (NumSrcElts == NumElts) {
6547       UndefElts = UndefSrcElts;
6548       EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
6549       return true;
6550     }
6551 
6552     // Extract all the undef/constant element data and pack into single bitsets.
6553     APInt UndefBits(SizeInBits, 0);
6554     APInt MaskBits(SizeInBits, 0);
6555 
6556     for (unsigned i = 0; i != NumSrcElts; ++i) {
6557       unsigned BitOffset = i * SrcEltSizeInBits;
6558       if (UndefSrcElts[i])
6559         UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
6560       MaskBits.insertBits(SrcEltBits[i], BitOffset);
6561     }
6562 
6563     // Split the undef/constant single bitset data into the target elements.
6564     UndefElts = APInt(NumElts, 0);
6565     EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
6566 
6567     for (unsigned i = 0; i != NumElts; ++i) {
6568       unsigned BitOffset = i * EltSizeInBits;
6569       APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
6570 
6571       // Only treat an element as UNDEF if all bits are UNDEF.
6572       if (UndefEltBits.isAllOnesValue()) {
6573         if (!AllowWholeUndefs)
6574           return false;
6575         UndefElts.setBit(i);
6576         continue;
6577       }
6578 
6579       // If only some bits are UNDEF then treat them as zero (or bail if not
6580       // supported).
6581       if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
6582         return false;
6583 
6584       EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
6585     }
6586     return true;
6587   };
6588 
6589   // Collect constant bits and insert into mask/undef bit masks.
6590   auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
6591                                 unsigned UndefBitIndex) {
6592     if (!Cst)
6593       return false;
6594     if (isa<UndefValue>(Cst)) {
6595       Undefs.setBit(UndefBitIndex);
6596       return true;
6597     }
6598     if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
6599       Mask = CInt->getValue();
6600       return true;
6601     }
6602     if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
6603       Mask = CFP->getValueAPF().bitcastToAPInt();
6604       return true;
6605     }
6606     return false;
6607   };
6608 
6609   // Handle UNDEFs.
6610   if (Op.isUndef()) {
6611     APInt UndefSrcElts = APInt::getAllOnesValue(NumElts);
6612     SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
6613     return CastBitData(UndefSrcElts, SrcEltBits);
6614   }
6615 
6616   // Extract scalar constant bits.
6617   if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
6618     APInt UndefSrcElts = APInt::getNullValue(1);
6619     SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
6620     return CastBitData(UndefSrcElts, SrcEltBits);
6621   }
6622   if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
6623     APInt UndefSrcElts = APInt::getNullValue(1);
6624     APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
6625     SmallVector<APInt, 64> SrcEltBits(1, RawBits);
6626     return CastBitData(UndefSrcElts, SrcEltBits);
6627   }
6628 
6629   // Extract constant bits from build vector.
6630   if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
6631     unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
6632     unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
6633 
6634     APInt UndefSrcElts(NumSrcElts, 0);
6635     SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
6636     for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
6637       const SDValue &Src = Op.getOperand(i);
6638       if (Src.isUndef()) {
6639         UndefSrcElts.setBit(i);
6640         continue;
6641       }
6642       auto *Cst = cast<ConstantSDNode>(Src);
6643       SrcEltBits[i] = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
6644     }
6645     return CastBitData(UndefSrcElts, SrcEltBits);
6646   }
6647   if (ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode())) {
6648     unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
6649     unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
6650 
6651     APInt UndefSrcElts(NumSrcElts, 0);
6652     SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
6653     for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
6654       const SDValue &Src = Op.getOperand(i);
6655       if (Src.isUndef()) {
6656         UndefSrcElts.setBit(i);
6657         continue;
6658       }
6659       auto *Cst = cast<ConstantFPSDNode>(Src);
6660       APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
6661       SrcEltBits[i] = RawBits.zextOrTrunc(SrcEltSizeInBits);
6662     }
6663     return CastBitData(UndefSrcElts, SrcEltBits);
6664   }
6665 
6666   // Extract constant bits from constant pool vector.
6667   if (auto *Cst = getTargetConstantFromNode(Op)) {
6668     Type *CstTy = Cst->getType();
6669     unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
6670     if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
6671       return false;
6672 
6673     unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
6674     unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
6675 
6676     APInt UndefSrcElts(NumSrcElts, 0);
6677     SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
6678     for (unsigned i = 0; i != NumSrcElts; ++i)
6679       if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
6680                                UndefSrcElts, i))
6681         return false;
6682 
6683     return CastBitData(UndefSrcElts, SrcEltBits);
6684   }
6685 
6686   // Extract constant bits from a broadcasted constant pool scalar.
6687   if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
6688       EltSizeInBits <= VT.getScalarSizeInBits()) {
6689     auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
6690     if (MemIntr->getMemoryVT().getScalarSizeInBits() != VT.getScalarSizeInBits())
6691       return false;
6692 
6693     SDValue Ptr = MemIntr->getBasePtr();
6694     if (const Constant *C = getTargetConstantFromBasePtr(Ptr)) {
6695       unsigned SrcEltSizeInBits = C->getType()->getScalarSizeInBits();
6696       unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
6697 
6698       APInt UndefSrcElts(NumSrcElts, 0);
6699       SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
6700       if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
6701         if (UndefSrcElts[0])
6702           UndefSrcElts.setBits(0, NumSrcElts);
6703         SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
6704         return CastBitData(UndefSrcElts, SrcEltBits);
6705       }
6706     }
6707   }
6708 
6709   // Extract constant bits from a subvector broadcast.
6710   if (Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
6711     auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
6712     SDValue Ptr = MemIntr->getBasePtr();
6713     // The source constant may be larger than the subvector broadcast;
6714     // ensure we extract the correct subvector constants.
6715     if (const Constant *Cst = getTargetConstantFromBasePtr(Ptr)) {
6716       Type *CstTy = Cst->getType();
6717       unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
6718       unsigned SubVecSizeInBits = MemIntr->getMemoryVT().getStoreSizeInBits();
6719       if (!CstTy->isVectorTy() || (CstSizeInBits % SubVecSizeInBits) != 0 ||
6720           (SizeInBits % SubVecSizeInBits) != 0)
6721         return false;
6722       unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
6723       unsigned NumSubElts = SubVecSizeInBits / CstEltSizeInBits;
6724       unsigned NumSubVecs = SizeInBits / SubVecSizeInBits;
6725       APInt UndefSubElts(NumSubElts, 0);
6726       SmallVector<APInt, 64> SubEltBits(NumSubElts * NumSubVecs,
6727                                         APInt(CstEltSizeInBits, 0));
6728       for (unsigned i = 0; i != NumSubElts; ++i) {
6729         if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i],
6730                                  UndefSubElts, i))
6731           return false;
6732         for (unsigned j = 1; j != NumSubVecs; ++j)
6733           SubEltBits[i + (j * NumSubElts)] = SubEltBits[i];
6734       }
6735       UndefSubElts = APInt::getSplat(NumSubVecs * UndefSubElts.getBitWidth(),
6736                                      UndefSubElts);
6737       return CastBitData(UndefSubElts, SubEltBits);
6738     }
6739   }
6740 
6741   // Extract a rematerialized scalar constant insertion.
6742   if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
6743       Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
6744       isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
6745     unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
6746     unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
6747 
6748     APInt UndefSrcElts(NumSrcElts, 0);
6749     SmallVector<APInt, 64> SrcEltBits;
6750     auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
6751     SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits));
6752     SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
6753     return CastBitData(UndefSrcElts, SrcEltBits);
6754   }
6755 
6756   // Insert constant bits from base and subvector sources.
6757   if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) {
6758     // Bitcasting to larger elements can lose track of undefs, so to be safe
6759     // don't allow any.
6760     unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
6761     bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits;
6762 
6763     APInt UndefSrcElts, UndefSubElts;
6764     SmallVector<APInt, 32> EltSrcBits, EltSubBits;
6765     if (getTargetConstantBitsFromNode(Op.getOperand(1), SrcEltSizeInBits,
6766                                       UndefSubElts, EltSubBits,
6767                                       AllowWholeUndefs && AllowUndefs,
6768                                       AllowPartialUndefs && AllowUndefs) &&
6769         getTargetConstantBitsFromNode(Op.getOperand(0), SrcEltSizeInBits,
6770                                       UndefSrcElts, EltSrcBits,
6771                                       AllowWholeUndefs && AllowUndefs,
6772                                       AllowPartialUndefs && AllowUndefs)) {
6773       unsigned BaseIdx = Op.getConstantOperandVal(2);
6774       UndefSrcElts.insertBits(UndefSubElts, BaseIdx);
6775       for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
6776         EltSrcBits[BaseIdx + i] = EltSubBits[i];
6777       return CastBitData(UndefSrcElts, EltSrcBits);
6778     }
6779   }
6780 
6781   // Extract constant bits from a subvector's source.
6782   if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
6783     // TODO - support extract_subvector through bitcasts.
6784     if (EltSizeInBits != VT.getScalarSizeInBits())
6785       return false;
6786 
6787     if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
6788                                       UndefElts, EltBits, AllowWholeUndefs,
6789                                       AllowPartialUndefs)) {
6790       EVT SrcVT = Op.getOperand(0).getValueType();
6791       unsigned NumSrcElts = SrcVT.getVectorNumElements();
6792       unsigned NumSubElts = VT.getVectorNumElements();
6793       unsigned BaseIdx = Op.getConstantOperandVal(1);
6794       UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
6795       if ((BaseIdx + NumSubElts) != NumSrcElts)
6796         EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
6797       if (BaseIdx != 0)
6798         EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
6799       return true;
6800     }
6801   }
6802 
6803   // Extract constant bits from shuffle node sources.
6804   if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {
6805     // TODO - support shuffle through bitcasts.
6806     if (EltSizeInBits != VT.getScalarSizeInBits())
6807       return false;
6808 
6809     ArrayRef<int> Mask = SVN->getMask();
6810     if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
6811         llvm::any_of(Mask, [](int M) { return M < 0; }))
6812       return false;
6813 
6814     APInt UndefElts0, UndefElts1;
6815     SmallVector<APInt, 32> EltBits0, EltBits1;
6816     if (isAnyInRange(Mask, 0, NumElts) &&
6817         !getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
6818                                        UndefElts0, EltBits0, AllowWholeUndefs,
6819                                        AllowPartialUndefs))
6820       return false;
6821     if (isAnyInRange(Mask, NumElts, 2 * NumElts) &&
6822         !getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
6823                                        UndefElts1, EltBits1, AllowWholeUndefs,
6824                                        AllowPartialUndefs))
6825       return false;
6826 
6827     UndefElts = APInt::getNullValue(NumElts);
6828     for (int i = 0; i != (int)NumElts; ++i) {
6829       int M = Mask[i];
6830       if (M < 0) {
6831         UndefElts.setBit(i);
6832         EltBits.push_back(APInt::getNullValue(EltSizeInBits));
6833       } else if (M < (int)NumElts) {
6834         if (UndefElts0[M])
6835           UndefElts.setBit(i);
6836         EltBits.push_back(EltBits0[M]);
6837       } else {
6838         if (UndefElts1[M - NumElts])
6839           UndefElts.setBit(i);
6840         EltBits.push_back(EltBits1[M - NumElts]);
6841       }
6842     }
6843     return true;
6844   }
6845 
6846   return false;
6847 }
6848 
6849 namespace llvm {
6850 namespace X86 {
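// Returns true if Op is built from a single repeated constant value and sets
// SplatVal to that value. Wholly-undef elements are skipped; elements with
// partially-undef bits are only accepted when AllowPartialUndefs is set.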
6851 bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) {
6852   APInt UndefElts;
6853   SmallVector<APInt, 16> EltBits;
6854   if (getTargetConstantBitsFromNode(Op, Op.getScalarValueSizeInBits(),
6855                                     UndefElts, EltBits, true,
6856                                     AllowPartialUndefs)) {
6857     int SplatIndex = -1;
6858     for (int i = 0, e = EltBits.size(); i != e; ++i) {
6859       if (UndefElts[i])
6860         continue;
6861       if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
6862         SplatIndex = -1;
6863         break;
6864       }
6865       SplatIndex = i;
6866     }
6867     if (0 <= SplatIndex) {
6868       SplatVal = EltBits[SplatIndex];
6869       return true;
6870     }
6871   }
6872 
6873   return false;
6874 }
6875 } // namespace X86
6876 } // namespace llvm
6877 
6878 static bool getTargetShuffleMaskIndices(SDValue MaskNode,
6879                                         unsigned MaskEltSizeInBits,
6880                                         SmallVectorImpl<uint64_t> &RawMask,
6881                                         APInt &UndefElts) {
6882   // Extract the raw target constant bits.
6883   SmallVector<APInt, 64> EltBits;
6884   if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
6885                                      EltBits, /* AllowWholeUndefs */ true,
6886                                      /* AllowPartialUndefs */ false))
6887     return false;
6888 
6889   // Insert the extracted elements into the mask.
6890   for (const APInt &Elt : EltBits)
6891     RawMask.push_back(Elt.getZExtValue());
6892 
6893   return true;
6894 }
6895 
6896 /// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
6897 /// A multi-stage pack shuffle mask is created by specifying NumStages > 1.
6898 /// Note: This ignores saturation, so inputs must be checked first.
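/// For example (following the loops below), a single-stage binary v8i16 pack
/// produces the mask <0, 2, 4, 6, 8, 10, 12, 14>.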
6899 static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
6900                                   bool Unary, unsigned NumStages = 1) {
6901   assert(Mask.empty() && "Expected an empty shuffle mask vector");
6902   unsigned NumElts = VT.getVectorNumElements();
6903   unsigned NumLanes = VT.getSizeInBits() / 128;
6904   unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
6905   unsigned Offset = Unary ? 0 : NumElts;
6906   unsigned Repetitions = 1u << (NumStages - 1);
6907   unsigned Increment = 1u << NumStages;
6908   assert((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction");
6909 
6910   for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
6911     for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
6912       for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
6913         Mask.push_back(Elt + (Lane * NumEltsPerLane));
6914       for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
6915         Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
6916     }
6917   }
6918 }
6919 
6920 // Split the demanded elts of a PACKSS/PACKUS node between its operands.
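// E.g. for a v16i8 PACK result (one 128-bit lane), demanded result elements
// 0-7 map to LHS elements 0-7 and demanded result elements 8-15 map to RHS
// elements 0-7.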
6921 static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
6922                                 APInt &DemandedLHS, APInt &DemandedRHS) {
6923   int NumLanes = VT.getSizeInBits() / 128;
6924   int NumElts = DemandedElts.getBitWidth();
6925   int NumInnerElts = NumElts / 2;
6926   int NumEltsPerLane = NumElts / NumLanes;
6927   int NumInnerEltsPerLane = NumInnerElts / NumLanes;
6928 
6929   DemandedLHS = APInt::getNullValue(NumInnerElts);
6930   DemandedRHS = APInt::getNullValue(NumInnerElts);
6931 
6932   // Map DemandedElts to the packed operands.
6933   for (int Lane = 0; Lane != NumLanes; ++Lane) {
6934     for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
6935       int OuterIdx = (Lane * NumEltsPerLane) + Elt;
6936       int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
6937       if (DemandedElts[OuterIdx])
6938         DemandedLHS.setBit(InnerIdx);
6939       if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
6940         DemandedRHS.setBit(InnerIdx);
6941     }
6942   }
6943 }
6944 
6945 // Split the demanded elts of a HADD/HSUB node between its operands.
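// E.g. for a v8i32 HADD, demanding result element 1 demands LHS elements 2+3,
// while demanding result element 2 demands RHS elements 0+1.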
6946 static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
6947                                  APInt &DemandedLHS, APInt &DemandedRHS) {
6948   int NumLanes = VT.getSizeInBits() / 128;
6949   int NumElts = DemandedElts.getBitWidth();
6950   int NumEltsPerLane = NumElts / NumLanes;
6951   int HalfEltsPerLane = NumEltsPerLane / 2;
6952 
6953   DemandedLHS = APInt::getNullValue(NumElts);
6954   DemandedRHS = APInt::getNullValue(NumElts);
6955 
6956   // Map DemandedElts to the horizontal operands.
6957   for (int Idx = 0; Idx != NumElts; ++Idx) {
6958     if (!DemandedElts[Idx])
6959       continue;
6960     int LaneIdx = (Idx / NumEltsPerLane) * NumEltsPerLane;
6961     int LocalIdx = Idx % NumEltsPerLane;
6962     if (LocalIdx < HalfEltsPerLane) {
6963       DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 0);
6964       DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 1);
6965     } else {
6966       LocalIdx -= HalfEltsPerLane;
6967       DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 0);
6968       DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 1);
6969     }
6970   }
6971 }
6972 
6973 /// Calculates the shuffle mask corresponding to the target-specific opcode.
6974 /// If the mask could be calculated, returns it in \p Mask, returns the shuffle
6975 /// operands in \p Ops, and returns true.
6976 /// Sets \p IsUnary to true if only one source is used. Note that this will set
6977 /// IsUnary for shuffles which use a single input multiple times, and in those
6978 /// cases it will adjust the mask to only have indices within that single input.
6979 /// It is an error to call this with non-empty Mask/Ops vectors.
6980 static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
6981                                  SmallVectorImpl<SDValue> &Ops,
6982                                  SmallVectorImpl<int> &Mask, bool &IsUnary) {
6983   unsigned NumElems = VT.getVectorNumElements();
6984   unsigned MaskEltSize = VT.getScalarSizeInBits();
6985   SmallVector<uint64_t, 32> RawMask;
6986   APInt RawUndefs;
6987   uint64_t ImmN;
6988 
6989   assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
6990   assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
6991 
6992   IsUnary = false;
6993   bool IsFakeUnary = false;
6994   switch (N->getOpcode()) {
6995   case X86ISD::BLENDI:
6996     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6997     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6998     ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
6999     DecodeBLENDMask(NumElems, ImmN, Mask);
7000     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7001     break;
7002   case X86ISD::SHUFP:
7003     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7004     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7005     ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7006     DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask);
7007     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7008     break;
7009   case X86ISD::INSERTPS:
7010     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7011     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7012     ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7013     DecodeINSERTPSMask(ImmN, Mask);
7014     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7015     break;
7016   case X86ISD::EXTRQI:
7017     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7018     if (isa<ConstantSDNode>(N->getOperand(1)) &&
7019         isa<ConstantSDNode>(N->getOperand(2))) {
7020       int BitLen = N->getConstantOperandVal(1);
7021       int BitIdx = N->getConstantOperandVal(2);
7022       DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
7023       IsUnary = true;
7024     }
7025     break;
7026   case X86ISD::INSERTQI:
7027     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7028     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7029     if (isa<ConstantSDNode>(N->getOperand(2)) &&
7030         isa<ConstantSDNode>(N->getOperand(3))) {
7031       int BitLen = N->getConstantOperandVal(2);
7032       int BitIdx = N->getConstantOperandVal(3);
7033       DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
7034       IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7035     }
7036     break;
7037   case X86ISD::UNPCKH:
7038     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7039     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7040     DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
7041     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7042     break;
7043   case X86ISD::UNPCKL:
7044     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7045     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7046     DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
7047     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7048     break;
7049   case X86ISD::MOVHLPS:
7050     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7051     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7052     DecodeMOVHLPSMask(NumElems, Mask);
7053     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7054     break;
7055   case X86ISD::MOVLHPS:
7056     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7057     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7058     DecodeMOVLHPSMask(NumElems, Mask);
7059     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7060     break;
7061   case X86ISD::VALIGN:
7062     assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
7063            "Only 32-bit and 64-bit elements are supported!");
7064     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7065     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7066     ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7067     DecodeVALIGNMask(NumElems, ImmN, Mask);
7068     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7069     Ops.push_back(N->getOperand(1));
7070     Ops.push_back(N->getOperand(0));
7071     break;
7072   case X86ISD::PALIGNR:
7073     assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7074     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7075     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7076     ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7077     DecodePALIGNRMask(NumElems, ImmN, Mask);
7078     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7079     Ops.push_back(N->getOperand(1));
7080     Ops.push_back(N->getOperand(0));
7081     break;
7082   case X86ISD::VSHLDQ:
7083     assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7084     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7085     ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7086     DecodePSLLDQMask(NumElems, ImmN, Mask);
7087     IsUnary = true;
7088     break;
7089   case X86ISD::VSRLDQ:
7090     assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7091     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7092     ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7093     DecodePSRLDQMask(NumElems, ImmN, Mask);
7094     IsUnary = true;
7095     break;
7096   case X86ISD::PSHUFD:
7097   case X86ISD::VPERMILPI:
7098     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7099     ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7100     DecodePSHUFMask(NumElems, MaskEltSize, ImmN, Mask);
7101     IsUnary = true;
7102     break;
7103   case X86ISD::PSHUFHW:
7104     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7105     ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7106     DecodePSHUFHWMask(NumElems, ImmN, Mask);
7107     IsUnary = true;
7108     break;
7109   case X86ISD::PSHUFLW:
7110     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7111     ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7112     DecodePSHUFLWMask(NumElems, ImmN, Mask);
7113     IsUnary = true;
7114     break;
7115   case X86ISD::VZEXT_MOVL:
7116     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7117     DecodeZeroMoveLowMask(NumElems, Mask);
7118     IsUnary = true;
7119     break;
7120   case X86ISD::VBROADCAST:
7121   // We only decode broadcasts of same-sized vectors; peeking through to
7122     // extracted subvectors is likely to cause hasOneUse issues with
7123     // SimplifyDemandedBits etc.
7124     if (N->getOperand(0).getValueType() == VT) {
7125       DecodeVectorBroadcast(NumElems, Mask);
7126       IsUnary = true;
7127       break;
7128     }
7129     return false;
7130   case X86ISD::VPERMILPV: {
7131     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7132     IsUnary = true;
7133     SDValue MaskNode = N->getOperand(1);
7134     if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7135                                     RawUndefs)) {
7136       DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
7137       break;
7138     }
7139     return false;
7140   }
7141   case X86ISD::PSHUFB: {
7142     assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7143     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7144     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7145     IsUnary = true;
7146     SDValue MaskNode = N->getOperand(1);
7147     if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
7148       DecodePSHUFBMask(RawMask, RawUndefs, Mask);
7149       break;
7150     }
7151     return false;
7152   }
7153   case X86ISD::VPERMI:
7154     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7155     ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7156     DecodeVPERMMask(NumElems, ImmN, Mask);
7157     IsUnary = true;
7158     break;
7159   case X86ISD::MOVSS:
7160   case X86ISD::MOVSD:
7161     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7162     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7163     DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
7164     break;
7165   case X86ISD::VPERM2X128:
7166     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7167     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7168     ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7169     DecodeVPERM2X128Mask(NumElems, ImmN, Mask);
7170     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7171     break;
7172   case X86ISD::SHUF128:
7173     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7174     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7175     ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7176     decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, Mask);
7177     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7178     break;
7179   case X86ISD::MOVSLDUP:
7180     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7181     DecodeMOVSLDUPMask(NumElems, Mask);
7182     IsUnary = true;
7183     break;
7184   case X86ISD::MOVSHDUP:
7185     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7186     DecodeMOVSHDUPMask(NumElems, Mask);
7187     IsUnary = true;
7188     break;
7189   case X86ISD::MOVDDUP:
7190     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7191     DecodeMOVDDUPMask(NumElems, Mask);
7192     IsUnary = true;
7193     break;
7194   case X86ISD::VPERMIL2: {
7195     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7196     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7197     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7198     SDValue MaskNode = N->getOperand(2);
7199     SDValue CtrlNode = N->getOperand(3);
7200     if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
7201       unsigned CtrlImm = CtrlOp->getZExtValue();
7202       if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7203                                       RawUndefs)) {
7204         DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
7205                             Mask);
7206         break;
7207       }
7208     }
7209     return false;
7210   }
7211   case X86ISD::VPPERM: {
7212     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7213     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7214     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7215     SDValue MaskNode = N->getOperand(2);
7216     if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
7217       DecodeVPPERMMask(RawMask, RawUndefs, Mask);
7218       break;
7219     }
7220     return false;
7221   }
7222   case X86ISD::VPERMV: {
7223     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7224     IsUnary = true;
7225     // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
7226     Ops.push_back(N->getOperand(1));
7227     SDValue MaskNode = N->getOperand(0);
7228     if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7229                                     RawUndefs)) {
7230       DecodeVPERMVMask(RawMask, RawUndefs, Mask);
7231       break;
7232     }
7233     return false;
7234   }
7235   case X86ISD::VPERMV3: {
7236     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7237     assert(N->getOperand(2).getValueType() == VT && "Unexpected value type");
7238     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
7239     // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
7240     Ops.push_back(N->getOperand(0));
7241     Ops.push_back(N->getOperand(2));
7242     SDValue MaskNode = N->getOperand(1);
7243     if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7244                                     RawUndefs)) {
7245       DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
7246       break;
7247     }
7248     return false;
7249   }
7250   default: llvm_unreachable("unknown target shuffle node");
7251   }
7252 
7253   // Empty mask indicates the decode failed.
7254   if (Mask.empty())
7255     return false;
7256 
7257   // Check if we're getting a shuffle mask with zeroed elements.
7258   if (!AllowSentinelZero && isAnyZero(Mask))
7259     return false;
7260 
7261   // If we have a fake unary shuffle, the shuffle mask is spread across two
7262   // inputs that are actually the same node. Re-map the mask to always point
7263   // into the first input.
7264   if (IsFakeUnary)
7265     for (int &M : Mask)
7266       if (M >= (int)Mask.size())
7267         M -= Mask.size();
7268 
7269   // If we didn't already add operands in the opcode-specific code, default to
7270   // adding 1 or 2 operands starting at 0.
7271   if (Ops.empty()) {
7272     Ops.push_back(N->getOperand(0));
7273     if (!IsUnary || IsFakeUnary)
7274       Ops.push_back(N->getOperand(1));
7275   }
7276 
7277   return true;
7278 }
7279 
7280 // Wrapper for getTargetShuffleMask that ignores the IsUnary result.
7281 static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
7282                                  SmallVectorImpl<SDValue> &Ops,
7283                                  SmallVectorImpl<int> &Mask) {
7284   bool IsUnary;
7285   return getTargetShuffleMask(N, VT, AllowSentinelZero, Ops, Mask, IsUnary);
7286 }
7287 
7288 /// Compute whether each element of a shuffle is zeroable.
7289 ///
7290 /// A "zeroable" vector shuffle element is one which can be lowered to zero.
7291 /// Either it is an undef element in the shuffle mask, the element of the input
7292 /// referenced is undef, or the element of the input referenced is known to be
7293 /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
7294 /// as many lanes with this technique as possible to simplify the remaining
7295 /// shuffle.
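/// For example, with a 4-element mask {0, -1, 4, 7} where V2 is an all-zeros
/// build vector, element 1 is known undef and elements 2 and 3 are known zero.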
7296 static void computeZeroableShuffleElements(ArrayRef<int> Mask,
7297                                            SDValue V1, SDValue V2,
7298                                            APInt &KnownUndef, APInt &KnownZero) {
7299   int Size = Mask.size();
7300   KnownUndef = KnownZero = APInt::getNullValue(Size);
7301 
7302   V1 = peekThroughBitcasts(V1);
7303   V2 = peekThroughBitcasts(V2);
7304 
7305   bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
7306   bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
7307 
7308   int VectorSizeInBits = V1.getValueSizeInBits();
7309   int ScalarSizeInBits = VectorSizeInBits / Size;
7310   assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
7311 
7312   for (int i = 0; i < Size; ++i) {
7313     int M = Mask[i];
7314     // Handle the easy cases.
7315     if (M < 0) {
7316       KnownUndef.setBit(i);
7317       continue;
7318     }
7319     if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
7320       KnownZero.setBit(i);
7321       continue;
7322     }
7323 
7324     // Determine shuffle input and normalize the mask.
7325     SDValue V = M < Size ? V1 : V2;
7326     M %= Size;
7327 
7328     // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
7329     if (V.getOpcode() != ISD::BUILD_VECTOR)
7330       continue;
7331 
7332     // If the BUILD_VECTOR has fewer elements than the mask, then the bitcasted
7333     // portion of the (larger) source element must be UNDEF/ZERO.
7334     if ((Size % V.getNumOperands()) == 0) {
7335       int Scale = Size / V->getNumOperands();
7336       SDValue Op = V.getOperand(M / Scale);
7337       if (Op.isUndef())
7338         KnownUndef.setBit(i);
7339       if (X86::isZeroNode(Op))
7340         KnownZero.setBit(i);
7341       else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
7342         APInt Val = Cst->getAPIntValue();
7343         Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
7344         if (Val == 0)
7345           KnownZero.setBit(i);
7346       } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
7347         APInt Val = Cst->getValueAPF().bitcastToAPInt();
7348         Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
7349         if (Val == 0)
7350           KnownZero.setBit(i);
7351       }
7352       continue;
7353     }
7354 
7355     // If the BUILD_VECTOR has more elements than the mask, then all of the
7356     // (smaller) source elements covered by a mask element must be UNDEF or ZERO.
7357     if ((V.getNumOperands() % Size) == 0) {
7358       int Scale = V->getNumOperands() / Size;
7359       bool AllUndef = true;
7360       bool AllZero = true;
7361       for (int j = 0; j < Scale; ++j) {
7362         SDValue Op = V.getOperand((M * Scale) + j);
7363         AllUndef &= Op.isUndef();
7364         AllZero &= X86::isZeroNode(Op);
7365       }
7366       if (AllUndef)
7367         KnownUndef.setBit(i);
7368       if (AllZero)
7369         KnownZero.setBit(i);
7370       continue;
7371     }
7372   }
7373 }
7374 
7375 /// Decode a target shuffle mask and inputs and see if any values are
7376 /// known to be undef or zero from their inputs.
7377 /// Returns true if the target shuffle mask was decoded.
7378 /// FIXME: Merge this with computeZeroableShuffleElements?
7379 static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
7380                                          SmallVectorImpl<SDValue> &Ops,
7381                                          APInt &KnownUndef, APInt &KnownZero) {
7382   bool IsUnary;
7383   if (!isTargetShuffle(N.getOpcode()))
7384     return false;
7385 
7386   MVT VT = N.getSimpleValueType();
7387   if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
7388     return false;
7389 
7390   int Size = Mask.size();
7391   SDValue V1 = Ops[0];
7392   SDValue V2 = IsUnary ? V1 : Ops[1];
7393   KnownUndef = KnownZero = APInt::getNullValue(Size);
7394 
7395   V1 = peekThroughBitcasts(V1);
7396   V2 = peekThroughBitcasts(V2);
7397 
7398   assert((VT.getSizeInBits() % Size) == 0 &&
7399          "Illegal split of shuffle value type");
7400   unsigned EltSizeInBits = VT.getSizeInBits() / Size;
7401 
7402   // Extract known constant input data.
7403   APInt UndefSrcElts[2];
7404   SmallVector<APInt, 32> SrcEltBits[2];
7405   bool IsSrcConstant[2] = {
7406       getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
7407                                     SrcEltBits[0], true, false),
7408       getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
7409                                     SrcEltBits[1], true, false)};
7410 
7411   for (int i = 0; i < Size; ++i) {
7412     int M = Mask[i];
7413 
7414     // Already decoded as SM_SentinelZero / SM_SentinelUndef.
7415     if (M < 0) {
7416       assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!");
7417       if (SM_SentinelUndef == M)
7418         KnownUndef.setBit(i);
7419       if (SM_SentinelZero == M)
7420         KnownZero.setBit(i);
7421       continue;
7422     }
7423 
7424     // Determine shuffle input and normalize the mask.
7425     unsigned SrcIdx = M / Size;
7426     SDValue V = M < Size ? V1 : V2;
7427     M %= Size;
7428 
7429     // We are referencing an UNDEF input.
7430     if (V.isUndef()) {
7431       KnownUndef.setBit(i);
7432       continue;
7433     }
7434 
7435     // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
7436     // TODO: We currently only set UNDEF for integer types - floats use the same
7437     // registers as vectors and many of the scalar folded loads rely on the
7438     // SCALAR_TO_VECTOR pattern.
7439     if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
7440         (Size % V.getValueType().getVectorNumElements()) == 0) {
7441       int Scale = Size / V.getValueType().getVectorNumElements();
7442       int Idx = M / Scale;
7443       if (Idx != 0 && !VT.isFloatingPoint())
7444         KnownUndef.setBit(i);
7445       else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
7446         KnownZero.setBit(i);
7447       continue;
7448     }
7449 
7450     // INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF
7451     // base vectors.
7452     if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
7453       SDValue Vec = V.getOperand(0);
7454       int NumVecElts = Vec.getValueType().getVectorNumElements();
7455       if (Vec.isUndef() && Size == NumVecElts) {
7456         int Idx = V.getConstantOperandVal(2);
7457         int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements();
7458         if (M < Idx || (Idx + NumSubElts) <= M)
7459           KnownUndef.setBit(i);
7460       }
7461       continue;
7462     }
7463 
7464     // Attempt to extract from the source's constant bits.
7465     if (IsSrcConstant[SrcIdx]) {
7466       if (UndefSrcElts[SrcIdx][M])
7467         KnownUndef.setBit(i);
7468       else if (SrcEltBits[SrcIdx][M] == 0)
7469         KnownZero.setBit(i);
7470     }
7471   }
7472 
7473   assert(VT.getVectorNumElements() == (unsigned)Size &&
7474          "Different mask size from vector size!");
7475   return true;
7476 }
7477 
7478 // Replace target shuffle mask elements with known undef/zero sentinels.
7479 static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
7480                                               const APInt &KnownUndef,
7481                                               const APInt &KnownZero,
7482                                               bool ResolveKnownZeros = true) {
7483   unsigned NumElts = Mask.size();
7484   assert(KnownUndef.getBitWidth() == NumElts &&
7485          KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");
7486 
7487   for (unsigned i = 0; i != NumElts; ++i) {
7488     if (KnownUndef[i])
7489       Mask[i] = SM_SentinelUndef;
7490     else if (ResolveKnownZeros && KnownZero[i])
7491       Mask[i] = SM_SentinelZero;
7492   }
7493 }
7494 
7495 // Extract target shuffle mask sentinel elements to known undef/zero bitmasks.
7496 static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask,
7497                                               APInt &KnownUndef,
7498                                               APInt &KnownZero) {
7499   unsigned NumElts = Mask.size();
7500   KnownUndef = KnownZero = APInt::getNullValue(NumElts);
7501 
7502   for (unsigned i = 0; i != NumElts; ++i) {
7503     int M = Mask[i];
7504     if (SM_SentinelUndef == M)
7505       KnownUndef.setBit(i);
7506     if (SM_SentinelZero == M)
7507       KnownZero.setBit(i);
7508   }
7509 }
7510 
7511 // Forward declaration (for getFauxShuffleMask recursive check).
7512 // TODO: Use DemandedElts variant.
7513 static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
7514                                    SmallVectorImpl<int> &Mask,
7515                                    const SelectionDAG &DAG, unsigned Depth,
7516                                    bool ResolveKnownElts);
7517 
7518 // Attempt to decode ops that could be represented as a shuffle mask.
7519 // The decoded shuffle mask may contain a different number of elements than
7520 // the destination value type.
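// For example, a 128-bit VSHLI by a whole number of bytes is modelled below as
// a 16-element byte shuffle, regardless of the node's own element count.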
7521 static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
7522                                SmallVectorImpl<int> &Mask,
7523                                SmallVectorImpl<SDValue> &Ops,
7524                                const SelectionDAG &DAG, unsigned Depth,
7525                                bool ResolveKnownElts) {
7526   Mask.clear();
7527   Ops.clear();
7528 
7529   MVT VT = N.getSimpleValueType();
7530   unsigned NumElts = VT.getVectorNumElements();
7531   unsigned NumSizeInBits = VT.getSizeInBits();
7532   unsigned NumBitsPerElt = VT.getScalarSizeInBits();
7533   if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
7534     return false;
7535   assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
7536   unsigned NumSizeInBytes = NumSizeInBits / 8;
7537   unsigned NumBytesPerElt = NumBitsPerElt / 8;
7538 
7539   unsigned Opcode = N.getOpcode();
7540   switch (Opcode) {
7541   case ISD::VECTOR_SHUFFLE: {
7542     // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
7543     ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
7544     if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
7545       Mask.append(ShuffleMask.begin(), ShuffleMask.end());
7546       Ops.push_back(N.getOperand(0));
7547       Ops.push_back(N.getOperand(1));
7548       return true;
7549     }
7550     return false;
7551   }
7552   case ISD::AND:
7553   case X86ISD::ANDNP: {
7554     // Attempt to decode as a per-byte mask.
7555     APInt UndefElts;
7556     SmallVector<APInt, 32> EltBits;
7557     SDValue N0 = N.getOperand(0);
7558     SDValue N1 = N.getOperand(1);
7559     bool IsAndN = (X86ISD::ANDNP == Opcode);
7560     uint64_t ZeroMask = IsAndN ? 255 : 0;
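    // AND zeroes a byte when its mask byte is 0x00; ANDNP inverts its first
    // operand, so there a mask byte of 0xFF zeroes the result byte instead.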
7561     if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
7562       return false;
7563     for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
7564       if (UndefElts[i]) {
7565         Mask.push_back(SM_SentinelUndef);
7566         continue;
7567       }
7568       const APInt &ByteBits = EltBits[i];
7569       if (ByteBits != 0 && ByteBits != 255)
7570         return false;
7571       Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
7572     }
7573     Ops.push_back(IsAndN ? N1 : N0);
7574     return true;
7575   }
7576   case ISD::OR: {
7577     // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
7578     // is a valid shuffle index.
7579     SDValue N0 = peekThroughBitcasts(N.getOperand(0));
7580     SDValue N1 = peekThroughBitcasts(N.getOperand(1));
7581     if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
7582       return false;
7583     SmallVector<int, 64> SrcMask0, SrcMask1;
7584     SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
7585     if (!getTargetShuffleInputs(N0, SrcInputs0, SrcMask0, DAG, Depth + 1,
7586                                 true) ||
7587         !getTargetShuffleInputs(N1, SrcInputs1, SrcMask1, DAG, Depth + 1,
7588                                 true))
7589       return false;
7590 
7591     size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
7592     SmallVector<int, 64> Mask0, Mask1;
7593     narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
7594     narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
7595     for (int i = 0; i != (int)MaskSize; ++i) {
7596       // NOTE: Don't handle SM_SentinelUndef, as we can end up in infinite
7597       // loops converting between OR and BLEND shuffles due to
7598       // canWidenShuffleElements merging away undef elements, meaning we
7599       // fail to recognise the OR as the undef element isn't known zero.
7600       if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
7601         Mask.push_back(SM_SentinelZero);
7602       else if (Mask1[i] == SM_SentinelZero)
7603         Mask.push_back(i);
7604       else if (Mask0[i] == SM_SentinelZero)
7605         Mask.push_back(i + MaskSize);
7606       else
7607         return false;
7608     }
7609     Ops.push_back(N0);
7610     Ops.push_back(N1);
7611     return true;
7612   }
7613   case ISD::INSERT_SUBVECTOR: {
7614     SDValue Src = N.getOperand(0);
7615     SDValue Sub = N.getOperand(1);
7616     EVT SubVT = Sub.getValueType();
7617     unsigned NumSubElts = SubVT.getVectorNumElements();
7618     if (!N->isOnlyUserOf(Sub.getNode()))
7619       return false;
7620     uint64_t InsertIdx = N.getConstantOperandVal(2);
7621     // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
7622     if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
7623         Sub.getOperand(0).getValueType() == VT) {
7624       uint64_t ExtractIdx = Sub.getConstantOperandVal(1);
7625       for (int i = 0; i != (int)NumElts; ++i)
7626         Mask.push_back(i);
7627       for (int i = 0; i != (int)NumSubElts; ++i)
7628         Mask[InsertIdx + i] = NumElts + ExtractIdx + i;
7629       Ops.push_back(Src);
7630       Ops.push_back(Sub.getOperand(0));
7631       return true;
7632     }
7633     // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
7634     SmallVector<int, 64> SubMask;
7635     SmallVector<SDValue, 2> SubInputs;
7636     if (!getTargetShuffleInputs(peekThroughOneUseBitcasts(Sub), SubInputs,
7637                                 SubMask, DAG, Depth + 1, ResolveKnownElts))
7638       return false;
7639 
7640     // Subvector shuffle inputs must not be larger than the subvector.
7641     if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) {
7642           return SubVT.getFixedSizeInBits() <
7643                  SubInput.getValueSizeInBits().getFixedSize();
7644         }))
7645       return false;
7646 
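    // The recursed submask may use a different element granularity than the
    // subvector type; rescale whichever side is coarser so they line up.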
7647     if (SubMask.size() != NumSubElts) {
7648       assert(((SubMask.size() % NumSubElts) == 0 ||
7649               (NumSubElts % SubMask.size()) == 0) && "Illegal submask scale");
7650       if ((NumSubElts % SubMask.size()) == 0) {
7651         int Scale = NumSubElts / SubMask.size();
7652         SmallVector<int, 64> ScaledSubMask;
7653         narrowShuffleMaskElts(Scale, SubMask, ScaledSubMask);
7654         SubMask = ScaledSubMask;
7655       } else {
7656         int Scale = SubMask.size() / NumSubElts;
7657         NumSubElts = SubMask.size();
7658         NumElts *= Scale;
7659         InsertIdx *= Scale;
7660       }
7661     }
7662     Ops.push_back(Src);
7663     Ops.append(SubInputs.begin(), SubInputs.end());
7664     if (ISD::isBuildVectorAllZeros(Src.getNode()))
7665       Mask.append(NumElts, SM_SentinelZero);
7666     else
7667       for (int i = 0; i != (int)NumElts; ++i)
7668         Mask.push_back(i);
7669     for (int i = 0; i != (int)NumSubElts; ++i) {
7670       int M = SubMask[i];
7671       if (0 <= M) {
7672         int InputIdx = M / NumSubElts;
7673         M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
7674       }
7675       Mask[i + InsertIdx] = M;
7676     }
7677     return true;
7678   }
7679   case X86ISD::PINSRB:
7680   case X86ISD::PINSRW:
7681   case ISD::SCALAR_TO_VECTOR:
7682   case ISD::INSERT_VECTOR_ELT: {
7683     // Match against an insert_vector_elt/scalar_to_vector of an extract from a
7684     // vector, for matching src/dst vector types.
7685     SDValue Scl = N.getOperand(Opcode == ISD::SCALAR_TO_VECTOR ? 0 : 1);
7686 
7687     unsigned DstIdx = 0;
7688     if (Opcode != ISD::SCALAR_TO_VECTOR) {
7689       // Check we have an in-range constant insertion index.
7690       if (!isa<ConstantSDNode>(N.getOperand(2)) ||
7691           N.getConstantOperandAPInt(2).uge(NumElts))
7692         return false;
7693       DstIdx = N.getConstantOperandVal(2);
7694 
7695       // Attempt to recognise an INSERT*(VEC, 0, DstIdx) shuffle pattern.
7696       if (X86::isZeroNode(Scl)) {
7697         Ops.push_back(N.getOperand(0));
7698         for (unsigned i = 0; i != NumElts; ++i)
7699           Mask.push_back(i == DstIdx ? SM_SentinelZero : (int)i);
7700         return true;
7701       }
7702     }
7703 
7704     // Peek through trunc/aext/zext.
7705     // TODO: aext shouldn't require SM_SentinelZero padding.
7706     // TODO: handle shift of scalars.
7707     unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
7708     while (Scl.getOpcode() == ISD::TRUNCATE ||
7709            Scl.getOpcode() == ISD::ANY_EXTEND ||
7710            Scl.getOpcode() == ISD::ZERO_EXTEND) {
7711       Scl = Scl.getOperand(0);
7712       MinBitsPerElt =
7713           std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits());
7714     }
7715     if ((MinBitsPerElt % 8) != 0)
7716       return false;
7717 
7718     // Attempt to find the source vector the scalar was extracted from.
7719     SDValue SrcExtract;
7720     if ((Scl.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
7721          Scl.getOpcode() == X86ISD::PEXTRW ||
7722          Scl.getOpcode() == X86ISD::PEXTRB) &&
7723         Scl.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
7724       SrcExtract = Scl;
7725     }
7726     if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
7727       return false;
7728 
7729     SDValue SrcVec = SrcExtract.getOperand(0);
7730     EVT SrcVT = SrcVec.getValueType();
7731     if (!SrcVT.getScalarType().isByteSized())
7732       return false;
7733     unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
7734     unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
7735     unsigned DstByte = DstIdx * NumBytesPerElt;
7736     MinBitsPerElt =
7737         std::min<unsigned>(MinBitsPerElt, SrcVT.getScalarSizeInBits());
7738 
7739     // Create 'identity' byte level shuffle mask and then add inserted bytes.
7740     if (Opcode == ISD::SCALAR_TO_VECTOR) {
7741       Ops.push_back(SrcVec);
7742       Mask.append(NumSizeInBytes, SM_SentinelUndef);
7743     } else {
7744       Ops.push_back(SrcVec);
7745       Ops.push_back(N.getOperand(0));
7746       for (int i = 0; i != (int)NumSizeInBytes; ++i)
7747         Mask.push_back(NumSizeInBytes + i);
7748     }
7749 
7750     unsigned MinBytesPerElts = MinBitsPerElt / 8;
7751     MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
7752     for (unsigned i = 0; i != MinBytesPerElts; ++i)
7753       Mask[DstByte + i] = SrcByte + i;
7754     for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
7755       Mask[DstByte + i] = SM_SentinelZero;
7756     return true;
7757   }
7758   case X86ISD::PACKSS:
7759   case X86ISD::PACKUS: {
7760     SDValue N0 = N.getOperand(0);
7761     SDValue N1 = N.getOperand(1);
7762     assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
7763            N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
7764            "Unexpected input value type");
7765 
7766     APInt EltsLHS, EltsRHS;
7767     getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);
7768 
7769     // If we know input saturation won't happen (or we don't care about
7770     // particular lanes), we can treat this as a truncation shuffle.
7771     bool Offset0 = false, Offset1 = false;
7772     if (Opcode == X86ISD::PACKSS) {
7773       if ((!(N0.isUndef() || EltsLHS.isNullValue()) &&
7774            DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
7775           (!(N1.isUndef() || EltsRHS.isNullValue()) &&
7776            DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
7777         return false;
7778       // We can't easily fold ASHR into a shuffle, but if it was feeding a
7779       // PACKSS then it was likely being used for sign-extension for a
7780       // truncation, so just peek through and adjust the mask accordingly.
7781       if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) &&
7782           N0.getConstantOperandAPInt(1) == NumBitsPerElt) {
7783         Offset0 = true;
7784         N0 = N0.getOperand(0);
7785       }
7786       if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) &&
7787           N1.getConstantOperandAPInt(1) == NumBitsPerElt) {
7788         Offset1 = true;
7789         N1 = N1.getOperand(0);
7790       }
7791     } else {
7792       APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
7793       if ((!(N0.isUndef() || EltsLHS.isNullValue()) &&
7794            !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
7795           (!(N1.isUndef() || EltsRHS.isNullValue()) &&
7796            !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
7797         return false;
7798     }
7799 
7800     bool IsUnary = (N0 == N1);
7801 
7802     Ops.push_back(N0);
7803     if (!IsUnary)
7804       Ops.push_back(N1);
7805 
7806     createPackShuffleMask(VT, Mask, IsUnary);
7807 
7808     if (Offset0 || Offset1) {
7809       for (int &M : Mask)
7810         if ((Offset0 && isInRange(M, 0, NumElts)) ||
7811             (Offset1 && isInRange(M, NumElts, 2 * NumElts)))
7812           ++M;
7813     }
7814     return true;
7815   }
7816   case X86ISD::VTRUNC: {
7817     SDValue Src = N.getOperand(0);
7818     EVT SrcVT = Src.getValueType();
7819     // Truncated source must be a simple vector.
7820     if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
7821         (SrcVT.getScalarSizeInBits() % 8) != 0)
7822       return false;
7823     unsigned NumSrcElts = SrcVT.getVectorNumElements();
7824     unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
7825     unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
7826     assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation");
7827     for (unsigned i = 0; i != NumSrcElts; ++i)
7828       Mask.push_back(i * Scale);
7829     Mask.append(NumElts - NumSrcElts, SM_SentinelZero);
7830     Ops.push_back(Src);
7831     return true;
7832   }
7833   case X86ISD::VSHLI:
7834   case X86ISD::VSRLI: {
7835     uint64_t ShiftVal = N.getConstantOperandVal(1);
7836     // Out of range bit shifts are guaranteed to be zero.
7837     if (NumBitsPerElt <= ShiftVal) {
7838       Mask.append(NumElts, SM_SentinelZero);
7839       return true;
7840     }
7841 
7842     // We can only decode 'whole byte' bit shifts as shuffles.
7843     if ((ShiftVal % 8) != 0)
7844       break;
7845 
7846     uint64_t ByteShift = ShiftVal / 8;
7847     Ops.push_back(N.getOperand(0));
7848 
7849     // Clear mask to all zeros and insert the shifted byte indices.
7850     Mask.append(NumSizeInBytes, SM_SentinelZero);
7851 
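    // A left shift moves data towards higher byte indices within each element
    // (little endian), a right shift towards lower indices; the bytes vacated
    // by the shift stay marked zero from the append above.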
7852     if (X86ISD::VSHLI == Opcode) {
7853       for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
7854         for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
7855           Mask[i + j] = i + j - ByteShift;
7856     } else {
7857       for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
7858         for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
7859           Mask[i + j - ByteShift] = i + j;
7860     }
7861     return true;
7862   }
7863   case X86ISD::VROTLI:
7864   case X86ISD::VROTRI: {
7865     // We can only decode 'whole byte' bit rotates as shuffles.
7866     uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
7867     if ((RotateVal % 8) != 0)
7868       return false;
7869     Ops.push_back(N.getOperand(0));
7870     int Offset = RotateVal / 8;
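    // Model both rotates as a right-rotate: a left rotate by K bytes equals a
    // right rotate by (NumBytesPerElt - K) bytes.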
7871     Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);
7872     for (int i = 0; i != (int)NumElts; ++i) {
7873       int BaseIdx = i * NumBytesPerElt;
7874       for (int j = 0; j != (int)NumBytesPerElt; ++j) {
7875         Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
7876       }
7877     }
7878     return true;
7879   }
7880   case X86ISD::VBROADCAST: {
7881     SDValue Src = N.getOperand(0);
7882     if (!Src.getSimpleValueType().isVector()) {
7883       if (Src.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7884           !isNullConstant(Src.getOperand(1)) ||
7885           Src.getOperand(0).getValueType().getScalarType() !=
7886               VT.getScalarType())
7887         return false;
7888       Src = Src.getOperand(0);
7889     }
7890     Ops.push_back(Src);
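    // Every output element reads element 0 of the source vector.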
7891     Mask.append(NumElts, 0);
7892     return true;
7893   }
7894   case ISD::ZERO_EXTEND:
7895   case ISD::ANY_EXTEND:
7896   case ISD::ZERO_EXTEND_VECTOR_INREG:
7897   case ISD::ANY_EXTEND_VECTOR_INREG: {
7898     SDValue Src = N.getOperand(0);
7899     EVT SrcVT = Src.getValueType();
7900 
7901     // Extended source must be a simple vector.
7902     if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
7903         (SrcVT.getScalarSizeInBits() % 8) != 0)
7904       return false;
7905 
7906     bool IsAnyExtend =
7907         (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
7908     DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
7909                          IsAnyExtend, Mask);
7910     Ops.push_back(Src);
7911     return true;
7912   }
7913   }
7914 
7915   return false;
7916 }
7917 
7918 /// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
7919 static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
7920                                               SmallVectorImpl<int> &Mask) {
7921   int MaskWidth = Mask.size();
7922   SmallVector<SDValue, 16> UsedInputs;
7923   for (int i = 0, e = Inputs.size(); i < e; ++i) {
7924     int lo = UsedInputs.size() * MaskWidth;
7925     int hi = lo + MaskWidth;
7926 
7927     // Strip UNDEF input usage.
7928     if (Inputs[i].isUndef())
7929       for (int &M : Mask)
7930         if ((lo <= M) && (M < hi))
7931           M = SM_SentinelUndef;
7932 
7933     // Check for unused inputs.
7934     if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
7935       for (int &M : Mask)
7936         if (lo <= M)
7937           M -= MaskWidth;
7938       continue;
7939     }
7940 
7941     // Check for repeated inputs.
7942     bool IsRepeat = false;
7943     for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
7944       if (UsedInputs[j] != Inputs[i])
7945         continue;
7946       for (int &M : Mask)
7947         if (lo <= M)
7948           M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
7949       IsRepeat = true;
7950       break;
7951     }
7952     if (IsRepeat)
7953       continue;
7954 
7955     UsedInputs.push_back(Inputs[i]);
7956   }
7957   Inputs = UsedInputs;
7958 }
7959 
7960 /// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs
7961 /// and then sets the SM_SentinelUndef and SM_SentinelZero values.
7962 /// Returns true if the target shuffle mask was decoded.
7963 static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
7964                                    SmallVectorImpl<SDValue> &Inputs,
7965                                    SmallVectorImpl<int> &Mask,
7966                                    APInt &KnownUndef, APInt &KnownZero,
7967                                    const SelectionDAG &DAG, unsigned Depth,
7968                                    bool ResolveKnownElts) {
7969   EVT VT = Op.getValueType();
7970   if (!VT.isSimple() || !VT.isVector())
7971     return false;
7972 
7973   if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) {
7974     if (ResolveKnownElts)
7975       resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero);
7976     return true;
7977   }
7978   if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,
7979                          ResolveKnownElts)) {
7980     resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
7981     return true;
7982   }
7983   return false;
7984 }
7985 
7986 static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
7987                                    SmallVectorImpl<int> &Mask,
7988                                    const SelectionDAG &DAG, unsigned Depth = 0,
7989                                    bool ResolveKnownElts = true) {
7990   EVT VT = Op.getValueType();
7991   if (!VT.isSimple() || !VT.isVector())
7992     return false;
7993 
7994   APInt KnownUndef, KnownZero;
7995   unsigned NumElts = Op.getValueType().getVectorNumElements();
7996   APInt DemandedElts = APInt::getAllOnesValue(NumElts);
7997   return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef,
7998                                 KnownZero, DAG, Depth, ResolveKnownElts);
7999 }
8000 
8001 // Attempt to create a scalar/subvector broadcast from the base MemSDNode.
8002 static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT,
8003                                  EVT MemVT, MemSDNode *Mem, unsigned Offset,
8004                                  SelectionDAG &DAG) {
8005   assert((Opcode == X86ISD::VBROADCAST_LOAD ||
8006           Opcode == X86ISD::SUBV_BROADCAST_LOAD) &&
8007          "Unknown broadcast load type");
8008 
8009   // Ensure this is a simple (non-atomic, non-volatile), temporal read memop.
8010   if (!Mem || !Mem->readMem() || !Mem->isSimple() || Mem->isNonTemporal())
8011     return SDValue();
8012 
8013   SDValue Ptr =
8014       DAG.getMemBasePlusOffset(Mem->getBasePtr(), TypeSize::Fixed(Offset), DL);
8015   SDVTList Tys = DAG.getVTList(VT, MVT::Other);
8016   SDValue Ops[] = {Mem->getChain(), Ptr};
8017   SDValue BcstLd = DAG.getMemIntrinsicNode(
8018       Opcode, DL, Tys, Ops, MemVT,
8019       DAG.getMachineFunction().getMachineMemOperand(
8020           Mem->getMemOperand(), Offset, MemVT.getStoreSize()));
8021   DAG.makeEquivalentMemoryOrdering(SDValue(Mem, 1), BcstLd.getValue(1));
8022   return BcstLd;
8023 }
8024 
8025 /// Returns the scalar element that will make up the i'th
8026 /// element of the result of the vector shuffle.
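/// Returns an empty SDValue if the scalar cannot be determined (for example,
/// once the recursion depth limit is reached).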
8027 static SDValue getShuffleScalarElt(SDValue Op, unsigned Index,
8028                                    SelectionDAG &DAG, unsigned Depth) {
8029   if (Depth >= SelectionDAG::MaxRecursionDepth)
8030     return SDValue(); // Limit search depth.
8031 
8032   EVT VT = Op.getValueType();
8033   unsigned Opcode = Op.getOpcode();
8034   unsigned NumElems = VT.getVectorNumElements();
8035 
8036   // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
8037   if (auto *SV = dyn_cast<ShuffleVectorSDNode>(Op)) {
8038     int Elt = SV->getMaskElt(Index);
8039 
8040     if (Elt < 0)
8041       return DAG.getUNDEF(VT.getVectorElementType());
8042 
8043     SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1);
8044     return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
8045   }
8046 
8047   // Recurse into target specific vector shuffles to find scalars.
8048   if (isTargetShuffle(Opcode)) {
8049     MVT ShufVT = VT.getSimpleVT();
8050     MVT ShufSVT = ShufVT.getVectorElementType();
8051     int NumElems = (int)ShufVT.getVectorNumElements();
8052     SmallVector<int, 16> ShuffleMask;
8053     SmallVector<SDValue, 16> ShuffleOps;
8054     if (!getTargetShuffleMask(Op.getNode(), ShufVT, true, ShuffleOps,
8055                               ShuffleMask))
8056       return SDValue();
8057 
8058     int Elt = ShuffleMask[Index];
8059     if (Elt == SM_SentinelZero)
8060       return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(Op), ShufSVT)
8061                                  : DAG.getConstantFP(+0.0, SDLoc(Op), ShufSVT);
8062     if (Elt == SM_SentinelUndef)
8063       return DAG.getUNDEF(ShufSVT);
8064 
8065     assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range");
8066     SDValue Src = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
8067     return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
8068   }
8069 
8070   // Recurse into insert_subvector base/sub vector to find scalars.
8071   if (Opcode == ISD::INSERT_SUBVECTOR) {
8072     SDValue Vec = Op.getOperand(0);
8073     SDValue Sub = Op.getOperand(1);
8074     uint64_t SubIdx = Op.getConstantOperandVal(2);
8075     unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
8076 
8077     if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
8078       return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1);
8079     return getShuffleScalarElt(Vec, Index, DAG, Depth + 1);
8080   }
8081 
8082   // Recurse into concat_vectors sub vector to find scalars.
8083   if (Opcode == ISD::CONCAT_VECTORS) {
8084     EVT SubVT = Op.getOperand(0).getValueType();
8085     unsigned NumSubElts = SubVT.getVectorNumElements();
8086     uint64_t SubIdx = Index / NumSubElts;
8087     uint64_t SubElt = Index % NumSubElts;
8088     return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, Depth + 1);
8089   }
8090 
8091   // Recurse into extract_subvector src vector to find scalars.
8092   if (Opcode == ISD::EXTRACT_SUBVECTOR) {
8093     SDValue Src = Op.getOperand(0);
8094     uint64_t SrcIdx = Op.getConstantOperandVal(1);
8095     return getShuffleScalarElt(Src, Index + SrcIdx, DAG, Depth + 1);
8096   }
8097 
8098   // We only peek through bitcasts of the same vector width.
8099   if (Opcode == ISD::BITCAST) {
8100     SDValue Src = Op.getOperand(0);
8101     EVT SrcVT = Src.getValueType();
8102     if (SrcVT.isVector() && SrcVT.getVectorNumElements() == NumElems)
8103       return getShuffleScalarElt(Src, Index, DAG, Depth + 1);
8104     return SDValue();
8105   }
8106 
8107   // Actual nodes that may contain scalar elements
8108 
8109   // For insert_vector_elt - either return the index matching scalar or recurse
8110   // into the base vector.
8111   if (Opcode == ISD::INSERT_VECTOR_ELT &&
8112       isa<ConstantSDNode>(Op.getOperand(2))) {
8113     if (Op.getConstantOperandAPInt(2) == Index)
8114       return Op.getOperand(1);
8115     return getShuffleScalarElt(Op.getOperand(0), Index, DAG, Depth + 1);
8116   }
8117 
8118   if (Opcode == ISD::SCALAR_TO_VECTOR)
8119     return (Index == 0) ? Op.getOperand(0)
8120                         : DAG.getUNDEF(VT.getVectorElementType());
8121 
8122   if (Opcode == ISD::BUILD_VECTOR)
8123     return Op.getOperand(Index);
8124 
8125   return SDValue();
8126 }
8127 
8128 // Use PINSRB/PINSRW/PINSRD to create a build vector.
8129 static SDValue LowerBuildVectorAsInsert(SDValue Op, const APInt &NonZeroMask,
8130                                         unsigned NumNonZero, unsigned NumZero,
8131                                         SelectionDAG &DAG,
8132                                         const X86Subtarget &Subtarget) {
8133   MVT VT = Op.getSimpleValueType();
8134   unsigned NumElts = VT.getVectorNumElements();
8135   assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
8136           ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
8137          "Illegal vector insertion");
8138 
8139   SDLoc dl(Op);
8140   SDValue V;
8141   bool First = true;
8142 
8143   for (unsigned i = 0; i < NumElts; ++i) {
8144     bool IsNonZero = NonZeroMask[i];
8145     if (!IsNonZero)
8146       continue;
8147 
8148     // If the build vector contains zeros or our first insertion is not at
8149     // index 0, insert into a zero vector to break any register dependency;
8150     // otherwise use SCALAR_TO_VECTOR.
8151     if (First) {
8152       First = false;
8153       if (NumZero || 0 != i)
8154         V = getZeroVector(VT, Subtarget, DAG, dl);
8155       else {
8156         assert(0 == i && "Expected insertion into zero-index");
8157         V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
8158         V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
8159         V = DAG.getBitcast(VT, V);
8160         continue;
8161       }
8162     }
8163     V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V, Op.getOperand(i),
8164                     DAG.getIntPtrConstant(i, dl));
8165   }
8166 
8167   return V;
8168 }
8169 
8170 /// Custom lower build_vector of v16i8.
8171 static SDValue LowerBuildVectorv16i8(SDValue Op, const APInt &NonZeroMask,
8172                                      unsigned NumNonZero, unsigned NumZero,
8173                                      SelectionDAG &DAG,
8174                                      const X86Subtarget &Subtarget) {
8175   if (NumNonZero > 8 && !Subtarget.hasSSE41())
8176     return SDValue();
8177 
8178   // SSE4.1 - use PINSRB to insert each byte directly.
8179   if (Subtarget.hasSSE41())
8180     return LowerBuildVectorAsInsert(Op, NonZeroMask, NumNonZero, NumZero, DAG,
8181                                     Subtarget);
8182 
8183   SDLoc dl(Op);
8184   SDValue V;
8185 
8186   // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
8187   for (unsigned i = 0; i < 16; i += 2) {
8188     bool ThisIsNonZero = NonZeroMask[i];
8189     bool NextIsNonZero = NonZeroMask[i + 1];
8190     if (!ThisIsNonZero && !NextIsNonZero)
8191       continue;
8192 
8193     // FIXME: Investigate combining the first 4 bytes as an i32 instead.
8194     SDValue Elt;
8195     if (ThisIsNonZero) {
8196       if (NumZero || NextIsNonZero)
8197         Elt = DAG.getZExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
8198       else
8199         Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
8200     }
8201 
8202     if (NextIsNonZero) {
8203       SDValue NextElt = Op.getOperand(i + 1);
8204       if (i == 0 && NumZero)
8205         NextElt = DAG.getZExtOrTrunc(NextElt, dl, MVT::i32);
8206       else
8207         NextElt = DAG.getAnyExtOrTrunc(NextElt, dl, MVT::i32);
8208       NextElt = DAG.getNode(ISD::SHL, dl, MVT::i32, NextElt,
8209                             DAG.getConstant(8, dl, MVT::i8));
8210       if (ThisIsNonZero)
8211         Elt = DAG.getNode(ISD::OR, dl, MVT::i32, NextElt, Elt);
8212       else
8213         Elt = NextElt;
8214     }
8215 
8216     // If our first insertion is not the first index or zeros are needed, then
8217     // insert into zero vector. Otherwise, use SCALAR_TO_VECTOR (leaves high
8218     // elements undefined).
8219     if (!V) {
8220       if (i != 0 || NumZero)
8221         V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
8222       else {
8223         V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Elt);
8224         V = DAG.getBitcast(MVT::v8i16, V);
8225         continue;
8226       }
8227     }
8228     Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Elt);
8229     V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, Elt,
8230                     DAG.getIntPtrConstant(i / 2, dl));
8231   }
8232 
8233   return DAG.getBitcast(MVT::v16i8, V);
8234 }
8235 
8236 /// Custom lower build_vector of v8i16.
8237 static SDValue LowerBuildVectorv8i16(SDValue Op, const APInt &NonZeroMask,
8238                                      unsigned NumNonZero, unsigned NumZero,
8239                                      SelectionDAG &DAG,
8240                                      const X86Subtarget &Subtarget) {
8241   if (NumNonZero > 4 && !Subtarget.hasSSE41())
8242     return SDValue();
8243 
8244   // Use PINSRW to insert each element directly.
8245   return LowerBuildVectorAsInsert(Op, NonZeroMask, NumNonZero, NumZero, DAG,
8246                                   Subtarget);
8247 }
8248 
8249 /// Custom lower build_vector of v4i32 or v4f32.
8250 static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
8251                                      const X86Subtarget &Subtarget) {
8252   // If this is a splat of a pair of elements, use MOVDDUP (unless the target
8253   // has XOP; in that case defer lowering to potentially use VPERMIL2PS).
8254   // Because we're creating a less complicated build vector here, we may enable
8255   // further folding of the MOVDDUP via shuffle transforms.
8256   if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
8257       Op.getOperand(0) == Op.getOperand(2) &&
8258       Op.getOperand(1) == Op.getOperand(3) &&
8259       Op.getOperand(0) != Op.getOperand(1)) {
8260     SDLoc DL(Op);
8261     MVT VT = Op.getSimpleValueType();
8262     MVT EltVT = VT.getVectorElementType();
8263     // Create a new build vector with the first 2 elements followed by undef
8264     // padding, bitcast to v2f64, duplicate, and bitcast back.
8265     SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
8266                        DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
8267     SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
8268     SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
8269     return DAG.getBitcast(VT, Dup);
8270   }
8271 
8272   // Find all zeroable elements.
8273   std::bitset<4> Zeroable, Undefs;
8274   for (int i = 0; i < 4; ++i) {
8275     SDValue Elt = Op.getOperand(i);
8276     Undefs[i] = Elt.isUndef();
8277     Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
8278   }
8279   assert(Zeroable.size() - Zeroable.count() > 1 &&
8280          "We expect at least two non-zero elements!");
8281 
8282   // We only know how to deal with build_vector nodes where elements are either
8283   // zeroable or extract_vector_elt with constant index.
8284   SDValue FirstNonZero;
8285   unsigned FirstNonZeroIdx;
8286   for (unsigned i = 0; i < 4; ++i) {
8287     if (Zeroable[i])
8288       continue;
8289     SDValue Elt = Op.getOperand(i);
8290     if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8291         !isa<ConstantSDNode>(Elt.getOperand(1)))
8292       return SDValue();
8293     // Make sure that this node is extracting from a 128-bit vector.
8294     MVT VT = Elt.getOperand(0).getSimpleValueType();
8295     if (!VT.is128BitVector())
8296       return SDValue();
8297     if (!FirstNonZero.getNode()) {
8298       FirstNonZero = Elt;
8299       FirstNonZeroIdx = i;
8300     }
8301   }
8302 
8303   assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
8304   SDValue V1 = FirstNonZero.getOperand(0);
8305   MVT VT = V1.getSimpleValueType();
8306 
8307   // See if this build_vector can be lowered as a blend with zero.
8308   SDValue Elt;
8309   unsigned EltMaskIdx, EltIdx;
8310   int Mask[4];
8311   for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
8312     if (Zeroable[EltIdx]) {
8313       // The zero vector will be on the right hand side.
8314       Mask[EltIdx] = EltIdx+4;
8315       continue;
8316     }
8317 
8318     Elt = Op->getOperand(EltIdx);
8319     // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
8320     EltMaskIdx = Elt.getConstantOperandVal(1);
8321     if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
8322       break;
8323     Mask[EltIdx] = EltIdx;
8324   }
8325 
8326   if (EltIdx == 4) {
8327     // Let the shuffle legalizer deal with blend operations.
8328     SDValue VZeroOrUndef = (Zeroable == Undefs)
8329                                ? DAG.getUNDEF(VT)
8330                                : getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
8331     if (V1.getSimpleValueType() != VT)
8332       V1 = DAG.getBitcast(VT, V1);
8333     return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
8334   }
8335 
8336   // See if we can lower this build_vector to a INSERTPS.
8337   if (!Subtarget.hasSSE41())
8338     return SDValue();
8339 
8340   SDValue V2 = Elt.getOperand(0);
8341   if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
8342     V1 = SDValue();
8343 
8344   bool CanFold = true;
8345   for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
8346     if (Zeroable[i])
8347       continue;
8348 
8349     SDValue Current = Op->getOperand(i);
8350     SDValue SrcVector = Current->getOperand(0);
8351     if (!V1.getNode())
8352       V1 = SrcVector;
8353     CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
8354   }
8355 
8356   if (!CanFold)
8357     return SDValue();
8358 
8359   assert(V1.getNode() && "Expected at least two non-zero elements!");
8360   if (V1.getSimpleValueType() != MVT::v4f32)
8361     V1 = DAG.getBitcast(MVT::v4f32, V1);
8362   if (V2.getSimpleValueType() != MVT::v4f32)
8363     V2 = DAG.getBitcast(MVT::v4f32, V2);
8364 
8365   // Ok, we can emit an INSERTPS instruction.
8366   unsigned ZMask = Zeroable.to_ulong();
8367 
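  // For reference, the INSERTPS immediate is encoded as:
  //   bits [7:6] = CountS (element index selected from the source V2)
  //   bits [5:4] = CountD (destination element index in V1)
  //   bits [3:0] = ZMask  (destination elements forced to zero)
  // e.g. copying element 2 of V2 into element 1 of V1 while zeroing element 3
  // uses the immediate 0b10011000 (0x98).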
8368   unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
8369   assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
8370   SDLoc DL(Op);
8371   SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
8372                                DAG.getIntPtrConstant(InsertPSMask, DL, true));
8373   return DAG.getBitcast(VT, Result);
8374 }
8375 
8376 /// Return a vector logical shift node.
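/// The shift is emitted as a whole-register byte shift (VSHLDQ/VSRLDQ, i.e.
/// PSLLDQ/PSRLDQ) on a v16i8 bitcast of \p SrcOp, so \p NumBits must be a
/// multiple of 8.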
8377 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
8378                          SelectionDAG &DAG, const TargetLowering &TLI,
8379                          const SDLoc &dl) {
8380   assert(VT.is128BitVector() && "Unknown type for VShift");
8381   MVT ShVT = MVT::v16i8;
8382   unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
8383   SrcOp = DAG.getBitcast(ShVT, SrcOp);
8384   assert(NumBits % 8 == 0 && "Only support byte sized shifts");
8385   SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8);
8386   return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
8387 }
8388 
8389 static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
8390                                       SelectionDAG &DAG) {
8391 
8392   // Check if the scalar load can be widened into a vector load. And if
8393   // the address is "base + cst", see if the cst can be "absorbed" into
8394   // the shuffle mask.
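  // For illustration, splatting (load i32 (FrameIndex + 8)) to a v4i32 can
  // become (load v4i32 FrameIndex) shuffled with mask <2,2,2,2>, once the
  // stack object is known to be at least 16-byte aligned.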
8395   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
8396     SDValue Ptr = LD->getBasePtr();
8397     if (!ISD::isNormalLoad(LD) || !LD->isSimple())
8398       return SDValue();
8399     EVT PVT = LD->getValueType(0);
8400     if (PVT != MVT::i32 && PVT != MVT::f32)
8401       return SDValue();
8402 
8403     int FI = -1;
8404     int64_t Offset = 0;
8405     if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
8406       FI = FINode->getIndex();
8407       Offset = 0;
8408     } else if (DAG.isBaseWithConstantOffset(Ptr) &&
8409                isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
8410       FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
8411       Offset = Ptr.getConstantOperandVal(1);
8412       Ptr = Ptr.getOperand(0);
8413     } else {
8414       return SDValue();
8415     }
8416 
8417     // FIXME: 256-bit vector instructions don't require strict alignment;
8418     // improve this code to support them better.
8419     Align RequiredAlign(VT.getSizeInBits() / 8);
8420     SDValue Chain = LD->getChain();
8421     // Make sure the stack object alignment is at least 16 or 32.
8422     MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
8423     MaybeAlign InferredAlign = DAG.InferPtrAlign(Ptr);
8424     if (!InferredAlign || *InferredAlign < RequiredAlign) {
8425       if (MFI.isFixedObjectIndex(FI)) {
8426         // Can't change the alignment. FIXME: It's possible to compute
8427         // the exact stack offset and reference FI + adjust offset instead,
8428         // if someone *really* cares about this; that's the way to implement it.
8429         return SDValue();
8430       } else {
8431         MFI.setObjectAlignment(FI, RequiredAlign);
8432       }
8433     }
8434 
8435     // (Offset % 16 or 32) must be a multiple of 4. The address is then
8436     // Ptr + (Offset & ~15).
8437     if (Offset < 0)
8438       return SDValue();
8439     if ((Offset % RequiredAlign.value()) & 3)
8440       return SDValue();
8441     int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1);
8442     if (StartOffset) {
8443       SDLoc DL(Ptr);
8444       Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
8445                         DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
8446     }
8447 
8448     int EltNo = (Offset - StartOffset) >> 2;
8449     unsigned NumElems = VT.getVectorNumElements();
8450 
8451     EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
8452     SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
8453                              LD->getPointerInfo().getWithOffset(StartOffset));
8454 
8455     SmallVector<int, 8> Mask(NumElems, EltNo);
8456 
8457     return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
8458   }
8459 
8460   return SDValue();
8461 }
8462 
8463 // Recurse to find a LoadSDNode source and the accumulated ByteOffset.
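// For example, (trunc i32 (srl (load i64 %p), 32)) resolves to the i64 load
// with an accumulated ByteOffset of 4.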
8464 static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
8465   if (ISD::isNON_EXTLoad(Elt.getNode())) {
8466     auto *BaseLd = cast<LoadSDNode>(Elt);
8467     if (!BaseLd->isSimple())
8468       return false;
8469     Ld = BaseLd;
8470     ByteOffset = 0;
8471     return true;
8472   }
8473 
8474   switch (Elt.getOpcode()) {
8475   case ISD::BITCAST:
8476   case ISD::TRUNCATE:
8477   case ISD::SCALAR_TO_VECTOR:
8478     return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
8479   case ISD::SRL:
8480     if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
8481       uint64_t Idx = IdxC->getZExtValue();
8482       if ((Idx % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
8483         ByteOffset += Idx / 8;
8484         return true;
8485       }
8486     }
8487     break;
8488   case ISD::EXTRACT_VECTOR_ELT:
8489     if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
8490       SDValue Src = Elt.getOperand(0);
8491       unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
8492       unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
8493       if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
8494           findEltLoadSrc(Src, Ld, ByteOffset)) {
8495         uint64_t Idx = IdxC->getZExtValue();
8496         ByteOffset += Idx * (SrcSizeInBits / 8);
8497         return true;
8498       }
8499     }
8500     break;
8501   }
8502 
8503   return false;
8504 }
8505 
8506 /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
8507 /// elements can be replaced by a single large load which has the same value as
8508 /// a build_vector or insert_subvector whose loaded operands are 'Elts'.
8509 ///
8510 /// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
8511 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
8512                                         const SDLoc &DL, SelectionDAG &DAG,
8513                                         const X86Subtarget &Subtarget,
8514                                         bool IsAfterLegalize) {
8515   if ((VT.getScalarSizeInBits() % 8) != 0)
8516     return SDValue();
8517 
8518   unsigned NumElems = Elts.size();
8519 
8520   int LastLoadedElt = -1;
8521   APInt LoadMask = APInt::getNullValue(NumElems);
8522   APInt ZeroMask = APInt::getNullValue(NumElems);
8523   APInt UndefMask = APInt::getNullValue(NumElems);
8524 
8525   SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
8526   SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
8527 
8528   // For each element in the initializer, see if we've found a load, zero or an
8529   // undef.
8530   for (unsigned i = 0; i < NumElems; ++i) {
8531     SDValue Elt = peekThroughBitcasts(Elts[i]);
8532     if (!Elt.getNode())
8533       return SDValue();
8534     if (Elt.isUndef()) {
8535       UndefMask.setBit(i);
8536       continue;
8537     }
8538     if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode())) {
8539       ZeroMask.setBit(i);
8540       continue;
8541     }
8542 
8543     // Each loaded element must be the correct fractional portion of the
8544     // requested vector load.
8545     unsigned EltSizeInBits = Elt.getValueSizeInBits();
8546     if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
8547       return SDValue();
8548 
8549     if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
8550       return SDValue();
8551     unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
8552     if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
8553       return SDValue();
8554 
8555     LoadMask.setBit(i);
8556     LastLoadedElt = i;
8557   }
8558   assert((ZeroMask.countPopulation() + UndefMask.countPopulation() +
8559           LoadMask.countPopulation()) == NumElems &&
8560          "Incomplete element masks");
8561 
8562   // Handle Special Cases - all undef or undef/zero.
8563   if (UndefMask.countPopulation() == NumElems)
8564     return DAG.getUNDEF(VT);
8565   if ((ZeroMask.countPopulation() + UndefMask.countPopulation()) == NumElems)
8566     return VT.isInteger() ? DAG.getConstant(0, DL, VT)
8567                           : DAG.getConstantFP(0.0, DL, VT);
8568 
8569   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8570   int FirstLoadedElt = LoadMask.countTrailingZeros();
8571   SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
8572   EVT EltBaseVT = EltBase.getValueType();
8573   assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
8574          "Register/Memory size mismatch");
8575   LoadSDNode *LDBase = Loads[FirstLoadedElt];
8576   assert(LDBase && "Did not find base load for merging consecutive loads");
8577   unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
8578   unsigned BaseSizeInBytes = BaseSizeInBits / 8;
8579   int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
8580   int LoadSizeInBits = NumLoadedElts * BaseSizeInBits;
8581   assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
8582 
8583   // TODO: Support offsetting the base load.
8584   if (ByteOffsets[FirstLoadedElt] != 0)
8585     return SDValue();
8586 
8587   // Check to see if the element's load is consecutive to the base load
8588   // or offset from a previous (already checked) load.
8589   auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
8590     LoadSDNode *Ld = Loads[EltIdx];
8591     int64_t ByteOffset = ByteOffsets[EltIdx];
8592     if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
8593       int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
8594       return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
8595               Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
8596     }
8597     return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
8598                                               EltIdx - FirstLoadedElt);
8599   };
8600 
8601   // Consecutive loads can contain UNDEFs but not ZERO elements.
8602   // Consecutive loads with UNDEFs and ZERO elements require an
8603   // additional shuffle stage to clear the ZERO elements.
8604   bool IsConsecutiveLoad = true;
8605   bool IsConsecutiveLoadWithZeros = true;
8606   for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
8607     if (LoadMask[i]) {
8608       if (!CheckConsecutiveLoad(LDBase, i)) {
8609         IsConsecutiveLoad = false;
8610         IsConsecutiveLoadWithZeros = false;
8611         break;
8612       }
8613     } else if (ZeroMask[i]) {
8614       IsConsecutiveLoad = false;
8615     }
8616   }
8617 
8618   auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
8619     auto MMOFlags = LDBase->getMemOperand()->getFlags();
8620     assert(LDBase->isSimple() &&
8621            "Cannot merge volatile or atomic loads.");
8622     SDValue NewLd =
8623         DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
8624                     LDBase->getPointerInfo(), LDBase->getOriginalAlign(),
8625                     MMOFlags);
8626     for (auto *LD : Loads)
8627       if (LD)
8628         DAG.makeEquivalentMemoryOrdering(LD, NewLd);
8629     return NewLd;
8630   };
8631 
8632   // Check if the base load is entirely dereferenceable.
8633   bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
8634       VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());
8635 
8636   // LOAD - all consecutive load/undefs (must start/end with a load or be
8637   // entirely dereferenceable). If we have found an entire vector of loads and
8638   // undefs, then return a large load of the entire vector width starting at the
8639   // base pointer. If the vector contains zeros, then attempt to shuffle those
8640   // elements.
8641   if (FirstLoadedElt == 0 &&
8642       (NumLoadedElts == (int)NumElems || IsDereferenceable) &&
8643       (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
8644     if (IsAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
8645       return SDValue();
8646 
8647     // Don't create 256-bit non-temporal aligned loads without AVX2 as these
8648     // will lower to regular temporal loads and use the cache.
8649     if (LDBase->isNonTemporal() && LDBase->getAlignment() >= 32 &&
8650         VT.is256BitVector() && !Subtarget.hasInt256())
8651       return SDValue();
8652 
8653     if (NumElems == 1)
8654       return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
8655 
8656     if (!ZeroMask)
8657       return CreateLoad(VT, LDBase);
8658 
8659     // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
8660     // vector and a zero vector to clear out the zero elements.
8661     if (!IsAfterLegalize && VT.isVector()) {
8662       unsigned NumMaskElts = VT.getVectorNumElements();
8663       if ((NumMaskElts % NumElems) == 0) {
8664         unsigned Scale = NumMaskElts / NumElems;
8665         SmallVector<int, 4> ClearMask(NumMaskElts, -1);
8666         for (unsigned i = 0; i < NumElems; ++i) {
8667           if (UndefMask[i])
8668             continue;
8669           int Offset = ZeroMask[i] ? NumMaskElts : 0;
8670           for (unsigned j = 0; j != Scale; ++j)
8671             ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
8672         }
8673         SDValue V = CreateLoad(VT, LDBase);
8674         SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
8675                                    : DAG.getConstantFP(0.0, DL, VT);
8676         return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
8677       }
8678     }
8679   }
8680 
8681   // If the upper half of a ymm/zmm load is undef then just load the lower half.
8682   if (VT.is256BitVector() || VT.is512BitVector()) {
8683     unsigned HalfNumElems = NumElems / 2;
8684     if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnesValue()) {
8685       EVT HalfVT =
8686           EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
8687       SDValue HalfLD =
8688           EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
8689                                    DAG, Subtarget, IsAfterLegalize);
8690       if (HalfLD)
8691         return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
8692                            HalfLD, DAG.getIntPtrConstant(0, DL));
8693     }
8694   }
8695 
8696   // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
8697   if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
8698       (LoadSizeInBits == 32 || LoadSizeInBits == 64) &&
8699       ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
8700     MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
8701                                       : MVT::getIntegerVT(LoadSizeInBits);
8702     MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
8703     // Allow v4f32 on SSE1 only targets.
8704     // FIXME: Add more isel patterns so we can just use VT directly.
8705     if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
8706       VecVT = MVT::v4f32;
8707     if (TLI.isTypeLegal(VecVT)) {
8708       SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
8709       SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
8710       SDValue ResNode = DAG.getMemIntrinsicNode(
8711           X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(),
8712           LDBase->getOriginalAlign(), MachineMemOperand::MOLoad);
8713       for (auto *LD : Loads)
8714         if (LD)
8715           DAG.makeEquivalentMemoryOrdering(LD, ResNode);
8716       return DAG.getBitcast(VT, ResNode);
8717     }
8718   }
8719 
8720   // BROADCAST - match the smallest possible repetition pattern, load that
8721   // scalar/subvector element and then broadcast to the entire vector.
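  // For illustration, a v8i32 build vector of the form <a,b,a,b,a,b,a,b> whose
  // elements come from consecutive loads can be lowered as a single 64-bit
  // load of the {a,b} pair followed by a broadcast of that scalar.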
8722   if (ZeroMask.isNullValue() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
8723       (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
8724     for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
8725       unsigned RepeatSize = SubElems * BaseSizeInBits;
8726       unsigned ScalarSize = std::min(RepeatSize, 64u);
8727       if (!Subtarget.hasAVX2() && ScalarSize < 32)
8728         continue;
8729 
8730       // Don't attempt a 1:N subvector broadcast - it should be caught by
8731       // combineConcatVectorOps, else it will cause infinite loops.
8732       if (RepeatSize > ScalarSize && SubElems == 1)
8733         continue;
8734 
8735       bool Match = true;
8736       SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
8737       for (unsigned i = 0; i != NumElems && Match; ++i) {
8738         if (!LoadMask[i])
8739           continue;
8740         SDValue Elt = peekThroughBitcasts(Elts[i]);
8741         if (RepeatedLoads[i % SubElems].isUndef())
8742           RepeatedLoads[i % SubElems] = Elt;
8743         else
8744           Match &= (RepeatedLoads[i % SubElems] == Elt);
8745       }
8746 
8747       // We must have loads at both ends of the repetition.
8748       Match &= !RepeatedLoads.front().isUndef();
8749       Match &= !RepeatedLoads.back().isUndef();
8750       if (!Match)
8751         continue;
8752 
8753       EVT RepeatVT =
8754           VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
8755               ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
8756               : EVT::getFloatingPointVT(ScalarSize);
8757       if (RepeatSize > ScalarSize)
8758         RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
8759                                     RepeatSize / ScalarSize);
8760       EVT BroadcastVT =
8761           EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
8762                            VT.getSizeInBits() / ScalarSize);
8763       if (TLI.isTypeLegal(BroadcastVT)) {
8764         if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
8765                 RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize)) {
8766           SDValue Broadcast = RepeatLoad;
8767           if (RepeatSize > ScalarSize) {
8768             while (Broadcast.getValueSizeInBits() < VT.getSizeInBits())
8769               Broadcast = concatSubVectors(Broadcast, Broadcast, DAG, DL);
8770           } else {
8771             Broadcast =
8772                 DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, RepeatLoad);
8773           }
8774           return DAG.getBitcast(VT, Broadcast);
8775         }
8776       }
8777     }
8778   }
8779 
8780   return SDValue();
8781 }
8782 
8783 // Combine a vector op (shuffles etc.) that is equal to build_vector load1,
8784 // load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
8785 // are consecutive, non-overlapping, and in the right order.
8786 static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL,
8787                                          SelectionDAG &DAG,
8788                                          const X86Subtarget &Subtarget,
8789                                          bool IsAfterLegalize) {
8790   SmallVector<SDValue, 64> Elts;
8791   for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
8792     if (SDValue Elt = getShuffleScalarElt(Op, i, DAG, 0)) {
8793       Elts.push_back(Elt);
8794       continue;
8795     }
8796     return SDValue();
8797   }
8798   assert(Elts.size() == VT.getVectorNumElements());
8799   return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
8800                                   IsAfterLegalize);
8801 }
8802 
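/// Build the constant vector for a single repetition of \p SplatValue, split
/// into \p VT's scalar element type. For example, for a v4i32 type and a
/// 64-bit splat value of 0x0000000100000002 this returns <i32 2, i32 1>.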
8803 static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
8804                                    unsigned SplatBitSize, LLVMContext &C) {
8805   unsigned ScalarSize = VT.getScalarSizeInBits();
8806   unsigned NumElm = SplatBitSize / ScalarSize;
8807 
8808   SmallVector<Constant *, 32> ConstantVec;
8809   for (unsigned i = 0; i < NumElm; i++) {
8810     APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i);
8811     Constant *Const;
8812     if (VT.isFloatingPoint()) {
8813       if (ScalarSize == 32) {
8814         Const = ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
8815       } else {
8816         assert(ScalarSize == 64 && "Unsupported floating point scalar size");
8817         Const = ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
8818       }
8819     } else
8820       Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
8821     ConstantVec.push_back(Const);
8822   }
8823   return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
8824 }
8825 
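/// Heuristic check of whether the users of \p N are target shuffles (looking
/// through bitcasts) that could fold \p N, in which case it is better to keep
/// the build_vector as-is rather than replace it with a broadcast.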
8826 static bool isFoldableUseOfShuffle(SDNode *N) {
8827   for (auto *U : N->uses()) {
8828     unsigned Opc = U->getOpcode();
8829     // VPERMV/VPERMV3 shuffles can never fold their index operands.
8830     if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
8831       return false;
8832     if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
8833       return false;
8834     if (isTargetShuffle(Opc))
8835       return true;
8836     if (Opc == ISD::BITCAST) // Ignore bitcasts
8837       return isFoldableUseOfShuffle(U);
8838     if (N->hasOneUse())
8839       return true;
8840   }
8841   return false;
8842 }
8843 
8844 /// Attempt to use the vbroadcast instruction to generate a splat value
8845 /// from a splat BUILD_VECTOR which uses:
8846 ///  a. A single scalar load, or a constant.
8847 ///  b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
8848 ///
8849 /// The VBROADCAST node is returned when a pattern is found,
8850 /// or SDValue() otherwise.
8851 static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
8852                                            const X86Subtarget &Subtarget,
8853                                            SelectionDAG &DAG) {
8854   // VBROADCAST requires AVX.
8855   // TODO: Splats could be generated for non-AVX CPUs using SSE
8856   // instructions, but there's less potential gain for only 128-bit vectors.
8857   if (!Subtarget.hasAVX())
8858     return SDValue();
8859 
8860   MVT VT = BVOp->getSimpleValueType(0);
8861   unsigned NumElts = VT.getVectorNumElements();
8862   SDLoc dl(BVOp);
8863 
8864   assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
8865          "Unsupported vector type for broadcast.");
8866 
8867   // See if the build vector is a repeating sequence of scalars (inc. splat).
8868   SDValue Ld;
8869   BitVector UndefElements;
8870   SmallVector<SDValue, 16> Sequence;
8871   if (BVOp->getRepeatedSequence(Sequence, &UndefElements)) {
8872     assert((NumElts % Sequence.size()) == 0 && "Sequence doesn't fit.");
8873     if (Sequence.size() == 1)
8874       Ld = Sequence[0];
8875   }
8876 
8877   // Attempt to use VBROADCASTM
8878   // From this pattern:
8879   // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
8880   // b. t1 = (build_vector t0 t0)
8881   //
8882   // Create (VBROADCASTM v2i1 X)
8883   if (!Sequence.empty() && Subtarget.hasCDI()) {
8884     // If not a splat, are the upper sequence values zeroable?
8885     unsigned SeqLen = Sequence.size();
8886     bool UpperZeroOrUndef =
8887         SeqLen == 1 ||
8888         llvm::all_of(makeArrayRef(Sequence).drop_front(), [](SDValue V) {
8889           return !V || V.isUndef() || isNullConstant(V);
8890         });
8891     SDValue Op0 = Sequence[0];
8892     if (UpperZeroOrUndef && ((Op0.getOpcode() == ISD::BITCAST) ||
8893                              (Op0.getOpcode() == ISD::ZERO_EXTEND &&
8894                               Op0.getOperand(0).getOpcode() == ISD::BITCAST))) {
8895       SDValue BOperand = Op0.getOpcode() == ISD::BITCAST
8896                              ? Op0.getOperand(0)
8897                              : Op0.getOperand(0).getOperand(0);
8898       MVT MaskVT = BOperand.getSimpleValueType();
8899       MVT EltType = MVT::getIntegerVT(VT.getScalarSizeInBits() * SeqLen);
8900       if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) ||  // for broadcastmb2q
8901           (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
8902         MVT BcstVT = MVT::getVectorVT(EltType, NumElts / SeqLen);
8903         if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
8904           unsigned Scale = 512 / VT.getSizeInBits();
8905           BcstVT = MVT::getVectorVT(EltType, Scale * (NumElts / SeqLen));
8906         }
8907         SDValue Bcst = DAG.getNode(X86ISD::VBROADCASTM, dl, BcstVT, BOperand);
8908         if (BcstVT.getSizeInBits() != VT.getSizeInBits())
8909           Bcst = extractSubVector(Bcst, 0, DAG, dl, VT.getSizeInBits());
8910         return DAG.getBitcast(VT, Bcst);
8911       }
8912     }
8913   }
8914 
8915   unsigned NumUndefElts = UndefElements.count();
8916   if (!Ld || (NumElts - NumUndefElts) <= 1) {
8917     APInt SplatValue, Undef;
8918     unsigned SplatBitSize;
8919     bool HasUndef;
8920     // Check if this is a repeated constant pattern suitable for broadcasting.
8921     if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
8922         SplatBitSize > VT.getScalarSizeInBits() &&
8923         SplatBitSize < VT.getSizeInBits()) {
8924       // Avoid replacing with broadcast when it's a use of a shuffle
8925       // instruction to preserve the present custom lowering of shuffles.
8926       if (isFoldableUseOfShuffle(BVOp))
8927         return SDValue();
8928       // Replace the BUILD_VECTOR with a broadcast of the repeated constants.
8929       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8930       LLVMContext *Ctx = DAG.getContext();
8931       MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
8932       if (Subtarget.hasAVX()) {
8933         if (SplatBitSize == 32 || SplatBitSize == 64 ||
8934             (SplatBitSize < 32 && Subtarget.hasAVX2())) {
8935           // Splatted value can fit in one INTEGER constant in the constant pool.
8936           // Load the constant and broadcast it.
8937           MVT CVT = MVT::getIntegerVT(SplatBitSize);
8938           Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
8939           Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
8940           SDValue CP = DAG.getConstantPool(C, PVT);
8941           unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
8942 
8943           Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
8944           SDVTList Tys =
8945               DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other);
8946           SDValue Ops[] = {DAG.getEntryNode(), CP};
8947           MachinePointerInfo MPI =
8948               MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
8949           SDValue Brdcst = DAG.getMemIntrinsicNode(
8950               X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT, MPI, Alignment,
8951               MachineMemOperand::MOLoad);
8952           return DAG.getBitcast(VT, Brdcst);
8953         }
8954         if (SplatBitSize > 64) {
8955           // Load the vector of constants and broadcast it.
8956           Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
8957                                              *Ctx);
8958           SDValue VCP = DAG.getConstantPool(VecC, PVT);
8959           unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
8960           MVT VVT = MVT::getVectorVT(VT.getScalarType(), NumElm);
8961           Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
8962           SDVTList Tys = DAG.getVTList(VT, MVT::Other);
8963           SDValue Ops[] = {DAG.getEntryNode(), VCP};
8964           MachinePointerInfo MPI =
8965               MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
8966           return DAG.getMemIntrinsicNode(
8967               X86ISD::SUBV_BROADCAST_LOAD, dl, Tys, Ops, VVT, MPI, Alignment,
8968               MachineMemOperand::MOLoad);
8969         }
8970       }
8971     }
8972 
8973     // If we are moving a scalar into a vector (Ld must be set and all elements
8974     // but 1 are undef) and that operation is not obviously supported by
8975     // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
8976     // That's better than general shuffling and may eliminate a load to GPR and
8977     // move from scalar to vector register.
8978     if (!Ld || NumElts - NumUndefElts != 1)
8979       return SDValue();
8980     unsigned ScalarSize = Ld.getValueSizeInBits();
8981     if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
8982       return SDValue();
8983   }
8984 
8985   bool ConstSplatVal =
8986       (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
8987   bool IsLoad = ISD::isNormalLoad(Ld.getNode());
8988 
8989   // TODO: Handle broadcasts of non-constant sequences.
8990 
8991   // Make sure that all of the users of a non-constant load are from the
8992   // BUILD_VECTOR node.
8993   // FIXME: Is the use count needed for non-constant, non-load case?
8994   if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode()))
8995     return SDValue();
8996 
8997   unsigned ScalarSize = Ld.getValueSizeInBits();
8998   bool IsGE256 = (VT.getSizeInBits() >= 256);
8999 
9000   // When optimizing for size, generate up to 5 extra bytes for a broadcast
9001   // instruction to save 8 or more bytes of constant pool data.
9002   // TODO: If multiple splats are generated to load the same constant,
9003   // it may be detrimental to overall size. There needs to be a way to detect
9004   // that condition to know if this is truly a size win.
9005   bool OptForSize = DAG.shouldOptForSize();
9006 
9007   // Handle broadcasting a single constant scalar from the constant pool
9008   // into a vector.
9009   // On Sandybridge (no AVX2), it is still better to load a constant vector
9010   // from the constant pool and not to broadcast it from a scalar.
9011   // But override that restriction when optimizing for size.
9012   // TODO: Check if splatting is recommended for other AVX-capable CPUs.
9013   if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
9014     EVT CVT = Ld.getValueType();
9015     assert(!CVT.isVector() && "Must not broadcast a vector type");
9016 
9017     // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
9018     // For size optimization, also splat v2f64 and v2i64, and for size opt
9019     // with AVX2, also splat i8 and i16.
9020     // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
9021     if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
9022         (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
9023       const Constant *C = nullptr;
9024       if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
9025         C = CI->getConstantIntValue();
9026       else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
9027         C = CF->getConstantFPValue();
9028 
9029       assert(C && "Invalid constant type");
9030 
9031       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9032       SDValue CP =
9033           DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
9034       Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
9035 
9036       SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9037       SDValue Ops[] = {DAG.getEntryNode(), CP};
9038       MachinePointerInfo MPI =
9039           MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
9040       return DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
9041                                      MPI, Alignment, MachineMemOperand::MOLoad);
9042     }
9043   }
9044 
9045   // Handle AVX2 in-register broadcasts.
9046   if (!IsLoad && Subtarget.hasInt256() &&
9047       (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
9048     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
9049 
9050   // The scalar source must be a normal load.
9051   if (!IsLoad)
9052     return SDValue();
9053 
9054   // Make sure the non-chain result is only used by this build vector.
9055   if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0))
9056     return SDValue();
9057 
9058   if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
9059       (Subtarget.hasVLX() && ScalarSize == 64)) {
9060     auto *LN = cast<LoadSDNode>(Ld);
9061     SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9062     SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
9063     SDValue BCast =
9064         DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
9065                                 LN->getMemoryVT(), LN->getMemOperand());
9066     DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
9067     return BCast;
9068   }
9069 
9070   // The integer check is needed for the 64-bit into 128-bit case so it doesn't
9071   // match double, since there is no vbroadcastsd xmm.
9072   if (Subtarget.hasInt256() && Ld.getValueType().isInteger() &&
9073       (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) {
9074     auto *LN = cast<LoadSDNode>(Ld);
9075     SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9076     SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
9077     SDValue BCast =
9078         DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
9079                                 LN->getMemoryVT(), LN->getMemOperand());
9080     DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
9081     return BCast;
9082   }
9083 
9084   // Unsupported broadcast.
9085   return SDValue();
9086 }
9087 
9088 /// For an EXTRACT_VECTOR_ELT with a constant index return the real
9089 /// underlying vector and index.
9090 ///
9091 /// Modifies \p ExtractedFromVec to the real vector and returns the real
9092 /// index.
9093 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
9094                                          SDValue ExtIdx) {
9095   int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
9096   if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
9097     return Idx;
9098 
9099   // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
9100   // lowered this:
9101   //   (extract_vector_elt (v8f32 %1), Constant<6>)
9102   // to:
9103   //   (extract_vector_elt (vector_shuffle<2,u,u,u>
9104   //                           (extract_subvector (v8f32 %0), Constant<4>),
9105   //                           undef)
9106   //                       Constant<0>)
9107   // In this case the vector is the extract_subvector expression and the index
9108   // is 2, as specified by the shuffle.
9109   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
9110   SDValue ShuffleVec = SVOp->getOperand(0);
9111   MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
9112   assert(ShuffleVecVT.getVectorElementType() ==
9113          ExtractedFromVec.getSimpleValueType().getVectorElementType());
9114 
9115   int ShuffleIdx = SVOp->getMaskElt(Idx);
9116   if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
9117     ExtractedFromVec = ShuffleVec;
9118     return ShuffleIdx;
9119   }
9120   return Idx;
9121 }
9122 
9123 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
9124   MVT VT = Op.getSimpleValueType();
9125 
9126   // Skip if insert_vec_elt is not supported.
9127   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9128   if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
9129     return SDValue();
9130 
9131   SDLoc DL(Op);
9132   unsigned NumElems = Op.getNumOperands();
9133 
9134   SDValue VecIn1;
9135   SDValue VecIn2;
9136   SmallVector<unsigned, 4> InsertIndices;
9137   SmallVector<int, 8> Mask(NumElems, -1);
9138 
9139   for (unsigned i = 0; i != NumElems; ++i) {
9140     unsigned Opc = Op.getOperand(i).getOpcode();
9141 
9142     if (Opc == ISD::UNDEF)
9143       continue;
9144 
9145     if (Opc != ISD::EXTRACT_VECTOR_ELT) {
9146       // Quit if more than 1 element needs inserting.
9147       if (InsertIndices.size() > 1)
9148         return SDValue();
9149 
9150       InsertIndices.push_back(i);
9151       continue;
9152     }
9153 
9154     SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
9155     SDValue ExtIdx = Op.getOperand(i).getOperand(1);
9156 
9157     // Quit if non-constant index.
9158     if (!isa<ConstantSDNode>(ExtIdx))
9159       return SDValue();
9160     int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
9161 
9162     // Quit if extracted from vector of different type.
9163     if (ExtractedFromVec.getValueType() != VT)
9164       return SDValue();
9165 
9166     if (!VecIn1.getNode())
9167       VecIn1 = ExtractedFromVec;
9168     else if (VecIn1 != ExtractedFromVec) {
9169       if (!VecIn2.getNode())
9170         VecIn2 = ExtractedFromVec;
9171       else if (VecIn2 != ExtractedFromVec)
9172         // Quit if more than 2 vectors to shuffle
9173         return SDValue();
9174     }
9175 
9176     if (ExtractedFromVec == VecIn1)
9177       Mask[i] = Idx;
9178     else if (ExtractedFromVec == VecIn2)
9179       Mask[i] = Idx + NumElems;
9180   }
9181 
9182   if (!VecIn1.getNode())
9183     return SDValue();
9184 
9185   VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
9186   SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
9187 
9188   for (unsigned Idx : InsertIndices)
9189     NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
9190                      DAG.getIntPtrConstant(Idx, DL));
9191 
9192   return NV;
9193 }
9194 
9195 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
9196 static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
9197                                      const X86Subtarget &Subtarget) {
9198 
9199   MVT VT = Op.getSimpleValueType();
9200   assert((VT.getVectorElementType() == MVT::i1) &&
9201          "Unexpected type in LowerBUILD_VECTORvXi1!");
9202 
9203   SDLoc dl(Op);
9204   if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
9205       ISD::isBuildVectorAllOnes(Op.getNode()))
9206     return Op;
9207 
9208   uint64_t Immediate = 0;
9209   SmallVector<unsigned, 16> NonConstIdx;
9210   bool IsSplat = true;
9211   bool HasConstElts = false;
9212   int SplatIdx = -1;
9213   for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
9214     SDValue In = Op.getOperand(idx);
9215     if (In.isUndef())
9216       continue;
9217     if (auto *InC = dyn_cast<ConstantSDNode>(In)) {
9218       Immediate |= (InC->getZExtValue() & 0x1) << idx;
9219       HasConstElts = true;
9220     } else {
9221       NonConstIdx.push_back(idx);
9222     }
9223     if (SplatIdx < 0)
9224       SplatIdx = idx;
9225     else if (In != Op.getOperand(SplatIdx))
9226       IsSplat = false;
9227   }
9228 
9229   // For a splat, use (select i1 splat_elt, all-ones, all-zeroes).
9230   if (IsSplat) {
9231     // The build_vector allows the scalar element to be larger than the vector
9232     // element type. We need to mask it to use as a condition unless we know
9233     // the upper bits are zero.
9234     // FIXME: Use computeKnownBits instead of checking specific opcode?
9235     SDValue Cond = Op.getOperand(SplatIdx);
9236     assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
9237     if (Cond.getOpcode() != ISD::SETCC)
9238       Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
9239                          DAG.getConstant(1, dl, MVT::i8));
9240 
9241     // Perform the select in the scalar domain so we can use cmov.
9242     if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
9243       SDValue Select = DAG.getSelect(dl, MVT::i32, Cond,
9244                                      DAG.getAllOnesConstant(dl, MVT::i32),
9245                                      DAG.getConstant(0, dl, MVT::i32));
9246       Select = DAG.getBitcast(MVT::v32i1, Select);
9247       return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select);
9248     } else {
9249       MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
9250       SDValue Select = DAG.getSelect(dl, ImmVT, Cond,
9251                                      DAG.getAllOnesConstant(dl, ImmVT),
9252                                      DAG.getConstant(0, dl, ImmVT));
9253       MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
9254       Select = DAG.getBitcast(VecVT, Select);
9255       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select,
9256                          DAG.getIntPtrConstant(0, dl));
9257     }
9258   }
9259 
9260   // Materialize the constant elements first, then insert the non-constant
  // elements one by one.
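  // For example, a v8i1 (build_vector 1,0,1,1,0,0,0,0) with all-constant
  // elements becomes a bitcast of the i8 immediate 0x0D.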
9261   SDValue DstVec;
9262   if (HasConstElts) {
9263     if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
9264       SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32);
9265       SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32);
9266       ImmL = DAG.getBitcast(MVT::v32i1, ImmL);
9267       ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
9268       DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
9269     } else {
9270       MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
9271       SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
9272       MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
9273       DstVec = DAG.getBitcast(VecVT, Imm);
9274       DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec,
9275                            DAG.getIntPtrConstant(0, dl));
9276     }
9277   } else
9278     DstVec = DAG.getUNDEF(VT);
9279 
9280   for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
9281     unsigned InsertIdx = NonConstIdx[i];
9282     DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
9283                          Op.getOperand(InsertIdx),
9284                          DAG.getIntPtrConstant(InsertIdx, dl));
9285   }
9286   return DstVec;
9287 }
9288 
9289 LLVM_ATTRIBUTE_UNUSED static bool isHorizOp(unsigned Opcode) {
9290   switch (Opcode) {
9291   case X86ISD::PACKSS:
9292   case X86ISD::PACKUS:
9293   case X86ISD::FHADD:
9294   case X86ISD::FHSUB:
9295   case X86ISD::HADD:
9296   case X86ISD::HSUB:
9297     return true;
9298   }
9299   return false;
9300 }
9301 
9302 /// This is a helper function of LowerToHorizontalOp().
9303 /// This function checks that the build_vector \p N in input implements a
9304 /// 128-bit partial horizontal operation on a 256-bit vector, but that operation
9305 /// may not match the layout of an x86 256-bit horizontal instruction.
9306 /// In other words, if this returns true, then some extraction/insertion will
9307 /// be required to produce a valid horizontal instruction.
9308 ///
9309 /// Parameter \p Opcode defines the kind of horizontal operation to match.
9310 /// For example, if \p Opcode is equal to ISD::ADD, then this function
9311 /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
9312 /// is equal to ISD::SUB, then this function checks if this is a horizontal
9313 /// arithmetic sub.
9314 ///
9315 /// This function only analyzes elements of \p N whose indices are
9316 /// in range [BaseIdx, LastIdx).
9317 ///
9318 /// TODO: This function was originally used to match both real and fake partial
9319 /// horizontal operations, but the index-matching logic is incorrect for that.
9320 /// See the corrected implementation in isHopBuildVector(). Can we reduce this
9321 /// code because it is only used for partial h-op matching now?
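/// For illustration, with \p Opcode == ISD::ADD and [BaseIdx, LastIdx) == [0, 4),
/// elements 0..3 of a v8i32 build_vector of the form
///   <(add (extractelt A,0), (extractelt A,1)), (add (extractelt A,2), (extractelt A,3)),
///    (add (extractelt B,0), (extractelt B,1)), (add (extractelt B,2), (extractelt B,3)), ...>
/// match, with \p V0 = A and \p V1 = B (A and B being v8i32 vectors).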
9322 static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
9323                                   SelectionDAG &DAG,
9324                                   unsigned BaseIdx, unsigned LastIdx,
9325                                   SDValue &V0, SDValue &V1) {
9326   EVT VT = N->getValueType(0);
9327   assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
9328   assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
9329   assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
9330          "Invalid Vector in input!");
9331 
9332   bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
9333   bool CanFold = true;
9334   unsigned ExpectedVExtractIdx = BaseIdx;
9335   unsigned NumElts = LastIdx - BaseIdx;
9336   V0 = DAG.getUNDEF(VT);
9337   V1 = DAG.getUNDEF(VT);
9338 
9339   // Check if N implements a horizontal binop.
9340   for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
9341     SDValue Op = N->getOperand(i + BaseIdx);
9342 
9343     // Skip UNDEFs.
9344     if (Op->isUndef()) {
9345       // Update the expected vector extract index.
9346       if (i * 2 == NumElts)
9347         ExpectedVExtractIdx = BaseIdx;
9348       ExpectedVExtractIdx += 2;
9349       continue;
9350     }
9351 
9352     CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
9353 
9354     if (!CanFold)
9355       break;
9356 
9357     SDValue Op0 = Op.getOperand(0);
9358     SDValue Op1 = Op.getOperand(1);
9359 
9360     // Try to match the following pattern:
9361     // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
9362     CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
9363         Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
9364         Op0.getOperand(0) == Op1.getOperand(0) &&
9365         isa<ConstantSDNode>(Op0.getOperand(1)) &&
9366         isa<ConstantSDNode>(Op1.getOperand(1)));
9367     if (!CanFold)
9368       break;
9369 
9370     unsigned I0 = Op0.getConstantOperandVal(1);
9371     unsigned I1 = Op1.getConstantOperandVal(1);
9372 
9373     if (i * 2 < NumElts) {
9374       if (V0.isUndef()) {
9375         V0 = Op0.getOperand(0);
9376         if (V0.getValueType() != VT)
9377           return false;
9378       }
9379     } else {
9380       if (V1.isUndef()) {
9381         V1 = Op0.getOperand(0);
9382         if (V1.getValueType() != VT)
9383           return false;
9384       }
9385       if (i * 2 == NumElts)
9386         ExpectedVExtractIdx = BaseIdx;
9387     }
9388 
9389     SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
9390     if (I0 == ExpectedVExtractIdx)
9391       CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
9392     else if (IsCommutable && I1 == ExpectedVExtractIdx) {
9393       // Try to match the following dag sequence:
9394       // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
9395       CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
9396     } else
9397       CanFold = false;
9398 
9399     ExpectedVExtractIdx += 2;
9400   }
9401 
9402   return CanFold;
9403 }
9404 
9405 /// Emit a sequence of two 128-bit horizontal add/sub followed by
9406 /// a concat_vector.
9407 ///
9408 /// This is a helper function of LowerToHorizontalOp().
9409 /// This function expects two 256-bit vectors called V0 and V1.
9410 /// At first, each vector is split into two separate 128-bit vectors.
9411 /// Then, the resulting 128-bit vectors are used to implement two
9412 /// horizontal binary operations.
9413 ///
9414 /// The kind of horizontal binary operation is defined by \p X86Opcode.
9415 ///
9416 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
9417 /// the two new horizontal binop.
9418 /// When Mode is set, the first horizontal binop dag node would take as input
9419 /// the lower 128-bit of V0 and the upper 128-bit of V0. The second
9420 /// horizontal binop dag node would take as input the lower 128-bit of V1
9421 /// and the upper 128-bit of V1.
9422 ///   Example:
9423 ///     HADD V0_LO, V0_HI
9424 ///     HADD V1_LO, V1_HI
9425 ///
9426 /// Otherwise, the first horizontal binop dag node takes as input the lower
9427 /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
9428 /// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
9429 ///   Example:
9430 ///     HADD V0_LO, V1_LO
9431 ///     HADD V0_HI, V1_HI
9432 ///
9433 /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
9434 /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
9435 /// the upper 128-bits of the result.
9436 static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
9437                                      const SDLoc &DL, SelectionDAG &DAG,
9438                                      unsigned X86Opcode, bool Mode,
9439                                      bool isUndefLO, bool isUndefHI) {
9440   MVT VT = V0.getSimpleValueType();
9441   assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
9442          "Invalid nodes in input!");
9443 
9444   unsigned NumElts = VT.getVectorNumElements();
9445   SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
9446   SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
9447   SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
9448   SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
9449   MVT NewVT = V0_LO.getSimpleValueType();
9450 
9451   SDValue LO = DAG.getUNDEF(NewVT);
9452   SDValue HI = DAG.getUNDEF(NewVT);
9453 
9454   if (Mode) {
9455     // Don't emit a horizontal binop if the result is expected to be UNDEF.
9456     if (!isUndefLO && !V0->isUndef())
9457       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
9458     if (!isUndefHI && !V1->isUndef())
9459       HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
9460   } else {
9461     // Don't emit a horizontal binop if the result is expected to be UNDEF.
9462     if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
9463       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
9464 
9465     if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
9466       HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
9467   }
9468 
9469   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
9470 }
9471 
9472 /// Returns true iff \p BV builds a vector with the result equivalent to
9473 /// the result of ADDSUB/SUBADD operation.
9474 /// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
9475 /// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
9476 /// \p Opnd0 and \p Opnd1.
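/// For illustration, a v4f32 build_vector whose elements are
///   <(fsub (extractelt A,0), (extractelt B,0)), (fadd (extractelt A,1), (extractelt B,1)),
///    (fsub (extractelt A,2), (extractelt B,2)), (fadd (extractelt A,3), (extractelt B,3))>
/// matches as ADDSUB(A, B) with \p IsSubAdd == false (A and B being v4f32).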
9477 static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
9478                              const X86Subtarget &Subtarget, SelectionDAG &DAG,
9479                              SDValue &Opnd0, SDValue &Opnd1,
9480                              unsigned &NumExtracts,
9481                              bool &IsSubAdd) {
9482 
9483   MVT VT = BV->getSimpleValueType(0);
9484   if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
9485     return false;
9486 
9487   unsigned NumElts = VT.getVectorNumElements();
9488   SDValue InVec0 = DAG.getUNDEF(VT);
9489   SDValue InVec1 = DAG.getUNDEF(VT);
9490 
9491   NumExtracts = 0;
9492 
9493   // Odd-numbered elements in the input build vector are obtained from
9494   // adding/subtracting two integer/float elements.
9495   // Even-numbered elements in the input build vector are obtained from
9496   // subtracting/adding two integer/float elements.
9497   unsigned Opc[2] = {0, 0};
9498   for (unsigned i = 0, e = NumElts; i != e; ++i) {
9499     SDValue Op = BV->getOperand(i);
9500 
9501     // Skip 'undef' values.
9502     unsigned Opcode = Op.getOpcode();
9503     if (Opcode == ISD::UNDEF)
9504       continue;
9505 
9506     // Early exit if we found an unexpected opcode.
9507     if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
9508       return false;
9509 
9510     SDValue Op0 = Op.getOperand(0);
9511     SDValue Op1 = Op.getOperand(1);
9512 
9513     // Try to match the following pattern:
9514     // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
9515     // Early exit if we cannot match that sequence.
9516     if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9517         Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9518         !isa<ConstantSDNode>(Op0.getOperand(1)) ||
9519         Op0.getOperand(1) != Op1.getOperand(1))
9520       return false;
9521 
9522     unsigned I0 = Op0.getConstantOperandVal(1);
9523     if (I0 != i)
9524       return false;
9525 
9526     // We found a valid add/sub node, make sure it's the same opcode as previous
9527     // elements for this parity.
9528     if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
9529       return false;
9530     Opc[i % 2] = Opcode;
9531 
9532     // Update InVec0 and InVec1.
9533     if (InVec0.isUndef()) {
9534       InVec0 = Op0.getOperand(0);
9535       if (InVec0.getSimpleValueType() != VT)
9536         return false;
9537     }
9538     if (InVec1.isUndef()) {
9539       InVec1 = Op1.getOperand(0);
9540       if (InVec1.getSimpleValueType() != VT)
9541         return false;
9542     }
9543 
9544     // Make sure that the operands of each add/sub node always
9545     // come from the same pair of vectors.
9546     if (InVec0 != Op0.getOperand(0)) {
9547       if (Opcode == ISD::FSUB)
9548         return false;
9549 
9550       // FADD is commutable. Try to commute the operands
9551       // and then test again.
9552       std::swap(Op0, Op1);
9553       if (InVec0 != Op0.getOperand(0))
9554         return false;
9555     }
9556 
9557     if (InVec1 != Op1.getOperand(0))
9558       return false;
9559 
9560     // Increment the number of extractions done.
9561     ++NumExtracts;
9562   }
9563 
9564   // Ensure we have found an opcode for both parities and that they are
9565   // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
9566   // inputs are undef.
9567   if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
9568       InVec0.isUndef() || InVec1.isUndef())
9569     return false;
9570 
9571   IsSubAdd = Opc[0] == ISD::FADD;
9572 
9573   Opnd0 = InVec0;
9574   Opnd1 = InVec1;
9575   return true;
9576 }
9577 
9578 /// Returns true if it is possible to fold a MUL and an idiom that has already
9579 /// been recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
9580 /// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
9581 /// operands of FMADDSUB/FMSUBADD are written to \p Opnd0, \p Opnd1 and \p Opnd2.
9582 ///
9583 /// Prior to calling this function it should be known that there is some
9584 /// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
9585 /// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
9586 /// before replacement of such SDNode with ADDSUB operation. Thus the number
9587 /// of \p Opnd0 uses is expected to be equal to 2.
9588 /// For example, this function may be called for the following IR:
9589 ///    %AB = fmul fast <2 x double> %A, %B
9590 ///    %Sub = fsub fast <2 x double> %AB, %C
9591 ///    %Add = fadd fast <2 x double> %AB, %C
9592 ///    %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
9593 ///                            <2 x i32> <i32 0, i32 3>
9594 /// There is a def for %Addsub here, which potentially can be replaced by
9595 /// X86ISD::ADDSUB operation:
9596 ///    %Addsub = X86ISD::ADDSUB %AB, %C
9597 /// and such ADDSUB can further be replaced with FMADDSUB:
9598 ///    %Addsub = FMADDSUB %A, %B, %C.
9599 ///
9600 /// The main reason why this method is called before the replacement of the
9601 /// recognized ADDSUB idiom with ADDSUB operation is that such replacement
9602 /// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
9603 /// FMADDSUB is.
9604 static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
9605                                  SelectionDAG &DAG,
9606                                  SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2,
9607                                  unsigned ExpectedUses) {
9608   if (Opnd0.getOpcode() != ISD::FMUL ||
9609       !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
9610     return false;
9611 
9612   // FIXME: These checks must match the similar ones in
9613   // DAGCombiner::visitFADDForFMACombine. It would be good to have one
9614   // function that would answer if it is Ok to fuse MUL + ADD to FMADD
9615   // or MUL + ADDSUB to FMADDSUB.
9616   const TargetOptions &Options = DAG.getTarget().Options;
9617   bool AllowFusion =
9618       (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
9619   if (!AllowFusion)
9620     return false;
9621 
9622   Opnd2 = Opnd1;
9623   Opnd1 = Opnd0.getOperand(1);
9624   Opnd0 = Opnd0.getOperand(0);
9625 
9626   return true;
9627 }
9628 
9629 /// Try to fold a build_vector that performs an 'addsub', 'fmaddsub' or
9630 /// 'fsubadd' operation into an X86ISD::ADDSUB, X86ISD::FMADDSUB or
9631 /// X86ISD::FMSUBADD node.
9632 static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
9633                                        const X86Subtarget &Subtarget,
9634                                        SelectionDAG &DAG) {
9635   SDValue Opnd0, Opnd1;
9636   unsigned NumExtracts;
9637   bool IsSubAdd;
9638   if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts,
9639                         IsSubAdd))
9640     return SDValue();
9641 
9642   MVT VT = BV->getSimpleValueType(0);
9643   SDLoc DL(BV);
9644 
9645   // Try to generate X86ISD::FMADDSUB node here.
9646   SDValue Opnd2;
9647   if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) {
9648     unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
9649     return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
9650   }
9651 
9652   // We only support ADDSUB.
9653   if (IsSubAdd)
9654     return SDValue();
9655 
9656   // Do not generate X86ISD::ADDSUB node for 512-bit types even though
9657   // the ADDSUB idiom has been successfully recognized. There are no known
9658   // X86 targets with 512-bit ADDSUB instructions!
9659   // 512-bit ADDSUB idiom recognition was needed only as part of FMADDSUB idiom
9660   // recognition.
9661   if (VT.is512BitVector())
9662     return SDValue();
9663 
9664   return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
9665 }
9666 
9667 static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
9668                              unsigned &HOpcode, SDValue &V0, SDValue &V1) {
9669   // Initialize outputs to known values.
9670   MVT VT = BV->getSimpleValueType(0);
9671   HOpcode = ISD::DELETED_NODE;
9672   V0 = DAG.getUNDEF(VT);
9673   V1 = DAG.getUNDEF(VT);
9674 
9675   // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
9676   // half of the result is calculated independently from the 128-bit halves of
9677   // the inputs, so that makes the index-checking logic below more complicated.
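  // E.g. for v8f32, FHADD(A, B) produces
  //   { A0+A1, A2+A3, B0+B1, B2+B3, A4+A5, A6+A7, B4+B5, B6+B7 }
  // so within each 128-bit chunk the low 64 bits of the result come from the
  // first source and the high 64 bits come from the second source.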
9678   unsigned NumElts = VT.getVectorNumElements();
9679   unsigned GenericOpcode = ISD::DELETED_NODE;
9680   unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
9681   unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
9682   unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
9683   for (unsigned i = 0; i != Num128BitChunks; ++i) {
9684     for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
9685       // Ignore undef elements.
9686       SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
9687       if (Op.isUndef())
9688         continue;
9689 
9690       // If there's an opcode mismatch, we're done.
9691       if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
9692         return false;
9693 
9694       // Initialize horizontal opcode.
9695       if (HOpcode == ISD::DELETED_NODE) {
9696         GenericOpcode = Op.getOpcode();
9697         switch (GenericOpcode) {
9698         case ISD::ADD: HOpcode = X86ISD::HADD; break;
9699         case ISD::SUB: HOpcode = X86ISD::HSUB; break;
9700         case ISD::FADD: HOpcode = X86ISD::FHADD; break;
9701         case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
9702         default: return false;
9703         }
9704       }
9705 
9706       SDValue Op0 = Op.getOperand(0);
9707       SDValue Op1 = Op.getOperand(1);
9708       if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9709           Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9710           Op0.getOperand(0) != Op1.getOperand(0) ||
9711           !isa<ConstantSDNode>(Op0.getOperand(1)) ||
9712           !isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
9713         return false;
9714 
9715       // The source vector is chosen based on which 64-bit half of the
9716       // destination vector is being calculated.
9717       if (j < NumEltsIn64Bits) {
9718         if (V0.isUndef())
9719           V0 = Op0.getOperand(0);
9720       } else {
9721         if (V1.isUndef())
9722           V1 = Op0.getOperand(0);
9723       }
9724 
9725       SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
9726       if (SourceVec != Op0.getOperand(0))
9727         return false;
9728 
9729       // op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
9730       unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
9731       unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
9732       unsigned ExpectedIndex = i * NumEltsIn128Bits +
9733                                (j % NumEltsIn64Bits) * 2;
9734       if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
9735         continue;
9736 
9737       // If this is not a commutative op, this does not match.
9738       if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
9739         return false;
9740 
9741       // Addition is commutative, so try swapping the extract indexes.
9742       // op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
9743       if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
9744         continue;
9745 
9746       // Extract indexes do not match horizontal requirement.
9747       return false;
9748     }
9749   }
9750   // We matched. Opcode and operands are returned by reference as arguments.
9751   return true;
9752 }
9753 
9754 static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
9755                                     SelectionDAG &DAG, unsigned HOpcode,
9756                                     SDValue V0, SDValue V1) {
9757   // If either input vector is not the same size as the build vector,
9758   // extract/insert the low bits to the correct size.
9759   // This is free (examples: zmm --> xmm, xmm --> ymm).
9760   MVT VT = BV->getSimpleValueType(0);
9761   unsigned Width = VT.getSizeInBits();
9762   if (V0.getValueSizeInBits() > Width)
9763     V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), Width);
9764   else if (V0.getValueSizeInBits() < Width)
9765     V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, SDLoc(BV), Width);
9766 
9767   if (V1.getValueSizeInBits() > Width)
9768     V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), Width);
9769   else if (V1.getValueSizeInBits() < Width)
9770     V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, SDLoc(BV), Width);
9771 
9772   unsigned NumElts = VT.getVectorNumElements();
9773   APInt DemandedElts = APInt::getAllOnesValue(NumElts);
9774   for (unsigned i = 0; i != NumElts; ++i)
9775     if (BV->getOperand(i).isUndef())
9776       DemandedElts.clearBit(i);
9777 
9778   // If we don't need the upper xmm half, then perform as an xmm hop.
9779   unsigned HalfNumElts = NumElts / 2;
9780   if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
9781     MVT HalfVT = VT.getHalfNumVectorElementsVT();
9782     V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), 128);
9783     V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), 128);
9784     SDValue Half = DAG.getNode(HOpcode, SDLoc(BV), HalfVT, V0, V1);
9785     return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, SDLoc(BV), 256);
9786   }
9787 
9788   return DAG.getNode(HOpcode, SDLoc(BV), VT, V0, V1);
9789 }
9790 
9791 /// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
9792 static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
9793                                    const X86Subtarget &Subtarget,
9794                                    SelectionDAG &DAG) {
9795   // We need at least 2 non-undef elements to make this worthwhile by default.
9796   unsigned NumNonUndefs =
9797       count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
9798   if (NumNonUndefs < 2)
9799     return SDValue();
9800 
9801   // There are 4 sets of horizontal math operations distinguished by type:
9802   // int/FP at 128-bit/256-bit. Each type was introduced with a different
9803   // subtarget feature. Try to match those "native" patterns first.
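  // (Roughly: SSE3 provides HADDPS/HADDPD, SSSE3 provides PHADDW/PHADDD, AVX
  //  extends the FP forms to 256 bits and AVX2 extends the integer forms.)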
9804   MVT VT = BV->getSimpleValueType(0);
9805   if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
9806       ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
9807       ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
9808       ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
9809     unsigned HOpcode;
9810     SDValue V0, V1;
9811     if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
9812       return getHopForBuildVector(BV, DAG, HOpcode, V0, V1);
9813   }
9814 
9815   // Try harder to match 256-bit ops by using extract/concat.
9816   if (!Subtarget.hasAVX() || !VT.is256BitVector())
9817     return SDValue();
9818 
9819   // Count the number of UNDEF operands in the input build_vector.
9820   unsigned NumElts = VT.getVectorNumElements();
9821   unsigned Half = NumElts / 2;
9822   unsigned NumUndefsLO = 0;
9823   unsigned NumUndefsHI = 0;
9824   for (unsigned i = 0, e = Half; i != e; ++i)
9825     if (BV->getOperand(i)->isUndef())
9826       NumUndefsLO++;
9827 
9828   for (unsigned i = Half, e = NumElts; i != e; ++i)
9829     if (BV->getOperand(i)->isUndef())
9830       NumUndefsHI++;
9831 
9832   SDLoc DL(BV);
9833   SDValue InVec0, InVec1;
9834   if (VT == MVT::v8i32 || VT == MVT::v16i16) {
9835     SDValue InVec2, InVec3;
9836     unsigned X86Opcode;
9837     bool CanFold = true;
9838 
9839     if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
9840         isHorizontalBinOpPart(BV, ISD::ADD, DAG, Half, NumElts, InVec2,
9841                               InVec3) &&
9842         ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
9843         ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
9844       X86Opcode = X86ISD::HADD;
9845     else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, Half, InVec0,
9846                                    InVec1) &&
9847              isHorizontalBinOpPart(BV, ISD::SUB, DAG, Half, NumElts, InVec2,
9848                                    InVec3) &&
9849              ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
9850              ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
9851       X86Opcode = X86ISD::HSUB;
9852     else
9853       CanFold = false;
9854 
9855     if (CanFold) {
9856       // Do not try to expand this build_vector into a pair of horizontal
9857       // add/sub if we can emit a pair of scalar add/sub.
9858       if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
9859         return SDValue();
9860 
9861       // Convert this build_vector into a pair of horizontal binops followed by
9862       // a concat vector. We must adjust the outputs from the partial horizontal
9863       // matching calls above to account for undefined vector halves.
9864       SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
9865       SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
9866       assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
9867       bool isUndefLO = NumUndefsLO == Half;
9868       bool isUndefHI = NumUndefsHI == Half;
9869       return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
9870                                    isUndefHI);
9871     }
9872   }
9873 
9874   if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
9875       VT == MVT::v16i16) {
9876     unsigned X86Opcode;
9877     if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
9878       X86Opcode = X86ISD::HADD;
9879     else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, NumElts, InVec0,
9880                                    InVec1))
9881       X86Opcode = X86ISD::HSUB;
9882     else if (isHorizontalBinOpPart(BV, ISD::FADD, DAG, 0, NumElts, InVec0,
9883                                    InVec1))
9884       X86Opcode = X86ISD::FHADD;
9885     else if (isHorizontalBinOpPart(BV, ISD::FSUB, DAG, 0, NumElts, InVec0,
9886                                    InVec1))
9887       X86Opcode = X86ISD::FHSUB;
9888     else
9889       return SDValue();
9890 
9891     // Don't try to expand this build_vector into a pair of horizontal add/sub
9892     // if we can simply emit a pair of scalar add/sub.
9893     if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
9894       return SDValue();
9895 
9896     // Convert this build_vector into two horizontal add/sub followed by
9897     // a concat vector.
9898     bool isUndefLO = NumUndefsLO == Half;
9899     bool isUndefHI = NumUndefsHI == Half;
9900     return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
9901                                  isUndefLO, isUndefHI);
9902   }
9903 
9904   return SDValue();
9905 }
9906 
9907 static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
9908                           SelectionDAG &DAG);
9909 
9910 /// If a BUILD_VECTOR's source elements all apply the same bit operation and
9911 /// one of their operands is constant, lower to a pair of BUILD_VECTORs and
9912 /// just apply the bit operation to the vectors.
9913 /// NOTE: It's not in our interest to start making a general-purpose vectorizer
9914 /// from this, but enough scalar bit operations are created by the later
9915 /// legalization + scalarization stages to need basic support.
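///
/// As a rough sketch of the transform (operand names are illustrative):
///   (build_vector (and %a, 1), (and %b, 2), (and %c, 4), (and %d, 8))
/// becomes
///   (and (build_vector %a, %b, %c, %d), (build_vector 1, 2, 4, 8))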
9916 static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
9917                                        const X86Subtarget &Subtarget,
9918                                        SelectionDAG &DAG) {
9919   SDLoc DL(Op);
9920   MVT VT = Op->getSimpleValueType(0);
9921   unsigned NumElems = VT.getVectorNumElements();
9922   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9923 
9924   // Check that all elements have the same opcode.
9925   // TODO: Should we allow UNDEFS and if so how many?
9926   unsigned Opcode = Op->getOperand(0).getOpcode();
9927   for (unsigned i = 1; i < NumElems; ++i)
9928     if (Opcode != Op->getOperand(i).getOpcode())
9929       return SDValue();
9930 
9931   // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
9932   bool IsShift = false;
9933   switch (Opcode) {
9934   default:
9935     return SDValue();
9936   case ISD::SHL:
9937   case ISD::SRL:
9938   case ISD::SRA:
9939     IsShift = true;
9940     break;
9941   case ISD::AND:
9942   case ISD::XOR:
9943   case ISD::OR:
9944     // Don't do this if the buildvector is a splat - we'd replace one
9945     // constant with an entire vector.
9946     if (Op->getSplatValue())
9947       return SDValue();
9948     if (!TLI.isOperationLegalOrPromote(Opcode, VT))
9949       return SDValue();
9950     break;
9951   }
9952 
9953   SmallVector<SDValue, 4> LHSElts, RHSElts;
9954   for (SDValue Elt : Op->ops()) {
9955     SDValue LHS = Elt.getOperand(0);
9956     SDValue RHS = Elt.getOperand(1);
9957 
9958     // We expect the canonicalized RHS operand to be the constant.
9959     if (!isa<ConstantSDNode>(RHS))
9960       return SDValue();
9961 
9962     // Extend shift amounts.
9963     if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) {
9964       if (!IsShift)
9965         return SDValue();
9966       RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType());
9967     }
9968 
9969     LHSElts.push_back(LHS);
9970     RHSElts.push_back(RHS);
9971   }
9972 
9973   // Limit to shifts by uniform immediates.
9974   // TODO: Only accept vXi8/vXi64 special cases?
9975   // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
9976   if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
9977     return SDValue();
9978 
9979   SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
9980   SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
9981   SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
9982 
9983   if (!IsShift)
9984     return Res;
9985 
9986   // Immediately lower the shift to ensure the constant build vector doesn't
9987   // get converted to a constant pool before the shift is lowered.
9988   return LowerShift(Res, Subtarget, DAG);
9989 }
9990 
9991 /// Create a vector constant without a load. SSE/AVX provide the bare minimum
9992 /// functionality to do this, so it's all zeros, all ones, or some derivation
9993 /// that is cheap to calculate.
9994 static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
9995                                          const X86Subtarget &Subtarget) {
9996   SDLoc DL(Op);
9997   MVT VT = Op.getSimpleValueType();
9998 
9999   // Vectors containing all zeros can be matched by pxor and xorps.
10000   if (ISD::isBuildVectorAllZeros(Op.getNode()))
10001     return Op;
10002 
10003   // Vectors containing all ones can be matched by pcmpeqd on 128-bit wide
10004   // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
10005   // vpcmpeqd on 256-bit vectors.
10006   if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
10007     if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
10008       return Op;
10009 
10010     return getOnesVector(VT, DAG, DL);
10011   }
10012 
10013   return SDValue();
10014 }
10015 
10016 /// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
10017 /// from a vector of source values and a vector of extraction indices.
10018 /// The vectors might be manipulated to match the type of the permute op.
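/// For example, without AVX a v4i32 variable permute is typically lowered as a
/// v16i8 PSHUFB (when SSSE3 is available), after scaling each i32 index into
/// four consecutive byte indices.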
10019 static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
10020                                      SDLoc &DL, SelectionDAG &DAG,
10021                                      const X86Subtarget &Subtarget) {
10022   MVT ShuffleVT = VT;
10023   EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
10024   unsigned NumElts = VT.getVectorNumElements();
10025   unsigned SizeInBits = VT.getSizeInBits();
10026 
10027   // Adjust IndicesVec to match VT size.
10028   assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
10029          "Illegal variable permute mask size");
10030   if (IndicesVec.getValueType().getVectorNumElements() > NumElts) {
10031     // Narrow/widen the indices vector to the correct size.
10032     if (IndicesVec.getValueSizeInBits() > SizeInBits)
10033       IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
10034                                     NumElts * VT.getScalarSizeInBits());
10035     else if (IndicesVec.getValueSizeInBits() < SizeInBits)
10036       IndicesVec = widenSubVector(IndicesVec, false, Subtarget, DAG,
10037                                   SDLoc(IndicesVec), SizeInBits);
10038     // Zero-extend the index elements within the vector.
10039     if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
10040       IndicesVec = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(IndicesVec),
10041                                IndicesVT, IndicesVec);
10042   }
10043   IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
10044 
10045   // Handle a SrcVec whose type doesn't match VT.
10046   if (SrcVec.getValueSizeInBits() != SizeInBits) {
10047     if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
10048       // Handle larger SrcVec by treating it as a larger permute.
10049       unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
10050       VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
10051       IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
10052       IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
10053                                   Subtarget, DAG, SDLoc(IndicesVec));
10054       SDValue NewSrcVec =
10055           createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
10056       if (NewSrcVec)
10057         return extractSubVector(NewSrcVec, 0, DAG, DL, SizeInBits);
10058       return SDValue();
10059     } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
10060       // Widen smaller SrcVec to match VT.
10061       SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
10062     } else
10063       return SDValue();
10064   }
10065 
10066   auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
10067     assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
10068     EVT SrcVT = Idx.getValueType();
10069     unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
10070     uint64_t IndexScale = 0;
10071     uint64_t IndexOffset = 0;
10072 
10073     // If we're scaling a smaller permute op, then we need to repeat the
10074     // indices, scaling and offsetting them as well.
10075     // e.g. v4i32 -> v16i8 (Scale = 4)
10076     // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
10077     // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
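    // Net effect for that example: each in-range i32 index I expands to the
    // byte indices 4*I+0 .. 4*I+3 of the byte shuffle.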
10078     for (uint64_t i = 0; i != Scale; ++i) {
10079       IndexScale |= Scale << (i * NumDstBits);
10080       IndexOffset |= i << (i * NumDstBits);
10081     }
10082 
10083     Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
10084                       DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
10085     Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
10086                       DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
10087     return Idx;
10088   };
10089 
10090   unsigned Opcode = 0;
10091   switch (VT.SimpleTy) {
10092   default:
10093     break;
10094   case MVT::v16i8:
10095     if (Subtarget.hasSSSE3())
10096       Opcode = X86ISD::PSHUFB;
10097     break;
10098   case MVT::v8i16:
10099     if (Subtarget.hasVLX() && Subtarget.hasBWI())
10100       Opcode = X86ISD::VPERMV;
10101     else if (Subtarget.hasSSSE3()) {
10102       Opcode = X86ISD::PSHUFB;
10103       ShuffleVT = MVT::v16i8;
10104     }
10105     break;
10106   case MVT::v4f32:
10107   case MVT::v4i32:
10108     if (Subtarget.hasAVX()) {
10109       Opcode = X86ISD::VPERMILPV;
10110       ShuffleVT = MVT::v4f32;
10111     } else if (Subtarget.hasSSSE3()) {
10112       Opcode = X86ISD::PSHUFB;
10113       ShuffleVT = MVT::v16i8;
10114     }
10115     break;
10116   case MVT::v2f64:
10117   case MVT::v2i64:
10118     if (Subtarget.hasAVX()) {
10119       // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
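      // (Adding the index vector to itself doubles each index, moving the 0/1
      //  selector from bit#0 into bit#1.)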
10120       IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
10121       Opcode = X86ISD::VPERMILPV;
10122       ShuffleVT = MVT::v2f64;
10123     } else if (Subtarget.hasSSE41()) {
10124       // SSE41 can compare v2i64 - select between indices 0 and 1.
10125       return DAG.getSelectCC(
10126           DL, IndicesVec,
10127           getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
10128           DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
10129           DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
10130           ISD::CondCode::SETEQ);
10131     }
10132     break;
10133   case MVT::v32i8:
10134     if (Subtarget.hasVLX() && Subtarget.hasVBMI())
10135       Opcode = X86ISD::VPERMV;
10136     else if (Subtarget.hasXOP()) {
10137       SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
10138       SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
10139       SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
10140       SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
10141       return DAG.getNode(
10142           ISD::CONCAT_VECTORS, DL, VT,
10143           DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
10144           DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
10145     } else if (Subtarget.hasAVX()) {
10146       SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
10147       SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
10148       SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
10149       SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
10150       auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
10151                               ArrayRef<SDValue> Ops) {
10152         // Permute Lo and Hi and then select based on index range.
10153         // This works as PSHUFB uses bits[3:0] to permute elements and we don't
10154         // care about bit[7] as it's just an index vector.
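        // E.g. index 20 compares greater than 15, so the PSHUFB of the Hi half
        // is selected, and its low bits (20 & 15 == 4) pick byte 4 of that half.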
10155         SDValue Idx = Ops[2];
10156         EVT VT = Idx.getValueType();
10157         return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
10158                                DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
10159                                DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
10160                                ISD::CondCode::SETGT);
10161       };
10162       SDValue Ops[] = {LoLo, HiHi, IndicesVec};
10163       return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
10164                               PSHUFBBuilder);
10165     }
10166     break;
10167   case MVT::v16i16:
10168     if (Subtarget.hasVLX() && Subtarget.hasBWI())
10169       Opcode = X86ISD::VPERMV;
10170     else if (Subtarget.hasAVX()) {
10171       // Scale to v32i8 and perform as v32i8.
10172       IndicesVec = ScaleIndices(IndicesVec, 2);
10173       return DAG.getBitcast(
10174           VT, createVariablePermute(
10175                   MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
10176                   DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
10177     }
10178     break;
10179   case MVT::v8f32:
10180   case MVT::v8i32:
10181     if (Subtarget.hasAVX2())
10182       Opcode = X86ISD::VPERMV;
10183     else if (Subtarget.hasAVX()) {
10184       SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
10185       SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
10186                                           {0, 1, 2, 3, 0, 1, 2, 3});
10187       SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
10188                                           {4, 5, 6, 7, 4, 5, 6, 7});
10189       if (Subtarget.hasXOP())
10190         return DAG.getBitcast(
10191             VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
10192                             IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
10193       // Permute Lo and Hi and then select based on index range.
10194       // This works as VPERMILPS only uses index bits[0:1] to permute elements.
10195       SDValue Res = DAG.getSelectCC(
10196           DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
10197           DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
10198           DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
10199           ISD::CondCode::SETGT);
10200       return DAG.getBitcast(VT, Res);
10201     }
10202     break;
10203   case MVT::v4i64:
10204   case MVT::v4f64:
10205     if (Subtarget.hasAVX512()) {
10206       if (!Subtarget.hasVLX()) {
10207         MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
10208         SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
10209                                 SDLoc(SrcVec));
10210         IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
10211                                     DAG, SDLoc(IndicesVec));
10212         SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
10213                                             DAG, Subtarget);
10214         return extract256BitVector(Res, 0, DAG, DL);
10215       }
10216       Opcode = X86ISD::VPERMV;
10217     } else if (Subtarget.hasAVX()) {
10218       SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
10219       SDValue LoLo =
10220           DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
10221       SDValue HiHi =
10222           DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
10223       // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
10224       IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
10225       if (Subtarget.hasXOP())
10226         return DAG.getBitcast(
10227             VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
10228                             IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
10229       // Permute Lo and Hi and then select based on index range.
10230       // This works as VPERMILPD only uses index bit[1] to permute elements.
10231       SDValue Res = DAG.getSelectCC(
10232           DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
10233           DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
10234           DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
10235           ISD::CondCode::SETGT);
10236       return DAG.getBitcast(VT, Res);
10237     }
10238     break;
10239   case MVT::v64i8:
10240     if (Subtarget.hasVBMI())
10241       Opcode = X86ISD::VPERMV;
10242     break;
10243   case MVT::v32i16:
10244     if (Subtarget.hasBWI())
10245       Opcode = X86ISD::VPERMV;
10246     break;
10247   case MVT::v16f32:
10248   case MVT::v16i32:
10249   case MVT::v8f64:
10250   case MVT::v8i64:
10251     if (Subtarget.hasAVX512())
10252       Opcode = X86ISD::VPERMV;
10253     break;
10254   }
10255   if (!Opcode)
10256     return SDValue();
10257 
10258   assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
10259          (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
10260          "Illegal variable permute shuffle type");
10261 
10262   uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
10263   if (Scale > 1)
10264     IndicesVec = ScaleIndices(IndicesVec, Scale);
10265 
10266   EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
10267   IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
10268 
10269   SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
10270   SDValue Res = Opcode == X86ISD::VPERMV
10271                     ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
10272                     : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
10273   return DAG.getBitcast(VT, Res);
10274 }
10275 
10276 // Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
10277 // recognized as a permutation of a vector by indices in a non-constant vector.
10278 // (build_vector (extract_elt V, (extract_elt I, 0)),
10279 //               (extract_elt V, (extract_elt I, 1)),
10280 //                    ...
10281 // ->
10282 // (vpermv I, V)
10283 //
10284 // TODO: Handle undefs
10285 // TODO: Utilize pshufb and zero mask blending to support more efficient
10286 // construction of vectors with constant-0 elements.
10287 static SDValue
10288 LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
10289                                    const X86Subtarget &Subtarget) {
10290   SDValue SrcVec, IndicesVec;
10291   // Check for a match of the permute source vector and permute index elements.
10292   // This is done by checking that the i-th build_vector operand is of the form:
10293   // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
10294   for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
10295     SDValue Op = V.getOperand(Idx);
10296     if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
10297       return SDValue();
10298 
10299     // If this is the first extract encountered in V, set the source vector,
10300     // otherwise verify the extract is from the previously defined source
10301     // vector.
10302     if (!SrcVec)
10303       SrcVec = Op.getOperand(0);
10304     else if (SrcVec != Op.getOperand(0))
10305       return SDValue();
10306     SDValue ExtractedIndex = Op->getOperand(1);
10307     // Peek through extends.
10308     if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
10309         ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
10310       ExtractedIndex = ExtractedIndex.getOperand(0);
10311     if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
10312       return SDValue();
10313 
10314     // If this is the first extract from the index vector candidate, set the
10315     // indices vector, otherwise verify the extract is from the previously
10316     // defined indices vector.
10317     if (!IndicesVec)
10318       IndicesVec = ExtractedIndex.getOperand(0);
10319     else if (IndicesVec != ExtractedIndex.getOperand(0))
10320       return SDValue();
10321 
10322     auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
10323     if (!PermIdx || PermIdx->getAPIntValue() != Idx)
10324       return SDValue();
10325   }
10326 
10327   SDLoc DL(V);
10328   MVT VT = V.getSimpleValueType();
10329   return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
10330 }
10331 
10332 SDValue
10333 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
10334   SDLoc dl(Op);
10335 
10336   MVT VT = Op.getSimpleValueType();
10337   MVT EltVT = VT.getVectorElementType();
10338   unsigned NumElems = Op.getNumOperands();
10339 
10340   // Predicate (vXi1) vectors have their own lowering.
10341   if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
10342     return LowerBUILD_VECTORvXi1(Op, DAG, Subtarget);
10343 
10344   if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
10345     return VectorConstant;
10346 
10347   unsigned EVTBits = EltVT.getSizeInBits();
10348   APInt UndefMask = APInt::getNullValue(NumElems);
10349   APInt ZeroMask = APInt::getNullValue(NumElems);
10350   APInt NonZeroMask = APInt::getNullValue(NumElems);
10351   bool IsAllConstants = true;
10352   SmallSet<SDValue, 8> Values;
10353   unsigned NumConstants = NumElems;
10354   for (unsigned i = 0; i < NumElems; ++i) {
10355     SDValue Elt = Op.getOperand(i);
10356     if (Elt.isUndef()) {
10357       UndefMask.setBit(i);
10358       continue;
10359     }
10360     Values.insert(Elt);
10361     if (!isa<ConstantSDNode>(Elt) && !isa<ConstantFPSDNode>(Elt)) {
10362       IsAllConstants = false;
10363       NumConstants--;
10364     }
10365     if (X86::isZeroNode(Elt)) {
10366       ZeroMask.setBit(i);
10367     } else {
10368       NonZeroMask.setBit(i);
10369     }
10370   }
10371 
10372   // All undef vector. Return an UNDEF. All zero vectors were handled above.
10373   if (NonZeroMask == 0) {
10374     assert(UndefMask.isAllOnesValue() && "Fully undef mask expected");
10375     return DAG.getUNDEF(VT);
10376   }
10377 
10378   BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
10379 
10380   // If the upper elts of a ymm/zmm are undef/zero then we might be better off
10381   // lowering to a smaller build vector and padding with undef/zero.
10382   if ((VT.is256BitVector() || VT.is512BitVector()) &&
10383       !isFoldableUseOfShuffle(BV)) {
10384     unsigned UpperElems = NumElems / 2;
10385     APInt UndefOrZeroMask = UndefMask | ZeroMask;
10386     unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.countLeadingOnes();
10387     if (NumUpperUndefsOrZeros >= UpperElems) {
10388       if (VT.is512BitVector() &&
10389           NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4)))
10390         UpperElems = NumElems - (NumElems / 4);
10391       bool UndefUpper = UndefMask.countLeadingOnes() >= UpperElems;
10392       MVT LowerVT = MVT::getVectorVT(EltVT, NumElems - UpperElems);
10393       SDValue NewBV =
10394           DAG.getBuildVector(LowerVT, dl, Op->ops().drop_back(UpperElems));
10395       return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl);
10396     }
10397   }
10398 
10399   if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
10400     return AddSub;
10401   if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
10402     return HorizontalOp;
10403   if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
10404     return Broadcast;
10405   if (SDValue BitOp = lowerBuildVectorToBitOp(BV, Subtarget, DAG))
10406     return BitOp;
10407 
10408   unsigned NumZero = ZeroMask.countPopulation();
10409   unsigned NumNonZero = NonZeroMask.countPopulation();
10410 
10411   // If we are inserting one variable into a vector of non-zero constants, try
10412   // to avoid loading each constant element as a scalar. Load the constants as a
10413   // vector and then insert the variable scalar element. If insertion is not
10414   // supported, fall back to a shuffle to get the scalar blended with the
10415   // constants. Insertion into a zero vector is handled as a special-case
10416   // somewhere below here.
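  // E.g. (build_vector 1.0, 2.0, %x, 4.0) becomes a constant-pool load of
  // <1.0, 2.0, undef, 4.0> followed by inserting %x into element 2.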
10417   if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
10418       (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
10419        isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
10420     // Create an all-constant vector. The variable element in the old
10421     // build vector is replaced by undef in the constant vector. Save the
10422     // variable scalar element and its index for use in the insertelement.
10423     LLVMContext &Context = *DAG.getContext();
10424     Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
10425     SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
10426     SDValue VarElt;
10427     SDValue InsIndex;
10428     for (unsigned i = 0; i != NumElems; ++i) {
10429       SDValue Elt = Op.getOperand(i);
10430       if (auto *C = dyn_cast<ConstantSDNode>(Elt))
10431         ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
10432       else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
10433         ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
10434       else if (!Elt.isUndef()) {
10435         assert(!VarElt.getNode() && !InsIndex.getNode() &&
10436                "Expected one variable element in this vector");
10437         VarElt = Elt;
10438         InsIndex = DAG.getVectorIdxConstant(i, dl);
10439       }
10440     }
10441     Constant *CV = ConstantVector::get(ConstVecOps);
10442     SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
10443 
10444     // The constants we just created may not be legal (e.g., floating point). We
10445     // must lower the vector right here because we cannot guarantee that we'll
10446     // legalize it before loading it. This is also why we could not just create
10447     // a new build vector here. If the build vector contains illegal constants,
10448     // it could get split back up into a series of insert elements.
10449     // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
10450     SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
10451     MachineFunction &MF = DAG.getMachineFunction();
10452     MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
10453     SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
10454     unsigned InsertC = cast<ConstantSDNode>(InsIndex)->getZExtValue();
10455     unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
10456     if (InsertC < NumEltsInLow128Bits)
10457       return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
10458 
10459     // There's no good way to insert into the high elements of a >128-bit
10460     // vector, so use shuffles to avoid an extract/insert sequence.
10461     assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
10462     assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
10463     SmallVector<int, 8> ShuffleMask;
10464     unsigned NumElts = VT.getVectorNumElements();
10465     for (unsigned i = 0; i != NumElts; ++i)
10466       ShuffleMask.push_back(i == InsertC ? NumElts : i);
10467     SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
10468     return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
10469   }
10470 
10471   // Special case for single non-zero, non-undef, element.
10472   if (NumNonZero == 1) {
10473     unsigned Idx = NonZeroMask.countTrailingZeros();
10474     SDValue Item = Op.getOperand(Idx);
10475 
10476     // If we have a constant or non-constant insertion into the low element of
10477     // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
10478     // the rest of the elements.  This will be matched as movd/movq/movss/movsd
10479     // depending on what the source datatype is.
10480     if (Idx == 0) {
10481       if (NumZero == 0)
10482         return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
10483 
10484       if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
10485           (EltVT == MVT::i64 && Subtarget.is64Bit())) {
10486         assert((VT.is128BitVector() || VT.is256BitVector() ||
10487                 VT.is512BitVector()) &&
10488                "Expected an SSE value type!");
10489         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
10490         // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
10491         return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
10492       }
10493 
10494       // We can't directly insert an i8 or i16 into a vector, so zero extend
10495       // it to i32 first.
10496       if (EltVT == MVT::i16 || EltVT == MVT::i8) {
10497         Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
10498         MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
10499         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
10500         Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
10501         return DAG.getBitcast(VT, Item);
10502       }
10503     }
10504 
10505     // Is it a vector logical left shift?
10506     if (NumElems == 2 && Idx == 1 &&
10507         X86::isZeroNode(Op.getOperand(0)) &&
10508         !X86::isZeroNode(Op.getOperand(1))) {
10509       unsigned NumBits = VT.getSizeInBits();
10510       return getVShift(true, VT,
10511                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
10512                                    VT, Op.getOperand(1)),
10513                        NumBits/2, DAG, *this, dl);
10514     }
10515 
10516     if (IsAllConstants) // Otherwise, it's better to do a constpool load.
10517       return SDValue();
10518 
10519     // Otherwise, if this is a vector with i32 or f32 elements, and the element
10520     // is a non-constant being inserted into an element other than the low one,
10521     // we can't use a constant pool load.  Instead, use SCALAR_TO_VECTOR (aka
10522     // movd/movss) to move this into the low element, then shuffle it into
10523     // place.
10524     if (EVTBits == 32) {
10525       Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
10526       return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
10527     }
10528   }
10529 
10530   // Splat is obviously ok. Let the legalizer expand it to a shuffle.
10531   if (Values.size() == 1) {
10532     if (EVTBits == 32) {
10533       // Instead of a shuffle like this:
10534       // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
10535       // Check if it's possible to issue this instead.
10536       // shuffle (vload ptr), undef, <1, 1, 1, 1>
10537       unsigned Idx = NonZeroMask.countTrailingZeros();
10538       SDValue Item = Op.getOperand(Idx);
10539       if (Op.getNode()->isOnlyUserOf(Item.getNode()))
10540         return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
10541     }
10542     return SDValue();
10543   }
10544 
10545   // A vector full of immediates; various special cases are already
10546   // handled, so this is best done with a single constant-pool load.
10547   if (IsAllConstants)
10548     return SDValue();
10549 
10550   if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget))
10551       return V;
10552 
10553   // See if we can use a vector load to get all of the elements.
10554   {
10555     SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
10556     if (SDValue LD =
10557             EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
10558       return LD;
10559   }
10560 
10561   // If this is a splat of pairs of 32-bit elements, we can use a narrower
10562   // build_vector and broadcast it.
10563   // TODO: We could probably generalize this more.
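  // E.g. <a, b, a, b, a, b, a, b> is rebuilt as a broadcast of the single
  // 64-bit pair <a, b>.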
10564   if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
10565     SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
10566                        DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
10567     auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
10568       // Make sure all the even/odd operands match.
10569       for (unsigned i = 2; i != NumElems; ++i)
10570         if (Ops[i % 2] != Op.getOperand(i))
10571           return false;
10572       return true;
10573     };
10574     if (CanSplat(Op, NumElems, Ops)) {
10575       MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
10576       MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
10577       // Create a new build vector and cast to v2i64/v2f64.
10578       SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
10579                                      DAG.getBuildVector(NarrowVT, dl, Ops));
10580       // Broadcast from v2i64/v2f64 and cast to final VT.
10581       MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems / 2);
10582       return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
10583                                             NewBV));
10584     }
10585   }
10586 
10587   // For AVX-length vectors, build the individual 128-bit pieces and use
10588   // shuffles to put them in place.
10589   if (VT.getSizeInBits() > 128) {
10590     MVT HVT = MVT::getVectorVT(EltVT, NumElems / 2);
10591 
10592     // Build both the lower and upper subvector.
10593     SDValue Lower =
10594         DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
10595     SDValue Upper = DAG.getBuildVector(
10596         HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));
10597 
10598     // Recreate the wider vector with the lower and upper part.
10599     return concatSubVectors(Lower, Upper, DAG, dl);
10600   }
10601 
10602   // Let the legalizer expand 2-wide build_vectors.
10603   if (EVTBits == 64) {
10604     if (NumNonZero == 1) {
10605       // One half is zero or undef.
10606       unsigned Idx = NonZeroMask.countTrailingZeros();
10607       SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
10608                                Op.getOperand(Idx));
10609       return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
10610     }
10611     return SDValue();
10612   }
10613 
10614   // If element VT is < 32 bits, convert it to inserts into a zero vector.
10615   if (EVTBits == 8 && NumElems == 16)
10616     if (SDValue V = LowerBuildVectorv16i8(Op, NonZeroMask, NumNonZero, NumZero,
10617                                           DAG, Subtarget))
10618       return V;
10619 
10620   if (EVTBits == 16 && NumElems == 8)
10621     if (SDValue V = LowerBuildVectorv8i16(Op, NonZeroMask, NumNonZero, NumZero,
10622                                           DAG, Subtarget))
10623       return V;
10624 
10625   // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
10626   if (EVTBits == 32 && NumElems == 4)
10627     if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
10628       return V;
10629 
10630   // If element VT is == 32 bits, turn it into a number of shuffles.
10631   if (NumElems == 4 && NumZero > 0) {
10632     SmallVector<SDValue, 8> Ops(NumElems);
10633     for (unsigned i = 0; i < 4; ++i) {
10634       bool isZero = !NonZeroMask[i];
10635       if (isZero)
10636         Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
10637       else
10638         Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
10639     }
10640 
10641     for (unsigned i = 0; i < 2; ++i) {
10642       switch (NonZeroMask.extractBitsAsZExtValue(2, i * 2)) {
10643         default: llvm_unreachable("Unexpected NonZero count");
10644         case 0:
10645           Ops[i] = Ops[i*2];  // Must be a zero vector.
10646           break;
10647         case 1:
10648           Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
10649           break;
10650         case 2:
10651           Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
10652           break;
10653         case 3:
10654           Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
10655           break;
10656       }
10657     }
10658 
10659     bool Reverse1 = NonZeroMask.extractBitsAsZExtValue(2, 0) == 2;
10660     bool Reverse2 = NonZeroMask.extractBitsAsZExtValue(2, 2) == 2;
10661     int MaskVec[] = {
10662       Reverse1 ? 1 : 0,
10663       Reverse1 ? 0 : 1,
10664       static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
10665       static_cast<int>(Reverse2 ? NumElems   : NumElems+1)
10666     };
10667     return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
10668   }
10669 
10670   assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
10671 
10672   // Check for a build vector built mostly from a shuffle plus a few inserts.
10673   if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
10674     return Sh;
10675 
10676   // For SSE 4.1, use insertps to insert each element into place one at a time.
10677   if (Subtarget.hasSSE41()) {
10678     SDValue Result;
10679     if (!Op.getOperand(0).isUndef())
10680       Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
10681     else
10682       Result = DAG.getUNDEF(VT);
10683 
10684     for (unsigned i = 1; i < NumElems; ++i) {
10685       if (Op.getOperand(i).isUndef()) continue;
10686       Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
10687                            Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
10688     }
10689     return Result;
10690   }
10691 
10692   // Otherwise, expand into a number of unpckl*; start by extending each of
10693   // our (non-undef) elements to the full vector width with the element in the
10694   // bottom slot of the vector (which generates no code for SSE).
10695   SmallVector<SDValue, 8> Ops(NumElems);
10696   for (unsigned i = 0; i < NumElems; ++i) {
10697     if (!Op.getOperand(i).isUndef())
10698       Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
10699     else
10700       Ops[i] = DAG.getUNDEF(VT);
10701   }
10702 
10703   // Next, we iteratively mix elements, e.g. for v4f32:
10704   //   Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
10705   //         : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
10706   //   Step 2: unpcklpd X, Y ==>    <3, 2, 1, 0>
10707   for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
10708     // Generate scaled UNPCKL shuffle mask.
10709     SmallVector<int, 16> Mask;
10710     for(unsigned i = 0; i != Scale; ++i)
10711       Mask.push_back(i);
10712     for (unsigned i = 0; i != Scale; ++i)
10713       Mask.push_back(NumElems+i);
10714     Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
10715 
10716     for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
10717       Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
10718   }
10719   return Ops[0];
10720 }
10721 
10722 // 256-bit AVX can use the vinsertf128 instruction
10723 // to create 256-bit vectors from two other 128-bit ones.
10724 // TODO: Detect subvector broadcast here instead of DAG combine?
10725 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
10726                                       const X86Subtarget &Subtarget) {
10727   SDLoc dl(Op);
10728   MVT ResVT = Op.getSimpleValueType();
10729 
10730   assert((ResVT.is256BitVector() ||
10731           ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
10732 
10733   unsigned NumOperands = Op.getNumOperands();
10734   unsigned NumZero = 0;
10735   unsigned NumNonZero = 0;
10736   unsigned NonZeros = 0;
10737   for (unsigned i = 0; i != NumOperands; ++i) {
10738     SDValue SubVec = Op.getOperand(i);
10739     if (SubVec.isUndef())
10740       continue;
10741     if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
10742       ++NumZero;
10743     else {
10744       assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
10745       NonZeros |= 1 << i;
10746       ++NumNonZero;
10747     }
10748   }
10749 
10750   // If we have more than 2 non-zeros, build each half separately.
10751   if (NumNonZero > 2) {
10752     MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
10753     ArrayRef<SDUse> Ops = Op->ops();
10754     SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
10755                              Ops.slice(0, NumOperands/2));
10756     SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
10757                              Ops.slice(NumOperands/2));
10758     return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
10759   }
10760 
10761   // Otherwise, build it up through insert_subvectors.
10762   SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
10763                         : DAG.getUNDEF(ResVT);
10764 
10765   MVT SubVT = Op.getOperand(0).getSimpleValueType();
10766   unsigned NumSubElems = SubVT.getVectorNumElements();
10767   for (unsigned i = 0; i != NumOperands; ++i) {
10768     if ((NonZeros & (1 << i)) == 0)
10769       continue;
10770 
10771     Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec,
10772                       Op.getOperand(i),
10773                       DAG.getIntPtrConstant(i * NumSubElems, dl));
10774   }
10775 
10776   return Vec;
10777 }
10778 
10779 // Lower a vXi1 CONCAT_VECTORS of mask (k-register) operands using KSHIFTL,
10780 // INSERT_SUBVECTOR or KUNPCK, taking care to zero the result elements that
10781 // come from all-zero operands.
10782 // TODO: Merge this with LowerAVXCONCAT_VECTORS?
10783 static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
10784                                        const X86Subtarget &Subtarget,
10785                                        SelectionDAG & DAG) {
10786   SDLoc dl(Op);
10787   MVT ResVT = Op.getSimpleValueType();
10788   unsigned NumOperands = Op.getNumOperands();
10789 
10790   assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
10791          "Unexpected number of operands in CONCAT_VECTORS");
10792 
10793   uint64_t Zeros = 0;
10794   uint64_t NonZeros = 0;
10795   for (unsigned i = 0; i != NumOperands; ++i) {
10796     SDValue SubVec = Op.getOperand(i);
10797     if (SubVec.isUndef())
10798       continue;
10799     assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
10800     if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
10801       Zeros |= (uint64_t)1 << i;
10802     else
10803       NonZeros |= (uint64_t)1 << i;
10804   }
10805 
10806   unsigned NumElems = ResVT.getVectorNumElements();
10807 
10808   // If we are inserting a non-zero vector and there are zeros in the LSBs and
10809   // undef in the MSBs, we need to emit a KSHIFTL. The generic lowering to
10810   // insert_subvector will give us two kshifts.
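  // For example, concatenating (v2i1 zero), (v2i1 x), undef, undef into v8i1
  // gives Zeros = 0b0001 and NonZeros = 0b0010, so a single KSHIFTL of x by 2
  // produces the required zeroed low elements directly.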
10811   if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
10812       Log2_64(NonZeros) != NumOperands - 1) {
10813     MVT ShiftVT = ResVT;
10814     if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
10815       ShiftVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
10816     unsigned Idx = Log2_64(NonZeros);
10817     SDValue SubVec = Op.getOperand(Idx);
10818     unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
10819     SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ShiftVT,
10820                          DAG.getUNDEF(ShiftVT), SubVec,
10821                          DAG.getIntPtrConstant(0, dl));
10822     Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, SubVec,
10823                      DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8));
10824     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op,
10825                        DAG.getIntPtrConstant(0, dl));
10826   }
10827 
10828   // If there are zero or one non-zeros we can handle this very simply.
10829   if (NonZeros == 0 || isPowerOf2_64(NonZeros)) {
10830     SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT);
10831     if (!NonZeros)
10832       return Vec;
10833     unsigned Idx = Log2_64(NonZeros);
10834     SDValue SubVec = Op.getOperand(Idx);
10835     unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
10836     return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
10837                        DAG.getIntPtrConstant(Idx * SubVecNumElts, dl));
10838   }
10839 
10840   if (NumOperands > 2) {
10841     MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
10842     ArrayRef<SDUse> Ops = Op->ops();
10843     SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
10844                              Ops.slice(0, NumOperands/2));
10845     SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
10846                              Ops.slice(NumOperands/2));
10847     return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
10848   }
10849 
10850   assert(countPopulation(NonZeros) == 2 && "Simple cases not handled?");
10851 
10852   if (ResVT.getVectorNumElements() >= 16)
10853     return Op; // The operation is legal with KUNPCK
10854 
10855   SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT,
10856                             DAG.getUNDEF(ResVT), Op.getOperand(0),
10857                             DAG.getIntPtrConstant(0, dl));
10858   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
10859                      DAG.getIntPtrConstant(NumElems/2, dl));
10860 }
10861 
10862 static SDValue LowerCONCAT_VECTORS(SDValue Op,
10863                                    const X86Subtarget &Subtarget,
10864                                    SelectionDAG &DAG) {
10865   MVT VT = Op.getSimpleValueType();
10866   if (VT.getVectorElementType() == MVT::i1)
10867     return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
10868 
10869   assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
10870          (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
10871           Op.getNumOperands() == 4)));
10872 
10873   // AVX can use the vinsertf128 instruction to create 256-bit vectors
10874   // from two other 128-bit ones.
10875 
10876   // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
10877   return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget);
10878 }
10879 
10880 //===----------------------------------------------------------------------===//
10881 // Vector shuffle lowering
10882 //
10883 // This is an experimental code path for lowering vector shuffles on x86. It is
10884 // designed to handle arbitrary vector shuffles and blends, gracefully
10885 // degrading performance as necessary. It works hard to recognize idiomatic
10886 // shuffles and lower them to optimal instruction patterns without leaving
10887 // a framework that allows reasonably efficient handling of all vector shuffle
10888 // patterns.
10889 //===----------------------------------------------------------------------===//
10890 
10891 /// Tiny helper function to identify a no-op mask.
10892 ///
10893 /// This is a somewhat boring predicate function. It checks whether the mask
10894 /// array input, which is assumed to be a single-input shuffle mask of the kind
10895 /// used by the X86 shuffle instructions (not a fully general
10896 /// ShuffleVectorSDNode mask), requires any shuffles to occur. Both undef and an
10897 /// in-place shuffle are 'no-op's.
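///
/// For example, <0, -1, 2, 3> is a no-op mask, while <1, 0, 2, 3> is not.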
10898 static bool isNoopShuffleMask(ArrayRef<int> Mask) {
10899   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
10900     assert(Mask[i] >= -1 && "Out of bound mask element!");
10901     if (Mask[i] >= 0 && Mask[i] != i)
10902       return false;
10903   }
10904   return true;
10905 }
10906 
10907 /// Test whether there are elements crossing LaneSizeInBits lanes in this
10908 /// shuffle mask.
10909 ///
10910 /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
10911 /// and we routinely test for these.
10912 static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits,
10913                                       unsigned ScalarSizeInBits,
10914                                       ArrayRef<int> Mask) {
10915   assert(LaneSizeInBits && ScalarSizeInBits &&
10916          (LaneSizeInBits % ScalarSizeInBits) == 0 &&
10917          "Illegal shuffle lane size");
10918   int LaneSize = LaneSizeInBits / ScalarSizeInBits;
10919   int Size = Mask.size();
10920   for (int i = 0; i < Size; ++i)
10921     if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
10922       return true;
10923   return false;
10924 }
10925 
10926 /// Test whether there are elements crossing 128-bit lanes in this
10927 /// shuffle mask.
10928 static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
10929   return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);
10930 }
10931 
10932 /// Test whether elements in each LaneSizeInBits lane in this shuffle mask come
10933 /// from multiple lanes - this is different from isLaneCrossingShuffleMask to
10934 /// better support 'repeated mask + lane permute' style shuffles.
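///
/// For example, for v8f32 the mask <4, 5, 6, 7, 0, 1, 2, 3> crosses 128-bit
/// lanes but is not multi-lane (each lane reads from a single source lane),
/// whereas <0, 4, 1, 5, 2, 6, 3, 7> mixes both source lanes within each lane.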
10935 static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits,
10936                                    unsigned ScalarSizeInBits,
10937                                    ArrayRef<int> Mask) {
10938   assert(LaneSizeInBits && ScalarSizeInBits &&
10939          (LaneSizeInBits % ScalarSizeInBits) == 0 &&
10940          "Illegal shuffle lane size");
10941   int NumElts = Mask.size();
10942   int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits;
10943   int NumLanes = NumElts / NumEltsPerLane;
10944   if (NumLanes > 1) {
10945     for (int i = 0; i != NumLanes; ++i) {
10946       int SrcLane = -1;
10947       for (int j = 0; j != NumEltsPerLane; ++j) {
10948         int M = Mask[(i * NumEltsPerLane) + j];
10949         if (M < 0)
10950           continue;
10951         int Lane = (M % NumElts) / NumEltsPerLane;
10952         if (SrcLane >= 0 && SrcLane != Lane)
10953           return true;
10954         SrcLane = Lane;
10955       }
10956     }
10957   }
10958   return false;
10959 }
10960 
10961 /// Test whether a shuffle mask is equivalent within each sub-lane.
10962 ///
10963 /// This checks a shuffle mask to see if it is performing the same
10964 /// lane-relative shuffle in each sub-lane. This trivially implies
10965 /// that it is also not lane-crossing. It may however involve a blend from the
10966 /// same lane of a second vector.
10967 ///
10968 /// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
10969 /// non-trivial to compute in the face of undef lanes. The representation is
10970 /// suitable for use with existing 128-bit shuffles as entries from the second
10971 /// vector have been remapped to [LaneSize, 2*LaneSize).
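///
/// For example, the v8f32 mask <0, 9, 2, 11, 4, 13, 6, 15> repeats the same
/// pattern in both 128-bit lanes, producing the repeated mask <0, 5, 2, 7>
/// with the second-vector entries remapped into [4, 8).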
10972 static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
10973                                   ArrayRef<int> Mask,
10974                                   SmallVectorImpl<int> &RepeatedMask) {
10975   auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
10976   RepeatedMask.assign(LaneSize, -1);
10977   int Size = Mask.size();
10978   for (int i = 0; i < Size; ++i) {
10979     assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
10980     if (Mask[i] < 0)
10981       continue;
10982     if ((Mask[i] % Size) / LaneSize != i / LaneSize)
10983       // This entry crosses lanes, so there is no way to model this shuffle.
10984       return false;
10985 
10986     // Ok, handle the in-lane shuffles by detecting if and when they repeat.
10987     // Adjust second vector indices to start at LaneSize instead of Size.
10988     int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
10989                                 : Mask[i] % LaneSize + LaneSize;
10990     if (RepeatedMask[i % LaneSize] < 0)
10991       // This is the first non-undef entry in this slot of a 128-bit lane.
10992       RepeatedMask[i % LaneSize] = LocalM;
10993     else if (RepeatedMask[i % LaneSize] != LocalM)
10994       // Found a mismatch with the repeated mask.
10995       return false;
10996   }
10997   return true;
10998 }
10999 
11000 /// Test whether a shuffle mask is equivalent within each 128-bit lane.
11001 static bool
11002 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
11003                                 SmallVectorImpl<int> &RepeatedMask) {
11004   return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
11005 }
11006 
11007 static bool
11008 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
11009   SmallVector<int, 32> RepeatedMask;
11010   return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
11011 }
11012 
11013 /// Test whether a shuffle mask is equivalent within each 256-bit lane.
11014 static bool
11015 is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
11016                                 SmallVectorImpl<int> &RepeatedMask) {
11017   return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
11018 }
11019 
11020 /// Test whether a target shuffle mask is equivalent within each sub-lane.
11021 /// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
11022 static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits,
11023                                         unsigned EltSizeInBits,
11024                                         ArrayRef<int> Mask,
11025                                         SmallVectorImpl<int> &RepeatedMask) {
11026   int LaneSize = LaneSizeInBits / EltSizeInBits;
11027   RepeatedMask.assign(LaneSize, SM_SentinelUndef);
11028   int Size = Mask.size();
11029   for (int i = 0; i < Size; ++i) {
11030     assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
11031     if (Mask[i] == SM_SentinelUndef)
11032       continue;
11033     if (Mask[i] == SM_SentinelZero) {
11034       if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
11035         return false;
11036       RepeatedMask[i % LaneSize] = SM_SentinelZero;
11037       continue;
11038     }
11039     if ((Mask[i] % Size) / LaneSize != i / LaneSize)
11040       // This entry crosses lanes, so there is no way to model this shuffle.
11041       return false;
11042 
11043     // Handle the in-lane shuffles by detecting if and when they repeat. Adjust
11044     // later vector indices to start at multiples of LaneSize instead of Size.
11045     int LaneM = Mask[i] / Size;
11046     int LocalM = (Mask[i] % LaneSize) + (LaneM * LaneSize);
11047     if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
11048       // This is the first non-undef entry in this slot of a 128-bit lane.
11049       RepeatedMask[i % LaneSize] = LocalM;
11050     else if (RepeatedMask[i % LaneSize] != LocalM)
11051       // Found a mismatch with the repeated mask.
11052       return false;
11053   }
11054   return true;
11055 }
11056 
11057 /// Test whether a target shuffle mask is equivalent within each sub-lane.
11058 /// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
11059 static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
11060                                         ArrayRef<int> Mask,
11061                                         SmallVectorImpl<int> &RepeatedMask) {
11062   return isRepeatedTargetShuffleMask(LaneSizeInBits, VT.getScalarSizeInBits(),
11063                                      Mask, RepeatedMask);
11064 }
11065 
11066 /// Checks whether the vector elements referenced by two shuffle masks are
11067 /// equivalent.
11068 static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
11069                                 int Idx, int ExpectedIdx) {
11070   assert(0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx &&
11071          ExpectedIdx < MaskSize && "Out of range element index");
11072   if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode())
11073     return false;
11074 
11075   switch (Op.getOpcode()) {
11076   case ISD::BUILD_VECTOR:
11077     // If the values are build vectors, we can look through them to find
11078     // equivalent inputs that make the shuffles equivalent.
11079     // TODO: Handle MaskSize != Op.getNumOperands()?
11080     if (MaskSize == (int)Op.getNumOperands() &&
11081         MaskSize == (int)ExpectedOp.getNumOperands())
11082       return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
11083     break;
11084   case X86ISD::VBROADCAST:
11085   case X86ISD::VBROADCAST_LOAD:
11086     // TODO: Handle MaskSize != Op.getValueType().getVectorNumElements()?
11087     return (Op == ExpectedOp &&
11088             (int)Op.getValueType().getVectorNumElements() == MaskSize);
11089   case X86ISD::HADD:
11090   case X86ISD::HSUB:
11091   case X86ISD::FHADD:
11092   case X86ISD::FHSUB:
11093   case X86ISD::PACKSS:
11094   case X86ISD::PACKUS:
11095     // HOP(X,X) can refer to the elt from the lower/upper half of a lane.
11096     // TODO: Handle MaskSize != NumElts?
11097     // TODO: Handle HOP(X,Y) vs HOP(Y,X) equivalence cases.
11098     if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) {
11099       MVT VT = Op.getSimpleValueType();
11100       int NumElts = VT.getVectorNumElements();
11101       if (MaskSize == NumElts) {
11102         int NumLanes = VT.getSizeInBits() / 128;
11103         int NumEltsPerLane = NumElts / NumLanes;
11104         int NumHalfEltsPerLane = NumEltsPerLane / 2;
11105         bool SameLane =
11106             (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane);
11107         bool SameElt =
11108             (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane);
11109         return SameLane && SameElt;
11110       }
11111     }
11112     break;
11113   }
11114 
11115   return false;
11116 }
11117 
11118 /// Checks whether a shuffle mask is equivalent to an explicit list of
11119 /// arguments.
11120 ///
11121 /// This is a fast way to test a shuffle mask against a fixed pattern:
11122 ///
11123 ///   if (isShuffleEquivalent(Mask, {3, 2, 1, 0})) { ... }
11124 ///
11125 /// It returns true if the mask is exactly as wide as the argument list, and
11126 /// each element of the mask is either -1 (signifying undef) or the value given
11127 /// in the argument.
11128 static bool isShuffleEquivalent(ArrayRef<int> Mask, ArrayRef<int> ExpectedMask,
11129                                 SDValue V1 = SDValue(),
11130                                 SDValue V2 = SDValue()) {
11131   int Size = Mask.size();
11132   if (Size != (int)ExpectedMask.size())
11133     return false;
11134 
11135   for (int i = 0; i < Size; ++i) {
11136     assert(Mask[i] >= -1 && "Out of bound mask element!");
11137     int MaskIdx = Mask[i];
11138     int ExpectedIdx = ExpectedMask[i];
11139     if (0 <= MaskIdx && MaskIdx != ExpectedIdx) {
11140       SDValue MaskV = MaskIdx < Size ? V1 : V2;
11141       SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
11142       MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
11143       ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
11144       if (!IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
11145         return false;
11146     }
11147   }
11148   return true;
11149 }
11150 
11151 /// Checks whether a target shuffle mask is equivalent to an explicit pattern.
11152 ///
11153 /// The masks must be exactly the same width.
11154 ///
11155 /// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
11156 /// value in ExpectedMask is always accepted. Otherwise the indices must match.
11157 ///
11158 /// SM_SentinelZero is accepted as a valid negative index but must match in
11159 /// both.
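///
/// For example, Mask <-1, 1, SM_SentinelZero, 3> matches ExpectedMask
/// <0, 1, SM_SentinelZero, 3>, but Mask <0, 1, 2, 3> does not, since a zero
/// sentinel in ExpectedMask must also be a zero sentinel in Mask.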
11160 static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask,
11161                                       ArrayRef<int> ExpectedMask,
11162                                       SDValue V1 = SDValue(),
11163                                       SDValue V2 = SDValue()) {
11164   int Size = Mask.size();
11165   if (Size != (int)ExpectedMask.size())
11166     return false;
11167   assert(isUndefOrZeroOrInRange(ExpectedMask, 0, 2 * Size) &&
11168          "Illegal target shuffle mask");
11169 
11170   // Check for out-of-range target shuffle mask indices.
11171   if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))
11172     return false;
11173 
11174   // Don't use V1/V2 if they're not the same size as the shuffle mask type.
11175   if (V1 && V1.getValueSizeInBits() != VT.getSizeInBits())
11176     V1 = SDValue();
11177   if (V2 && V2.getValueSizeInBits() != VT.getSizeInBits())
11178     V2 = SDValue();
11179 
11180   for (int i = 0; i < Size; ++i) {
11181     int MaskIdx = Mask[i];
11182     int ExpectedIdx = ExpectedMask[i];
11183     if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx)
11184       continue;
11185     if (0 <= MaskIdx && 0 <= ExpectedIdx) {
11186       SDValue MaskV = MaskIdx < Size ? V1 : V2;
11187       SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
11188       MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
11189       ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
11190       if (IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
11191         continue;
11192     }
11193     // TODO - handle SM_Sentinel equivalences.
11194     return false;
11195   }
11196   return true;
11197 }
11198 
11199 // Attempt to create a shuffle mask from a VSELECT condition mask.
11200 static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask,
11201                                          SDValue Cond) {
11202   EVT CondVT = Cond.getValueType();
11203   unsigned EltSizeInBits = CondVT.getScalarSizeInBits();
11204   unsigned NumElts = CondVT.getVectorNumElements();
11205 
11206   APInt UndefElts;
11207   SmallVector<APInt, 32> EltBits;
11208   if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits,
11209                                      true, false))
11210     return false;
11211 
11212   Mask.resize(NumElts, SM_SentinelUndef);
11213 
11214   for (int i = 0; i != (int)NumElts; ++i) {
11215     Mask[i] = i;
11216     // Arbitrarily choose from the 2nd operand if the select condition element
11217     // is undef.
11218     // TODO: Can we do better by matching patterns such as even/odd?
11219     if (UndefElts[i] || EltBits[i].isNullValue())
11220       Mask[i] += NumElts;
11221   }
11222 
11223   return true;
11224 }
11225 
11226 // Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
11227 // instructions.
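// For example, for v8i32/v8f32 this accepts masks equivalent to
// <0, 8, 1, 9, 2, 10, 3, 11> (lo) or <4, 12, 5, 13, 6, 14, 7, 15> (hi).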
11228 static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
11229   if (VT != MVT::v8i32 && VT != MVT::v8f32)
11230     return false;
11231 
11232   SmallVector<int, 8> Unpcklwd;
11233   createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
11234                           /* Unary = */ false);
11235   SmallVector<int, 8> Unpckhwd;
11236   createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
11237                           /* Unary = */ false);
11238   bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd) ||
11239                          isTargetShuffleEquivalent(VT, Mask, Unpckhwd));
11240   return IsUnpackwdMask;
11241 }
11242 
11243 static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask) {
11244   // Create 128-bit vector type based on mask size.
11245   MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
11246   MVT VT = MVT::getVectorVT(EltVT, Mask.size());
11247 
11248   // We can't assume a canonical shuffle mask, so try the commuted version too.
11249   SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
11250   ShuffleVectorSDNode::commuteMask(CommutedMask);
11251 
11252   // Match any of unary/binary or low/high.
11253   for (unsigned i = 0; i != 4; ++i) {
11254     SmallVector<int, 16> UnpackMask;
11255     createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
11256     if (isTargetShuffleEquivalent(VT, Mask, UnpackMask) ||
11257         isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask))
11258       return true;
11259   }
11260   return false;
11261 }
11262 
11263 /// Return true if a shuffle mask chooses elements identically in its top and
11264 /// bottom halves. For example, any splat mask has the same top and bottom
11265 /// halves. If an element is undefined in only one half of the mask, the halves
11266 /// are not considered identical.
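/// For example, <0, 1, 0, 1> has identical halves, but <0, 1, 0, -1> does not.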
11267 static bool hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) {
11268   assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
11269   unsigned HalfSize = Mask.size() / 2;
11270   for (unsigned i = 0; i != HalfSize; ++i) {
11271     if (Mask[i] != Mask[i + HalfSize])
11272       return false;
11273   }
11274   return true;
11275 }
11276 
11277 /// Get a 4-lane 8-bit shuffle immediate for a mask.
11278 ///
11279 /// This helper function produces an 8-bit shuffle immediate corresponding to
11280 /// the ubiquitous shuffle encoding scheme used in x86 instructions for
11281 /// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
11282 /// example.
11283 ///
11284 /// NB: We rely heavily on "undef" masks preserving the input lane.
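///
/// For illustration, the full-reverse mask <3, 2, 1, 0> encodes as
///   (0 << 6) | (1 << 4) | (2 << 2) | (3 << 0) = 0x1B,
/// while a mask using only element 2, e.g. <-1, 2, -1, -1>, is splatted to
/// 0xAA to improve later broadcast matching.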
11285 static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
11286   assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
11287   assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
11288   assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
11289   assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
11290   assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
11291 
11292   // If the mask only uses one non-undef element, then fully 'splat' it to
11293   // improve later broadcast matching.
11294   int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
11295   assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask");
11296 
11297   int FirstElt = Mask[FirstIndex];
11298   if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }))
11299     return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;
11300 
11301   unsigned Imm = 0;
11302   Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
11303   Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
11304   Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
11305   Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
11306   return Imm;
11307 }
11308 
11309 static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
11310                                           SelectionDAG &DAG) {
11311   return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
11312 }
11313 
11314 // The shuffle result has the form 0*a[0] 0*a[1] ... 0*a[n], n >= 0, where
11315 // '0*' denotes zero or more zero elements and the a[] elements appear in
11316 // increasing order.
11317 // Each element of Zeroable corresponds to a particular element of Mask, as
11318 // described in computeZeroableShuffleElements.
11319 //
11320 // The function looks for a sub-mask whose non-zero elements are in
11321 // increasing order; if such a sub-mask exists, it returns true.
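//
// For illustration (v4i32): if only Mask[3] is zeroable and the mask is
// <0, 1, 2, 3>, this returns true with IsZeroSideLeft == false; if Mask[0]
// and Mask[1] are zeroable and the mask is <0, 0, 4, 5>, the non-zero
// elements 4, 5 come consecutively from the second source and
// IsZeroSideLeft == true.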
11321 static bool isNonZeroElementsInOrder(const APInt &Zeroable,
11322                                      ArrayRef<int> Mask, const EVT &VectorType,
11323                                      bool &IsZeroSideLeft) {
11324   int NextElement = -1;
11325   // Check if the Mask's nonzero elements are in increasing order.
11326   for (int i = 0, e = Mask.size(); i < e; i++) {
11327     // Checks if the mask's zeros elements are built from only zeros.
11328     assert(Mask[i] >= -1 && "Out of bound mask element!");
11329     if (Mask[i] < 0)
11330       return false;
11331     if (Zeroable[i])
11332       continue;
11333     // Find the lowest non zero element
11334     if (NextElement < 0) {
11335       NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
11336       IsZeroSideLeft = NextElement != 0;
11337     }
11338     // Exit if the mask's non zero elements are not in increasing order.
11339     if (NextElement != Mask[i])
11340       return false;
11341     NextElement++;
11342   }
11343   return true;
11344 }
11345 
11346 /// Try to lower a shuffle with a single PSHUFB of V1 or V2.
11347 static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
11348                                       ArrayRef<int> Mask, SDValue V1,
11349                                       SDValue V2, const APInt &Zeroable,
11350                                       const X86Subtarget &Subtarget,
11351                                       SelectionDAG &DAG) {
11352   int Size = Mask.size();
11353   int LaneSize = 128 / VT.getScalarSizeInBits();
11354   const int NumBytes = VT.getSizeInBits() / 8;
11355   const int NumEltBytes = VT.getScalarSizeInBits() / 8;
11356 
11357   assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
11358          (Subtarget.hasAVX2() && VT.is256BitVector()) ||
11359          (Subtarget.hasBWI() && VT.is512BitVector()));
11360 
11361   SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
11362   // Sign bit set in i8 mask means zero element.
11363   SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
11364 
11365   SDValue V;
11366   for (int i = 0; i < NumBytes; ++i) {
11367     int M = Mask[i / NumEltBytes];
11368     if (M < 0) {
11369       PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
11370       continue;
11371     }
11372     if (Zeroable[i / NumEltBytes]) {
11373       PSHUFBMask[i] = ZeroMask;
11374       continue;
11375     }
11376 
11377     // We can only use a single input of V1 or V2.
11378     SDValue SrcV = (M >= Size ? V2 : V1);
11379     if (V && V != SrcV)
11380       return SDValue();
11381     V = SrcV;
11382     M %= Size;
11383 
11384     // PSHUFB can't cross lanes, ensure this doesn't happen.
11385     if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
11386       return SDValue();
11387 
11388     M = M % LaneSize;
11389     M = M * NumEltBytes + (i % NumEltBytes);
11390     PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
11391   }
11392   assert(V && "Failed to find a source input");
11393 
11394   MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
11395   return DAG.getBitcast(
11396       VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
11397                       DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
11398 }
11399 
11400 static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
11401                            const X86Subtarget &Subtarget, SelectionDAG &DAG,
11402                            const SDLoc &dl);
11403 
11404 // X86 has a dedicated shuffle that can be lowered to VEXPAND
11405 static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT,
11406                                     const APInt &Zeroable,
11407                                     ArrayRef<int> Mask, SDValue &V1,
11408                                     SDValue &V2, SelectionDAG &DAG,
11409                                     const X86Subtarget &Subtarget) {
11410   bool IsLeftZeroSide = true;
11411   if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
11412                                 IsLeftZeroSide))
11413     return SDValue();
11414   unsigned VEXPANDMask = (~Zeroable).getZExtValue();
11415   MVT IntegerType =
11416       MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
11417   SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
11418   unsigned NumElts = VT.getVectorNumElements();
11419   assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
11420          "Unexpected number of vector elements");
11421   SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
11422                               Subtarget, DAG, DL);
11423   SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
11424   SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
11425   return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
11426 }
11427 
11428 static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
11429                                   unsigned &UnpackOpcode, bool IsUnary,
11430                                   ArrayRef<int> TargetMask, const SDLoc &DL,
11431                                   SelectionDAG &DAG,
11432                                   const X86Subtarget &Subtarget) {
11433   int NumElts = VT.getVectorNumElements();
11434 
11435   bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
11436   for (int i = 0; i != NumElts; i += 2) {
11437     int M1 = TargetMask[i + 0];
11438     int M2 = TargetMask[i + 1];
11439     Undef1 &= (SM_SentinelUndef == M1);
11440     Undef2 &= (SM_SentinelUndef == M2);
11441     Zero1 &= isUndefOrZero(M1);
11442     Zero2 &= isUndefOrZero(M2);
11443   }
11444   assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
11445          "Zeroable shuffle detected");
11446 
11447   // Attempt to match the target mask against the unpack lo/hi mask patterns.
11448   SmallVector<int, 64> Unpckl, Unpckh;
11449   createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
11450   if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, V1,
11451                                 (IsUnary ? V1 : V2))) {
11452     UnpackOpcode = X86ISD::UNPCKL;
11453     V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
11454     V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
11455     return true;
11456   }
11457 
11458   createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
11459   if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, V1,
11460                                 (IsUnary ? V1 : V2))) {
11461     UnpackOpcode = X86ISD::UNPCKH;
11462     V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
11463     V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
11464     return true;
11465   }
11466 
11467   // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
11468   if (IsUnary && (Zero1 || Zero2)) {
11469     // Don't bother if we can blend instead.
11470     if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
11471         isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
11472       return false;
11473 
11474     bool MatchLo = true, MatchHi = true;
11475     for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
11476       int M = TargetMask[i];
11477 
11478       // Ignore if the input is known to be zero or the index is undef.
11479       if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
11480           (M == SM_SentinelUndef))
11481         continue;
11482 
11483       MatchLo &= (M == Unpckl[i]);
11484       MatchHi &= (M == Unpckh[i]);
11485     }
11486 
11487     if (MatchLo || MatchHi) {
11488       UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
11489       V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
11490       V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
11491       return true;
11492     }
11493   }
11494 
11495   // If a binary shuffle, commute and try again.
11496   if (!IsUnary) {
11497     ShuffleVectorSDNode::commuteMask(Unpckl);
11498     if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl)) {
11499       UnpackOpcode = X86ISD::UNPCKL;
11500       std::swap(V1, V2);
11501       return true;
11502     }
11503 
11504     ShuffleVectorSDNode::commuteMask(Unpckh);
11505     if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh)) {
11506       UnpackOpcode = X86ISD::UNPCKH;
11507       std::swap(V1, V2);
11508       return true;
11509     }
11510   }
11511 
11512   return false;
11513 }
11514 
11515 // X86 has dedicated unpack instructions that can handle specific blend
11516 // operations: UNPCKH and UNPCKL.
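// For example, with v4f32 inputs the mask <0, 4, 1, 5> lowers directly to
// UNPCKL(V1, V2), and <6, 2, 7, 3> lowers to UNPCKH(V2, V1) after commuting.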
11517 static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT,
11518                                      ArrayRef<int> Mask, SDValue V1, SDValue V2,
11519                                      SelectionDAG &DAG) {
11520   SmallVector<int, 8> Unpckl;
11521   createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
11522   if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
11523     return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
11524 
11525   SmallVector<int, 8> Unpckh;
11526   createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
11527   if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
11528     return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
11529 
11530   // Commute and try again.
11531   ShuffleVectorSDNode::commuteMask(Unpckl);
11532   if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
11533     return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
11534 
11535   ShuffleVectorSDNode::commuteMask(Unpckh);
11536   if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
11537     return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
11538 
11539   return SDValue();
11540 }
11541 
11542 /// Check if the mask can be mapped to a preliminary shuffle (vperm of 64-bit
11543 /// elements) followed by a 256-bit unpack.
11544 static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT,
11545                                         ArrayRef<int> Mask, SDValue V1,
11546                                         SDValue V2, SelectionDAG &DAG) {
11547   SmallVector<int, 32> Unpckl, Unpckh;
11548   createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true);
11549   createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);
11550 
11551   unsigned UnpackOpcode;
11552   if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
11553     UnpackOpcode = X86ISD::UNPCKL;
11554   else if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
11555     UnpackOpcode = X86ISD::UNPCKH;
11556   else
11557     return SDValue();
11558 
11559   // This is a "natural" unpack operation (rather than the 128-bit sectored
11560   // operation implemented by AVX). We need to rearrange 64-bit chunks of the
11561   // input in order to use the x86 instruction.
11562   V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1),
11563                             DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
11564   V1 = DAG.getBitcast(VT, V1);
11565   return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
11566 }
11567 
11568 // Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the
11569 // source into the lower elements and zeroing the upper elements.
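// For example (given AVX512VL), a v8i32 mask whose first four elements are
// <0, 2, 4, 6> and whose upper four elements are zeroable matches as a
// v4i64 -> v4i32 ISD::TRUNCATE.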
11570 static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,
11571                                  ArrayRef<int> Mask, const APInt &Zeroable,
11572                                  const X86Subtarget &Subtarget) {
11573   if (!VT.is512BitVector() && !Subtarget.hasVLX())
11574     return false;
11575 
11576   unsigned NumElts = Mask.size();
11577   unsigned EltSizeInBits = VT.getScalarSizeInBits();
11578   unsigned MaxScale = 64 / EltSizeInBits;
11579 
11580   for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
11581     unsigned SrcEltBits = EltSizeInBits * Scale;
11582     if (SrcEltBits < 32 && !Subtarget.hasBWI())
11583       continue;
11584     unsigned NumSrcElts = NumElts / Scale;
11585     if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale))
11586       continue;
11587     unsigned UpperElts = NumElts - NumSrcElts;
11588     if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnesValue())
11589       continue;
11590     SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale);
11591     SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts);
11592     DstVT = MVT::getIntegerVT(EltSizeInBits);
11593     if ((NumSrcElts * EltSizeInBits) >= 128) {
11594       // ISD::TRUNCATE
11595       DstVT = MVT::getVectorVT(DstVT, NumSrcElts);
11596     } else {
11597       // X86ISD::VTRUNC
11598       DstVT = MVT::getVectorVT(DstVT, 128 / EltSizeInBits);
11599     }
11600     return true;
11601   }
11602 
11603   return false;
11604 }
11605 
11606 // Helper to create TRUNCATE/VTRUNC nodes, optionally with zero/undef upper
11607 // element padding to the final DstVT.
11608 static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src,
11609                                   const X86Subtarget &Subtarget,
11610                                   SelectionDAG &DAG, bool ZeroUppers) {
11611   MVT SrcVT = Src.getSimpleValueType();
11612   MVT DstSVT = DstVT.getScalarType();
11613   unsigned NumDstElts = DstVT.getVectorNumElements();
11614   unsigned NumSrcElts = SrcVT.getVectorNumElements();
11615   unsigned DstEltSizeInBits = DstVT.getScalarSizeInBits();
11616 
11617   if (!DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
11618     return SDValue();
11619 
11620   // Perform a direct ISD::TRUNCATE if possible.
11621   if (NumSrcElts == NumDstElts)
11622     return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src);
11623 
11624   if (NumSrcElts > NumDstElts) {
11625     MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
11626     SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
11627     return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits());
11628   }
11629 
11630   if ((NumSrcElts * DstEltSizeInBits) >= 128) {
11631     MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
11632     SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
11633     return widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
11634                           DstVT.getSizeInBits());
11635   }
11636 
11637   // Non-VLX targets must truncate from a 512-bit type, so we need to
11638   // widen, truncate and then possibly extract the original subvector.
11639   if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) {
11640     SDValue NewSrc = widenSubVector(Src, ZeroUppers, Subtarget, DAG, DL, 512);
11641     return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers);
11642   }
11643 
11644   // Fallback to a X86ISD::VTRUNC, padding if necessary.
11645   MVT TruncVT = MVT::getVectorVT(DstSVT, 128 / DstEltSizeInBits);
11646   SDValue Trunc = DAG.getNode(X86ISD::VTRUNC, DL, TruncVT, Src);
11647   if (DstVT != TruncVT)
11648     Trunc = widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
11649                            DstVT.getSizeInBits());
11650   return Trunc;
11651 }
11652 
11653 // Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
11654 //
11655 // An example is the following:
11656 //
11657 // t0: ch = EntryToken
11658 //           t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
11659 //         t25: v4i32 = truncate t2
11660 //       t41: v8i16 = bitcast t25
11661 //       t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
11662 //       Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
11663 //     t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
11664 //   t18: v2i64 = bitcast t51
11665 //
11666 // One can just use a single vpmovdw instruction; without avx512vl we need to
11667 // use the zmm variant and extract the lower subvector, padding with zeroes.
11668 // TODO: Merge with lowerShuffleAsVTRUNC.
11669 static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1,
11670                                      SDValue V2, ArrayRef<int> Mask,
11671                                      const APInt &Zeroable,
11672                                      const X86Subtarget &Subtarget,
11673                                      SelectionDAG &DAG) {
11674   assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
11675   if (!Subtarget.hasAVX512())
11676     return SDValue();
11677 
11678   unsigned NumElts = VT.getVectorNumElements();
11679   unsigned EltSizeInBits = VT.getScalarSizeInBits();
11680   unsigned MaxScale = 64 / EltSizeInBits;
11681   for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
11682     unsigned NumSrcElts = NumElts / Scale;
11683     unsigned UpperElts = NumElts - NumSrcElts;
11684     if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
11685         !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnesValue())
11686       continue;
11687 
11688     SDValue Src = V1;
11689     if (!Src.hasOneUse())
11690       return SDValue();
11691 
11692     Src = peekThroughOneUseBitcasts(Src);
11693     if (Src.getOpcode() != ISD::TRUNCATE ||
11694         Src.getScalarValueSizeInBits() != (EltSizeInBits * Scale))
11695       return SDValue();
11696     Src = Src.getOperand(0);
11697 
11698     // VPMOVWB is only available with avx512bw.
11699     MVT SrcVT = Src.getSimpleValueType();
11700     if (SrcVT.getVectorElementType() == MVT::i16 && VT == MVT::v16i8 &&
11701         !Subtarget.hasBWI())
11702       return SDValue();
11703 
11704     bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
11705     return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
11706   }
11707 
11708   return SDValue();
11709 }
11710 
11711 // Attempt to match binary shuffle patterns as a truncate.
11712 static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
11713                                     SDValue V2, ArrayRef<int> Mask,
11714                                     const APInt &Zeroable,
11715                                     const X86Subtarget &Subtarget,
11716                                     SelectionDAG &DAG) {
11717   assert((VT.is128BitVector() || VT.is256BitVector()) &&
11718          "Unexpected VTRUNC type");
11719   if (!Subtarget.hasAVX512())
11720     return SDValue();
11721 
11722   unsigned NumElts = VT.getVectorNumElements();
11723   unsigned EltSizeInBits = VT.getScalarSizeInBits();
11724   unsigned MaxScale = 64 / EltSizeInBits;
11725   for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
11726     // TODO: Support non-BWI VPMOVWB truncations?
11727     unsigned SrcEltBits = EltSizeInBits * Scale;
11728     if (SrcEltBits < 32 && !Subtarget.hasBWI())
11729       continue;
11730 
11731     // Match shuffle <0,Scale,2*Scale,..,undef_or_zero,undef_or_zero,...>
11732     // Bail if the V2 elements are undef.
11733     unsigned NumHalfSrcElts = NumElts / Scale;
11734     unsigned NumSrcElts = 2 * NumHalfSrcElts;
11735     if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
11736         isUndefInRange(Mask, NumHalfSrcElts, NumHalfSrcElts))
11737       continue;
11738 
11739     // The elements beyond the truncation must be undef/zero.
11740     unsigned UpperElts = NumElts - NumSrcElts;
11741     if (UpperElts > 0 &&
11742         !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnesValue())
11743       continue;
11744     bool UndefUppers =
11745         UpperElts > 0 && isUndefInRange(Mask, NumSrcElts, UpperElts);
11746 
11747     // As we're using both sources, we need to concatenate them together and
11748     // truncate from the double-sized source.
11749     MVT ConcatVT = MVT::getVectorVT(VT.getScalarType(), NumElts * 2);
11750     SDValue Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
11751 
11752     MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
11753     MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
11754     Src = DAG.getBitcast(SrcVT, Src);
11755     return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
11756   }
11757 
11758   return SDValue();
11759 }
11760 
11761 /// Check whether a compaction lowering can be done by dropping even
11762 /// elements and compute how many times even elements must be dropped.
11763 ///
11764 /// This handles shuffles which take every Nth element where N is a power of
11765 /// two. Example shuffle masks:
11766 ///
11767 ///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14,  0,  2,  4,  6,  8, 10, 12, 14
11768 ///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
11769 ///  N = 2:  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12
11770 ///  N = 2:  0,  4,  8, 12, 16, 20, 24, 28,  0,  4,  8, 12, 16, 20, 24, 28
11771 ///  N = 3:  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8
11772 ///  N = 3:  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24
11773 ///
11774 /// Any of these lanes can of course be undef.
11775 ///
11776 /// This routine only supports N <= 3.
11777 /// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
11778 /// for larger N.
11779 ///
11780 /// \returns N above, or the number of times even elements must be dropped if
11781 /// there is such a number. Otherwise returns zero.
11782 static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
11783                                           bool IsSingleInput) {
11784   // The modulus for the shuffle vector entries is based on whether this is
11785   // a single input or not.
11786   int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
11787   assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
11788          "We should only be called with masks with a power-of-2 size!");
11789 
11790   uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
11791 
11792   // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
11793   // and 2^3 simultaneously. This is because we may have ambiguity with
11794   // partially undef inputs.
11795   bool ViableForN[3] = {true, true, true};
11796 
11797   for (int i = 0, e = Mask.size(); i < e; ++i) {
11798     // Ignore undef lanes, we'll optimistically collapse them to the pattern we
11799     // want.
11800     if (Mask[i] < 0)
11801       continue;
11802 
11803     bool IsAnyViable = false;
11804     for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
11805       if (ViableForN[j]) {
11806         uint64_t N = j + 1;
11807 
11808         // The shuffle mask must be equal to (i * 2^N) % M.
11809         if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
11810           IsAnyViable = true;
11811         else
11812           ViableForN[j] = false;
11813       }
11814     // Early exit if we exhaust the possible powers of two.
11815     if (!IsAnyViable)
11816       break;
11817   }
11818 
11819   for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
11820     if (ViableForN[j])
11821       return j + 1;
11822 
11823   // Return 0 as there is no viable power of two.
11824   return 0;
11825 }
11826 
11827 // X86 has dedicated pack instructions that can handle specific truncation
11828 // operations: PACKSS and PACKUS.
11829 // Checks for compaction shuffle masks if MaxStages > 1.
11830 // TODO: Add support for matching multiple PACKSS/PACKUS stages.
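// For example, the v16i8 compaction mask <0, 2, 4, ..., 30> can be matched as
// PACKUS of two v8i16 sources whose upper 8 bits are known zero, or as PACKSS
// if both sources have more than 8 sign bits.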
11831 static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
11832                                  unsigned &PackOpcode, ArrayRef<int> TargetMask,
11833                                  const SelectionDAG &DAG,
11834                                  const X86Subtarget &Subtarget,
11835                                  unsigned MaxStages = 1) {
11836   unsigned NumElts = VT.getVectorNumElements();
11837   unsigned BitSize = VT.getScalarSizeInBits();
11838   assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 &&
11839          "Illegal maximum compaction");
11840 
11841   auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) {
11842     unsigned NumSrcBits = PackVT.getScalarSizeInBits();
11843     unsigned NumPackedBits = NumSrcBits - BitSize;
11844     N1 = peekThroughBitcasts(N1);
11845     N2 = peekThroughBitcasts(N2);
11846     unsigned NumBits1 = N1.getScalarValueSizeInBits();
11847     unsigned NumBits2 = N2.getScalarValueSizeInBits();
11848     bool IsZero1 = llvm::isNullOrNullSplat(N1, /*AllowUndefs*/ false);
11849     bool IsZero2 = llvm::isNullOrNullSplat(N2, /*AllowUndefs*/ false);
11850     if ((!N1.isUndef() && !IsZero1 && NumBits1 != NumSrcBits) ||
11851         (!N2.isUndef() && !IsZero2 && NumBits2 != NumSrcBits))
11852       return false;
11853     if (Subtarget.hasSSE41() || BitSize == 8) {
11854       APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits);
11855       if ((N1.isUndef() || IsZero1 || DAG.MaskedValueIsZero(N1, ZeroMask)) &&
11856           (N2.isUndef() || IsZero2 || DAG.MaskedValueIsZero(N2, ZeroMask))) {
11857         V1 = N1;
11858         V2 = N2;
11859         SrcVT = PackVT;
11860         PackOpcode = X86ISD::PACKUS;
11861         return true;
11862       }
11863     }
11864     bool IsAllOnes1 = llvm::isAllOnesOrAllOnesSplat(N1, /*AllowUndefs*/ false);
11865     bool IsAllOnes2 = llvm::isAllOnesOrAllOnesSplat(N2, /*AllowUndefs*/ false);
11866     if ((N1.isUndef() || IsZero1 || IsAllOnes1 ||
11867          DAG.ComputeNumSignBits(N1) > NumPackedBits) &&
11868         (N2.isUndef() || IsZero2 || IsAllOnes2 ||
11869          DAG.ComputeNumSignBits(N2) > NumPackedBits)) {
11870       V1 = N1;
11871       V2 = N2;
11872       SrcVT = PackVT;
11873       PackOpcode = X86ISD::PACKSS;
11874       return true;
11875     }
11876     return false;
11877   };
11878 
11879   // Attempt to match against wider and wider compaction patterns.
11880   for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) {
11881     MVT PackSVT = MVT::getIntegerVT(BitSize << NumStages);
11882     MVT PackVT = MVT::getVectorVT(PackSVT, NumElts >> NumStages);
11883 
11884     // Try binary shuffle.
11885     SmallVector<int, 32> BinaryMask;
11886     createPackShuffleMask(VT, BinaryMask, false, NumStages);
11887     if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, V1, V2))
11888       if (MatchPACK(V1, V2, PackVT))
11889         return true;
11890 
11891     // Try unary shuffle.
11892     SmallVector<int, 32> UnaryMask;
11893     createPackShuffleMask(VT, UnaryMask, true, NumStages);
11894     if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, V1))
11895       if (MatchPACK(V1, V1, PackVT))
11896         return true;
11897   }
11898 
11899   return false;
11900 }
11901 
11902 static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
11903                                     SDValue V1, SDValue V2, SelectionDAG &DAG,
11904                                     const X86Subtarget &Subtarget) {
11905   MVT PackVT;
11906   unsigned PackOpcode;
11907   unsigned SizeBits = VT.getSizeInBits();
11908   unsigned EltBits = VT.getScalarSizeInBits();
11909   unsigned MaxStages = Log2_32(64 / EltBits);
11910   if (!matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
11911                             Subtarget, MaxStages))
11912     return SDValue();
11913 
11914   unsigned CurrentEltBits = PackVT.getScalarSizeInBits();
11915   unsigned NumStages = Log2_32(CurrentEltBits / EltBits);
11916 
11917   // Don't lower multi-stage packs on AVX512, truncation is better.
11918   if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX())
11919     return SDValue();
11920 
11921   // Pack to the largest type possible:
11922   // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
11923   unsigned MaxPackBits = 16;
11924   if (CurrentEltBits > 16 &&
11925       (PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41()))
11926     MaxPackBits = 32;
11927 
11928   // Repeatedly pack down to the target size.
11929   SDValue Res;
11930   for (unsigned i = 0; i != NumStages; ++i) {
11931     unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
11932     unsigned NumSrcElts = SizeBits / SrcEltBits;
11933     MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
11934     MVT DstSVT = MVT::getIntegerVT(SrcEltBits / 2);
11935     MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
11936     MVT DstVT = MVT::getVectorVT(DstSVT, NumSrcElts * 2);
11937     Res = DAG.getNode(PackOpcode, DL, DstVT, DAG.getBitcast(SrcVT, V1),
11938                       DAG.getBitcast(SrcVT, V2));
11939     V1 = V2 = Res;
11940     CurrentEltBits /= 2;
11941   }
11942   assert(Res && Res.getValueType() == VT &&
11943          "Failed to lower compaction shuffle");
11944   return Res;
11945 }
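// Illustrative example: if matchShuffleWithPACK returns PackVT = v4i32 and
// X86ISD::PACKSS for a v16i8 compaction of sign-extended inputs, NumStages is
// 2 and the loop above emits roughly:
//   Res = PACKSSDW(v4i32 V1, v4i32 V2)   // -> v8i16
//   Res = PACKSSWB(Res, Res)             // -> v16i8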
11946 
11947 /// Try to emit a bitmask instruction for a shuffle.
11948 ///
11949 /// This handles cases where we can model a blend exactly as a bitmask due to
11950 /// one of the inputs being zeroable.
11951 static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
11952                                      SDValue V2, ArrayRef<int> Mask,
11953                                      const APInt &Zeroable,
11954                                      const X86Subtarget &Subtarget,
11955                                      SelectionDAG &DAG) {
11956   MVT MaskVT = VT;
11957   MVT EltVT = VT.getVectorElementType();
11958   SDValue Zero, AllOnes;
11959   // Use f64 if i64 isn't legal.
11960   if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
11961     EltVT = MVT::f64;
11962     MaskVT = MVT::getVectorVT(EltVT, Mask.size());
11963   }
11964 
11965   MVT LogicVT = VT;
11966   if (EltVT == MVT::f32 || EltVT == MVT::f64) {
11967     Zero = DAG.getConstantFP(0.0, DL, EltVT);
11968     APFloat AllOnesValue = APFloat::getAllOnesValue(
11969         SelectionDAG::EVTToAPFloatSemantics(EltVT), EltVT.getSizeInBits());
11970     AllOnes = DAG.getConstantFP(AllOnesValue, DL, EltVT);
11971     LogicVT =
11972         MVT::getVectorVT(EltVT == MVT::f64 ? MVT::i64 : MVT::i32, Mask.size());
11973   } else {
11974     Zero = DAG.getConstant(0, DL, EltVT);
11975     AllOnes = DAG.getAllOnesConstant(DL, EltVT);
11976   }
11977 
11978   SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
11979   SDValue V;
11980   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11981     if (Zeroable[i])
11982       continue;
11983     if (Mask[i] % Size != i)
11984       return SDValue(); // Not a blend.
11985     if (!V)
11986       V = Mask[i] < Size ? V1 : V2;
11987     else if (V != (Mask[i] < Size ? V1 : V2))
11988       return SDValue(); // Can only let one input through the mask.
11989 
11990     VMaskOps[i] = AllOnes;
11991   }
11992   if (!V)
11993     return SDValue(); // No non-zeroable elements!
11994 
11995   SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
11996   VMask = DAG.getBitcast(LogicVT, VMask);
11997   V = DAG.getBitcast(LogicVT, V);
11998   SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
11999   return DAG.getBitcast(VT, And);
12000 }
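// Illustrative example: for a v4i32 shuffle with Mask = <0, u, 2, u> where
// lanes 1 and 3 are zeroable, the mask build vector is <-1, 0, -1, 0> and the
// whole shuffle lowers to a single AND of V1 with that constant.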
12001 
12002 /// Try to emit a blend instruction for a shuffle using bit math.
12003 ///
12004 /// This is used as a fallback approach when first class blend instructions are
12005 /// unavailable. Currently it is only suitable for integer vectors, but could
12006 /// be generalized for floating point vectors if desirable.
12007 static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
12008                                       SDValue V2, ArrayRef<int> Mask,
12009                                       SelectionDAG &DAG) {
12010   assert(VT.isInteger() && "Only supports integer vector types!");
12011   MVT EltVT = VT.getVectorElementType();
12012   SDValue Zero = DAG.getConstant(0, DL, EltVT);
12013   SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
12014   SmallVector<SDValue, 16> MaskOps;
12015   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
12016     if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
12017       return SDValue(); // Shuffled input!
12018     MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
12019   }
12020 
12021   SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
12022   V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
12023   V2 = DAG.getNode(X86ISD::ANDNP, DL, VT, V1Mask, V2);
12024   return DAG.getNode(ISD::OR, DL, VT, V1, V2);
12025 }
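// Illustrative example: a v8i16 blend with Mask = <0, 9, 2, 11, 4, 13, 6, 15>
// uses V1Mask = <-1, 0, -1, 0, -1, 0, -1, 0> and becomes
//   OR(AND(V1, V1Mask), ANDNP(V1Mask, V2))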
12026 
12027 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
12028                                     SDValue PreservedSrc,
12029                                     const X86Subtarget &Subtarget,
12030                                     SelectionDAG &DAG);
12031 
12032 static bool matchShuffleAsBlend(SDValue V1, SDValue V2,
12033                                 MutableArrayRef<int> Mask,
12034                                 const APInt &Zeroable, bool &ForceV1Zero,
12035                                 bool &ForceV2Zero, uint64_t &BlendMask) {
12036   bool V1IsZeroOrUndef =
12037       V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
12038   bool V2IsZeroOrUndef =
12039       V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
12040 
12041   BlendMask = 0;
12042   ForceV1Zero = false, ForceV2Zero = false;
12043   assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask");
12044 
12045   // Attempt to generate the binary blend mask. If an input is zero then
12046   // we can use any lane.
12047   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
12048     int M = Mask[i];
12049     if (M == SM_SentinelUndef)
12050       continue;
12051     if (M == i)
12052       continue;
12053     if (M == i + Size) {
12054       BlendMask |= 1ull << i;
12055       continue;
12056     }
12057     if (Zeroable[i]) {
12058       if (V1IsZeroOrUndef) {
12059         ForceV1Zero = true;
12060         Mask[i] = i;
12061         continue;
12062       }
12063       if (V2IsZeroOrUndef) {
12064         ForceV2Zero = true;
12065         BlendMask |= 1ull << i;
12066         Mask[i] = i + Size;
12067         continue;
12068       }
12069     }
12070     return false;
12071   }
12072   return true;
12073 }
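// Illustrative example: a v4i32 shuffle with Mask = <0, 5, 2, 7> is a blend
// taking lanes 1 and 3 from V2, so BlendMask = 0b1010. If a lane is zeroable
// and one input is known zero/undef, the mask is rewritten to take that lane
// from the zero input instead.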
12074 
12075 static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size,
12076                                             int Scale) {
12077   uint64_t ScaledMask = 0;
12078   for (int i = 0; i != Size; ++i)
12079     if (BlendMask & (1ull << i))
12080       ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
12081   return ScaledMask;
12082 }
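// Illustrative example: widening BlendMask = 0b0101 (Size = 4) by Scale = 2
// duplicates each bit into a pair, giving 0b00110011.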
12083 
12084 /// Try to emit a blend instruction for a shuffle.
12085 ///
12086 /// This doesn't do any checks for the availability of instructions for blending
12087 /// these values. It relies on the availability of the X86ISD::BLENDI pattern to
12088 /// be matched in the backend with the type given. What it does check for is
12089 /// that the shuffle mask is a blend, or convertible into a blend with zero.
12090 static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
12091                                    SDValue V2, ArrayRef<int> Original,
12092                                    const APInt &Zeroable,
12093                                    const X86Subtarget &Subtarget,
12094                                    SelectionDAG &DAG) {
12095   uint64_t BlendMask = 0;
12096   bool ForceV1Zero = false, ForceV2Zero = false;
12097   SmallVector<int, 64> Mask(Original.begin(), Original.end());
12098   if (!matchShuffleAsBlend(V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
12099                            BlendMask))
12100     return SDValue();
12101 
12102   // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
12103   if (ForceV1Zero)
12104     V1 = getZeroVector(VT, Subtarget, DAG, DL);
12105   if (ForceV2Zero)
12106     V2 = getZeroVector(VT, Subtarget, DAG, DL);
12107 
12108   switch (VT.SimpleTy) {
12109   case MVT::v4i64:
12110   case MVT::v8i32:
12111     assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
12112     LLVM_FALLTHROUGH;
12113   case MVT::v4f64:
12114   case MVT::v8f32:
12115     assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
12116     LLVM_FALLTHROUGH;
12117   case MVT::v2f64:
12118   case MVT::v2i64:
12119   case MVT::v4f32:
12120   case MVT::v4i32:
12121   case MVT::v8i16:
12122     assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
12123     return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
12124                        DAG.getTargetConstant(BlendMask, DL, MVT::i8));
12125   case MVT::v16i16: {
12126     assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
12127     SmallVector<int, 8> RepeatedMask;
12128     if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
12129       // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
12130       assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
12131       BlendMask = 0;
12132       for (int i = 0; i < 8; ++i)
12133         if (RepeatedMask[i] >= 8)
12134           BlendMask |= 1ull << i;
12135       return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
12136                          DAG.getTargetConstant(BlendMask, DL, MVT::i8));
12137     }
12138     // Use PBLENDW for lower/upper lanes and then blend lanes.
12139     // TODO - we should allow 2 PBLENDW here and leave shuffle combine to
12140     // merge to VSELECT where useful.
12141     uint64_t LoMask = BlendMask & 0xFF;
12142     uint64_t HiMask = (BlendMask >> 8) & 0xFF;
12143     if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
12144       SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
12145                                DAG.getTargetConstant(LoMask, DL, MVT::i8));
12146       SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
12147                                DAG.getTargetConstant(HiMask, DL, MVT::i8));
12148       return DAG.getVectorShuffle(
12149           MVT::v16i16, DL, Lo, Hi,
12150           {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
12151     }
12152     LLVM_FALLTHROUGH;
12153   }
12154   case MVT::v32i8:
12155     assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
12156     LLVM_FALLTHROUGH;
12157   case MVT::v16i8: {
12158     assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
12159 
12160     // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
12161     if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
12162                                                Subtarget, DAG))
12163       return Masked;
12164 
12165     if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
12166       MVT IntegerType =
12167           MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
12168       SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
12169       return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
12170     }
12171 
12172     // If we have VPTERNLOG, we can use that as a bit blend.
12173     if (Subtarget.hasVLX())
12174       if (SDValue BitBlend =
12175               lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
12176         return BitBlend;
12177 
12178     // Scale the blend by the number of bytes per element.
12179     int Scale = VT.getScalarSizeInBits() / 8;
12180 
12181     // This form of blend is always done on bytes. Compute the byte vector
12182     // type.
12183     MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
12184 
12185     // x86 allows load folding with blendvb from the 2nd source operand. But
12186     // we are still using LLVM select here (see comment below), so that's V1.
12187     // If V2 can be load-folded and V1 cannot be load-folded, then commute to
12188     // allow that load-folding possibility.
12189     if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
12190       ShuffleVectorSDNode::commuteMask(Mask);
12191       std::swap(V1, V2);
12192     }
12193 
12194     // Compute the VSELECT mask. Note that VSELECT is really confusing in the
12195     // mix of LLVM's code generator and the x86 backend. We tell the code
12196     // generator that boolean values in the elements of an x86 vector register
12197     // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
12198     // mapping a select to operand #1, and 'false' mapping to operand #2. The
12199     // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
12200     // of the element (the remaining are ignored) and 0 in that high bit would
12201     // mean operand #1 while 1 in the high bit would mean operand #2. So while
12202     // the LLVM model for boolean values in vector elements gets the relevant
12203     // bit set, it is set backwards and over constrained relative to x86's
12204     // actual model.
12205     SmallVector<SDValue, 32> VSELECTMask;
12206     for (int i = 0, Size = Mask.size(); i < Size; ++i)
12207       for (int j = 0; j < Scale; ++j)
12208         VSELECTMask.push_back(
12209             Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
12210                         : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
12211                                           MVT::i8));
12212 
12213     V1 = DAG.getBitcast(BlendVT, V1);
12214     V2 = DAG.getBitcast(BlendVT, V2);
12215     return DAG.getBitcast(
12216         VT,
12217         DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
12218                       V1, V2));
12219   }
12220   case MVT::v16f32:
12221   case MVT::v8f64:
12222   case MVT::v8i64:
12223   case MVT::v16i32:
12224   case MVT::v32i16:
12225   case MVT::v64i8: {
12226     // Attempt to lower to a bitmask if we can. Only if not optimizing for size.
12227     bool OptForSize = DAG.shouldOptForSize();
12228     if (!OptForSize) {
12229       if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
12230                                                  Subtarget, DAG))
12231         return Masked;
12232     }
12233 
12234     // Otherwise load an immediate into a GPR, cast to k-register, and use a
12235     // masked move.
12236     MVT IntegerType =
12237         MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
12238     SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
12239     return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
12240   }
12241   default:
12242     llvm_unreachable("Not a supported integer vector type!");
12243   }
12244 }
12245 
12246 /// Try to lower as a blend of elements from two inputs followed by
12247 /// a single-input permutation.
12248 ///
12249 /// This matches the pattern where we can blend elements from two inputs and
12250 /// then reduce the shuffle to a single-input permutation.
12251 static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
12252                                              SDValue V1, SDValue V2,
12253                                              ArrayRef<int> Mask,
12254                                              SelectionDAG &DAG,
12255                                              bool ImmBlends = false) {
12256   // We build up the blend mask while checking whether a blend is a viable way
12257   // to reduce the shuffle.
12258   SmallVector<int, 32> BlendMask(Mask.size(), -1);
12259   SmallVector<int, 32> PermuteMask(Mask.size(), -1);
12260 
12261   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
12262     if (Mask[i] < 0)
12263       continue;
12264 
12265     assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
12266 
12267     if (BlendMask[Mask[i] % Size] < 0)
12268       BlendMask[Mask[i] % Size] = Mask[i];
12269     else if (BlendMask[Mask[i] % Size] != Mask[i])
12270       return SDValue(); // Can't blend in the needed input!
12271 
12272     PermuteMask[i] = Mask[i] % Size;
12273   }
12274 
12275   // If only immediate blends, then bail if the blend mask can't be widened to
12276   // i16.
12277   unsigned EltSize = VT.getScalarSizeInBits();
12278   if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
12279     return SDValue();
12280 
12281   SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
12282   return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
12283 }
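// Illustrative example: a v4i32 shuffle with Mask = <1, 4, 3, 6> is not a
// blend by itself, but it splits into a blend with mask <4, 1, 6, 3> followed
// by the single-input permute <1, 0, 3, 2>.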
12284 
12285 /// Try to lower as an unpack of elements from two inputs followed by
12286 /// a single-input permutation.
12287 ///
12288 /// This matches the pattern where we can unpack elements from two inputs and
12289 /// then reduce the shuffle to a single-input (wider) permutation.
12290 static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
12291                                              SDValue V1, SDValue V2,
12292                                              ArrayRef<int> Mask,
12293                                              SelectionDAG &DAG) {
12294   int NumElts = Mask.size();
12295   int NumLanes = VT.getSizeInBits() / 128;
12296   int NumLaneElts = NumElts / NumLanes;
12297   int NumHalfLaneElts = NumLaneElts / 2;
12298 
12299   bool MatchLo = true, MatchHi = true;
12300   SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
12301 
12302   // Determine UNPCKL/UNPCKH type and operand order.
12303   for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
12304     for (int Elt = 0; Elt != NumLaneElts; ++Elt) {
12305       int M = Mask[Lane + Elt];
12306       if (M < 0)
12307         continue;
12308 
12309       SDValue &Op = Ops[Elt & 1];
12310       if (M < NumElts && (Op.isUndef() || Op == V1))
12311         Op = V1;
12312       else if (NumElts <= M && (Op.isUndef() || Op == V2))
12313         Op = V2;
12314       else
12315         return SDValue();
12316 
12317       int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
12318       MatchLo &= isUndefOrInRange(M, Lo, Mid) ||
12319                  isUndefOrInRange(M, NumElts + Lo, NumElts + Mid);
12320       MatchHi &= isUndefOrInRange(M, Mid, Hi) ||
12321                  isUndefOrInRange(M, NumElts + Mid, NumElts + Hi);
12322       if (!MatchLo && !MatchHi)
12323         return SDValue();
12324     }
12325   }
12326   assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");
12327 
12328   // Now check that each pair of elts comes from the same unpack pair
12329   // and set the permute mask based on each pair.
12330   // TODO - Investigate cases where we permute individual elements.
12331   SmallVector<int, 32> PermuteMask(NumElts, -1);
12332   for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
12333     for (int Elt = 0; Elt != NumLaneElts; Elt += 2) {
12334       int M0 = Mask[Lane + Elt + 0];
12335       int M1 = Mask[Lane + Elt + 1];
12336       if (0 <= M0 && 0 <= M1 &&
12337           (M0 % NumHalfLaneElts) != (M1 % NumHalfLaneElts))
12338         return SDValue();
12339       if (0 <= M0)
12340         PermuteMask[Lane + Elt + 0] = Lane + (2 * (M0 % NumHalfLaneElts));
12341       if (0 <= M1)
12342         PermuteMask[Lane + Elt + 1] = Lane + (2 * (M1 % NumHalfLaneElts)) + 1;
12343     }
12344   }
12345 
12346   unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
12347   SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
12348   return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
12349 }
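// Illustrative example: a v8i16 shuffle with Mask = <1, 9, 0, 8, 3, 11, 2, 10>
// matches UNPCKL(V1, V2) (every pair draws from the low halves of V1/V2) and
// is finished with the in-place permute <2, 3, 0, 1, 6, 7, 4, 5>.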
12350 
12351 /// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
12352 /// permuting the elements of the result in place.
12353 static SDValue lowerShuffleAsByteRotateAndPermute(
12354     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12355     const X86Subtarget &Subtarget, SelectionDAG &DAG) {
12356   if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
12357       (VT.is256BitVector() && !Subtarget.hasAVX2()) ||
12358       (VT.is512BitVector() && !Subtarget.hasBWI()))
12359     return SDValue();
12360 
12361   // We don't currently support lane crossing permutes.
12362   if (is128BitLaneCrossingShuffleMask(VT, Mask))
12363     return SDValue();
12364 
12365   int Scale = VT.getScalarSizeInBits() / 8;
12366   int NumLanes = VT.getSizeInBits() / 128;
12367   int NumElts = VT.getVectorNumElements();
12368   int NumEltsPerLane = NumElts / NumLanes;
12369 
12370   // Determine range of mask elts.
12371   bool Blend1 = true;
12372   bool Blend2 = true;
12373   std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
12374   std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
12375   for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
12376     for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
12377       int M = Mask[Lane + Elt];
12378       if (M < 0)
12379         continue;
12380       if (M < NumElts) {
12381         Blend1 &= (M == (Lane + Elt));
12382         assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
12383         M = M % NumEltsPerLane;
12384         Range1.first = std::min(Range1.first, M);
12385         Range1.second = std::max(Range1.second, M);
12386       } else {
12387         M -= NumElts;
12388         Blend2 &= (M == (Lane + Elt));
12389         assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
12390         M = M % NumEltsPerLane;
12391         Range2.first = std::min(Range2.first, M);
12392         Range2.second = std::max(Range2.second, M);
12393       }
12394     }
12395   }
12396 
12397   // Bail if we don't need both elements.
12398   // TODO - it might be worth doing this for unary shuffles if the permute
12399   // can be widened.
12400   if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
12401       !(0 <= Range2.first && Range2.second < NumEltsPerLane))
12402     return SDValue();
12403 
12404   if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
12405     return SDValue();
12406 
12407   // Rotate the 2 ops so we can access both ranges, then permute the result.
12408   auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
12409     MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
12410     SDValue Rotate = DAG.getBitcast(
12411         VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
12412                         DAG.getBitcast(ByteVT, Lo),
12413                         DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8)));
12414     SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
12415     for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
12416       for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
12417         int M = Mask[Lane + Elt];
12418         if (M < 0)
12419           continue;
12420         if (M < NumElts)
12421           PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
12422         else
12423           PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
12424       }
12425     }
12426     return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
12427   };
12428 
12429   // Check if the ranges are small enough to rotate from either direction.
12430   if (Range2.second < Range1.first)
12431     return RotateAndPermute(V1, V2, Range1.first, 0);
12432   if (Range1.second < Range2.first)
12433     return RotateAndPermute(V2, V1, Range2.first, NumElts);
12434   return SDValue();
12435 }
12436 
12437 /// Generic routine to decompose a shuffle and blend into independent
12438 /// blends and permutes.
12439 ///
12440 /// This matches the extremely common pattern for handling combined
12441 /// shuffle+blend operations on newer X86 ISAs where we have very fast blend
12442 /// operations. It will try to pick the best arrangement of shuffles and
12443 /// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend.
12444 static SDValue lowerShuffleAsDecomposedShuffleMerge(
12445     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12446     const X86Subtarget &Subtarget, SelectionDAG &DAG) {
12447   int NumElts = Mask.size();
12448   int NumLanes = VT.getSizeInBits() / 128;
12449   int NumEltsPerLane = NumElts / NumLanes;
12450 
12451   // Shuffle the input elements into the desired positions in V1 and V2 and
12452   // unpack/blend them together.
12453   bool IsAlternating = true;
12454   SmallVector<int, 32> V1Mask(NumElts, -1);
12455   SmallVector<int, 32> V2Mask(NumElts, -1);
12456   SmallVector<int, 32> FinalMask(NumElts, -1);
12457   for (int i = 0; i < NumElts; ++i) {
12458     int M = Mask[i];
12459     if (M >= 0 && M < NumElts) {
12460       V1Mask[i] = M;
12461       FinalMask[i] = i;
12462       IsAlternating &= (i & 1) == 0;
12463     } else if (M >= NumElts) {
12464       V2Mask[i] = M - NumElts;
12465       FinalMask[i] = i + NumElts;
12466       IsAlternating &= (i & 1) == 1;
12467     }
12468   }
12469 
12470   // Try to lower with the simpler initial blend/unpack/rotate strategies unless
12471   // one of the input shuffles would be a no-op. We prefer to shuffle the inputs,
12472   // as the input shuffle may be able to fold with a load or provide some other
12473   // benefit. However, when we would need twice as many shuffles to achieve this,
12474   // a 2-input pre-shuffle first is the better strategy.
12475   if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
12476     // Only prefer immediate blends to unpack/rotate.
12477     if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
12478                                                           DAG, true))
12479       return BlendPerm;
12480     if (SDValue UnpackPerm = lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask,
12481                                                            DAG))
12482       return UnpackPerm;
12483     if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute(
12484             DL, VT, V1, V2, Mask, Subtarget, DAG))
12485       return RotatePerm;
12486     // Unpack/rotate failed - try again with variable blends.
12487     if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
12488                                                           DAG))
12489       return BlendPerm;
12490   }
12491 
12492   // If the final mask is an alternating blend of vXi8/vXi16, convert to an
12493   // UNPCKL(SHUFFLE, SHUFFLE) pattern.
12494   // TODO: It doesn't have to be alternating - but each lane mustn't have more
12495   // than half the elements coming from each source.
12496   if (IsAlternating && VT.getScalarSizeInBits() < 32) {
12497     V1Mask.assign(NumElts, -1);
12498     V2Mask.assign(NumElts, -1);
12499     FinalMask.assign(NumElts, -1);
12500     for (int i = 0; i != NumElts; i += NumEltsPerLane)
12501       for (int j = 0; j != NumEltsPerLane; ++j) {
12502         int M = Mask[i + j];
12503         if (M >= 0 && M < NumElts) {
12504           V1Mask[i + (j / 2)] = M;
12505           FinalMask[i + j] = i + (j / 2);
12506         } else if (M >= NumElts) {
12507           V2Mask[i + (j / 2)] = M - NumElts;
12508           FinalMask[i + j] = i + (j / 2) + NumElts;
12509         }
12510       }
12511   }
12512 
12513   V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
12514   V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
12515   return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask);
12516 }
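// Illustrative example: a v4i32 shuffle with Mask = <2, 7, 0, 5> decomposes
// into V1Mask = <2, u, 0, u>, V2Mask = <u, 3, u, 1> and the final blend
// FinalMask = <0, 5, 2, 7>.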
12517 
12518 /// Try to lower a vector shuffle as a bit rotation.
12519 ///
12520 /// Look for a repeated rotation pattern in each sub group.
12521 /// Returns a ISD::ROTL element rotation amount or -1 if failed.
12522 static int matchShuffleAsBitRotate(ArrayRef<int> Mask, int NumSubElts) {
12523   int NumElts = Mask.size();
12524   assert((NumElts % NumSubElts) == 0 && "Illegal shuffle mask");
12525 
12526   int RotateAmt = -1;
12527   for (int i = 0; i != NumElts; i += NumSubElts) {
12528     for (int j = 0; j != NumSubElts; ++j) {
12529       int M = Mask[i + j];
12530       if (M < 0)
12531         continue;
12532       if (!isInRange(M, i, i + NumSubElts))
12533         return -1;
12534       int Offset = (NumSubElts - (M - (i + j))) % NumSubElts;
12535       if (0 <= RotateAmt && Offset != RotateAmt)
12536         return -1;
12537       RotateAmt = Offset;
12538     }
12539   }
12540   return RotateAmt;
12541 }
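// Illustrative example: Mask = <3, 0, 1, 2, 7, 4, 5, 6> with NumSubElts = 4
// returns 1 - each group of 4 elements is rotated left by one element, which
// the caller below scales by the element width (e.g. ROTL by 8 bits for vXi8).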
12542 
12543 static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
12544                                    const X86Subtarget &Subtarget,
12545                                    ArrayRef<int> Mask) {
12546   assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
12547   assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
12548 
12549   // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
12550   int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
12551   int MaxSubElts = 64 / EltSizeInBits;
12552   for (int NumSubElts = MinSubElts; NumSubElts <= MaxSubElts; NumSubElts *= 2) {
12553     int RotateAmt = matchShuffleAsBitRotate(Mask, NumSubElts);
12554     if (RotateAmt < 0)
12555       continue;
12556 
12557     int NumElts = Mask.size();
12558     MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
12559     RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
12560     return RotateAmt * EltSizeInBits;
12561   }
12562 
12563   return -1;
12564 }
12565 
12566 /// Lower shuffle using X86ISD::VROTLI rotations.
12567 static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
12568                                        ArrayRef<int> Mask,
12569                                        const X86Subtarget &Subtarget,
12570                                        SelectionDAG &DAG) {
12571   // Only XOP + AVX512 targets have bit rotation instructions.
12572   // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
12573   bool IsLegal =
12574       (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
12575   if (!IsLegal && Subtarget.hasSSE3())
12576     return SDValue();
12577 
12578   MVT RotateVT;
12579   int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(),
12580                                           Subtarget, Mask);
12581   if (RotateAmt < 0)
12582     return SDValue();
12583 
12584   // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
12585   // expanded to OR(SRL,SHL), will be more efficient, but if they can
12586   // widen to vXi16 or more then the existing lowering will be better.
12587   if (!IsLegal) {
12588     if ((RotateAmt % 16) == 0)
12589       return SDValue();
12590     // TODO: Use getTargetVShiftByConstNode.
12591     unsigned ShlAmt = RotateAmt;
12592     unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
12593     V1 = DAG.getBitcast(RotateVT, V1);
12594     SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
12595                               DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
12596     SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
12597                               DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
12598     SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
12599     return DAG.getBitcast(VT, Rot);
12600   }
12601 
12602   SDValue Rot =
12603       DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
12604                   DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
12605   return DAG.getBitcast(VT, Rot);
12606 }
12607 
12608 /// Try to match a vector shuffle as an element rotation.
12609 ///
12610 /// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
12611 static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2,
12612                                        ArrayRef<int> Mask) {
12613   int NumElts = Mask.size();
12614 
12615   // We need to detect various ways of spelling a rotation:
12616   //   [11, 12, 13, 14, 15,  0,  1,  2]
12617   //   [-1, 12, 13, 14, -1, -1,  1, -1]
12618   //   [-1, -1, -1, -1, -1, -1,  1,  2]
12619   //   [ 3,  4,  5,  6,  7,  8,  9, 10]
12620   //   [-1,  4,  5,  6, -1, -1,  9, -1]
12621   //   [-1,  4,  5,  6, -1, -1, -1, -1]
12622   int Rotation = 0;
12623   SDValue Lo, Hi;
12624   for (int i = 0; i < NumElts; ++i) {
12625     int M = Mask[i];
12626     assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
12627            "Unexpected mask index.");
12628     if (M < 0)
12629       continue;
12630 
12631     // Determine where a rotated vector would have started.
12632     int StartIdx = i - (M % NumElts);
12633     if (StartIdx == 0)
12634       // The identity rotation isn't interesting, stop.
12635       return -1;
12636 
12637     // If we found the tail of a vector the rotation must be the missing
12638     // front. If we found the head of a vector, it must be how much of the
12639     // head.
12640     int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
12641 
12642     if (Rotation == 0)
12643       Rotation = CandidateRotation;
12644     else if (Rotation != CandidateRotation)
12645       // The rotations don't match, so we can't match this mask.
12646       return -1;
12647 
12648     // Compute which value this mask is pointing at.
12649     SDValue MaskV = M < NumElts ? V1 : V2;
12650 
12651     // Compute which of the two target values this index should be assigned
12652     // to. This reflects whether the high elements are remaining or the low
12653     // elements are remaining.
12654     SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
12655 
12656     // Either set up this value if we've not encountered it before, or check
12657     // that it remains consistent.
12658     if (!TargetV)
12659       TargetV = MaskV;
12660     else if (TargetV != MaskV)
12661       // This may be a rotation, but it pulls from the inputs in some
12662       // unsupported interleaving.
12663       return -1;
12664   }
12665 
12666   // Check that we successfully analyzed the mask, and normalize the results.
12667   assert(Rotation != 0 && "Failed to locate a viable rotation!");
12668   assert((Lo || Hi) && "Failed to find a rotated input vector!");
12669   if (!Lo)
12670     Lo = Hi;
12671   else if (!Hi)
12672     Hi = Lo;
12673 
12674   V1 = Lo;
12675   V2 = Hi;
12676 
12677   return Rotation;
12678 }
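// Illustrative example: the v8i16 mask [11, 12, 13, 14, 15, 0, 1, 2] yields a
// rotation amount of 3, with Lo = V1 (whose leading elements land at the tail
// of the result) and Hi = V2 (which supplies the leading elements).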
12679 
12680 /// Try to lower a vector shuffle as a byte rotation.
12681 ///
12682 /// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
12683 /// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
12684 /// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
12685 /// try to generically lower a vector shuffle through such a pattern. It
12686 /// does not check for the profitability of lowering either as PALIGNR or
12687 /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
12688 /// This matches shuffle vectors that look like:
12689 ///
12690 ///   v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
12691 ///
12692 /// Essentially it concatenates V1 and V2, shifts right by some number of
12693 /// elements, and takes the low elements as the result. Note that while this is
12694 /// specified as a *right shift* because x86 is little-endian, it is a *left
12695 /// rotate* of the vector lanes.
12696 static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
12697                                     ArrayRef<int> Mask) {
12698   // Don't accept any shuffles with zero elements.
12699   if (isAnyZero(Mask))
12700     return -1;
12701 
12702   // PALIGNR works on 128-bit lanes.
12703   SmallVector<int, 16> RepeatedMask;
12704   if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
12705     return -1;
12706 
12707   int Rotation = matchShuffleAsElementRotate(V1, V2, RepeatedMask);
12708   if (Rotation <= 0)
12709     return -1;
12710 
12711   // PALIGNR rotates bytes, so we need to scale the
12712   // rotation based on how many bytes are in the vector lane.
12713   int NumElts = RepeatedMask.size();
12714   int Scale = 16 / NumElts;
12715   return Rotation * Scale;
12716 }
12717 
12718 static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
12719                                         SDValue V2, ArrayRef<int> Mask,
12720                                         const X86Subtarget &Subtarget,
12721                                         SelectionDAG &DAG) {
12722   assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
12723 
12724   SDValue Lo = V1, Hi = V2;
12725   int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
12726   if (ByteRotation <= 0)
12727     return SDValue();
12728 
12729   // Cast the inputs to i8 vector of correct length to match PALIGNR or
12730   // PSLLDQ/PSRLDQ.
12731   MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
12732   Lo = DAG.getBitcast(ByteVT, Lo);
12733   Hi = DAG.getBitcast(ByteVT, Hi);
12734 
12735   // SSSE3 targets can use the palignr instruction.
12736   if (Subtarget.hasSSSE3()) {
12737     assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
12738            "512-bit PALIGNR requires BWI instructions");
12739     return DAG.getBitcast(
12740         VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
12741                         DAG.getTargetConstant(ByteRotation, DL, MVT::i8)));
12742   }
12743 
12744   assert(VT.is128BitVector() &&
12745          "Rotate-based lowering only supports 128-bit lowering!");
12746   assert(Mask.size() <= 16 &&
12747          "Can shuffle at most 16 bytes in a 128-bit vector!");
12748   assert(ByteVT == MVT::v16i8 &&
12749          "SSE2 rotate lowering only needed for v16i8!");
12750 
12751   // Default SSE2 implementation
12752   int LoByteShift = 16 - ByteRotation;
12753   int HiByteShift = ByteRotation;
12754 
12755   SDValue LoShift =
12756       DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
12757                   DAG.getTargetConstant(LoByteShift, DL, MVT::i8));
12758   SDValue HiShift =
12759       DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
12760                   DAG.getTargetConstant(HiByteShift, DL, MVT::i8));
12761   return DAG.getBitcast(VT,
12762                         DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
12763 }
12764 
12765 /// Try to lower a vector shuffle as a dword/qword rotation.
12766 ///
12767 /// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
12768 /// rotation of the concatenation of two vectors; this routine will
12769 /// try to generically lower a vector shuffle through such a pattern.
12770 ///
12771 /// Essentially it concatenates V1 and V2, shifts right by some number of
12772 /// elements, and takes the low elements as the result. Note that while this is
12773 /// specified as a *right shift* because x86 is little-endian, it is a *left
12774 /// rotate* of the vector lanes.
12775 static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1,
12776                                     SDValue V2, ArrayRef<int> Mask,
12777                                     const X86Subtarget &Subtarget,
12778                                     SelectionDAG &DAG) {
12779   assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
12780          "Only 32-bit and 64-bit elements are supported!");
12781 
12782   // 128/256-bit vectors are only supported with VLX.
12783   assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
12784          && "VLX required for 128/256-bit vectors");
12785 
12786   SDValue Lo = V1, Hi = V2;
12787   int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask);
12788   if (Rotation <= 0)
12789     return SDValue();
12790 
12791   return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
12792                      DAG.getTargetConstant(Rotation, DL, MVT::i8));
12793 }
12794 
12795 /// Try to lower a vector shuffle as a byte shift sequence.
12796 static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1,
12797                                            SDValue V2, ArrayRef<int> Mask,
12798                                            const APInt &Zeroable,
12799                                            const X86Subtarget &Subtarget,
12800                                            SelectionDAG &DAG) {
12801   assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
12802   assert(VT.is128BitVector() && "Only 128-bit vectors supported");
12803 
12804   // We need a shuffle that has zeros at one/both ends and a sequential
12805   // shuffle from one source within.
12806   unsigned ZeroLo = Zeroable.countTrailingOnes();
12807   unsigned ZeroHi = Zeroable.countLeadingOnes();
12808   if (!ZeroLo && !ZeroHi)
12809     return SDValue();
12810 
12811   unsigned NumElts = Mask.size();
12812   unsigned Len = NumElts - (ZeroLo + ZeroHi);
12813   if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
12814     return SDValue();
12815 
12816   unsigned Scale = VT.getScalarSizeInBits() / 8;
12817   ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
12818   if (!isUndefOrInRange(StubMask, 0, NumElts) &&
12819       !isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
12820     return SDValue();
12821 
12822   SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
12823   Res = DAG.getBitcast(MVT::v16i8, Res);
12824 
12825   // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
12826   // inner sequential set of elements, possibly offset:
12827   // 01234567 --> zzzzzz01 --> 1zzzzzzz
12828   // 01234567 --> 4567zzzz --> zzzzz456
12829   // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
12830   if (ZeroLo == 0) {
12831     unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
12832     Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12833                       DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12834     Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12835                       DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8));
12836   } else if (ZeroHi == 0) {
12837     unsigned Shift = Mask[ZeroLo] % NumElts;
12838     Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12839                       DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12840     Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12841                       DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
12842   } else if (!Subtarget.hasSSSE3()) {
12843     // If we don't have PSHUFB then it's worth avoiding an AND constant mask
12844     // by performing 3 byte shifts. Shuffle combining can kick in above that.
12845     // TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
12846     unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
12847     Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12848                       DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12849     Shift += Mask[ZeroLo] % NumElts;
12850     Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12851                       DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12852     Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12853                       DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
12854   } else
12855     return SDValue();
12856 
12857   return DAG.getBitcast(VT, Res);
12858 }
12859 
12860 /// Try to lower a vector shuffle as a bit shift (shifts in zeros).
12861 ///
12862 /// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
12863 /// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
12864 /// matches elements from one of the input vectors shuffled to the left or
12865 /// right with zeroable elements 'shifted in'. It handles both the strictly
12866 /// bit-wise element shifts and the byte shift across an entire 128-bit double
12867 /// quad word lane.
12868 ///
12869 /// PSHL : (little-endian) left bit shift.
12870 /// [ zz, 0, zz,  2 ]
12871 /// [ -1, 4, zz, -1 ]
12872 /// PSRL : (little-endian) right bit shift.
12873 /// [  1, zz,  3, zz]
12874 /// [ -1, -1,  7, zz]
12875 /// PSLLDQ : (little-endian) left byte shift
12876 /// [ zz,  0,  1,  2,  3,  4,  5,  6]
12877 /// [ zz, zz, -1, -1,  2,  3,  4, -1]
12878 /// [ zz, zz, zz, zz, zz, zz, -1,  1]
12879 /// PSRLDQ : (little-endian) right byte shift
12880 /// [  5, 6,  7, zz, zz, zz, zz, zz]
12881 /// [ -1, 5,  6,  7, zz, zz, zz, zz]
12882 /// [  1, 2, -1, -1, -1, -1, zz, zz]
12883 static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
12884                                unsigned ScalarSizeInBits, ArrayRef<int> Mask,
12885                                int MaskOffset, const APInt &Zeroable,
12886                                const X86Subtarget &Subtarget) {
12887   int Size = Mask.size();
12888   unsigned SizeInBits = Size * ScalarSizeInBits;
12889 
12890   auto CheckZeros = [&](int Shift, int Scale, bool Left) {
12891     for (int i = 0; i < Size; i += Scale)
12892       for (int j = 0; j < Shift; ++j)
12893         if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
12894           return false;
12895 
12896     return true;
12897   };
12898 
12899   auto MatchShift = [&](int Shift, int Scale, bool Left) {
12900     for (int i = 0; i != Size; i += Scale) {
12901       unsigned Pos = Left ? i + Shift : i;
12902       unsigned Low = Left ? i : i + Shift;
12903       unsigned Len = Scale - Shift;
12904       if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
12905         return -1;
12906     }
12907 
12908     int ShiftEltBits = ScalarSizeInBits * Scale;
12909     bool ByteShift = ShiftEltBits > 64;
12910     Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
12911                   : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
12912     int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
12913 
12914     // Normalize the scale for byte shifts to still produce an i64 element
12915     // type.
12916     Scale = ByteShift ? Scale / 2 : Scale;
12917 
12918     // We need to round trip through the appropriate type for the shift.
12919     MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
12920     ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
12921                         : MVT::getVectorVT(ShiftSVT, Size / Scale);
12922     return (int)ShiftAmt;
12923   };
12924 
12925   // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
12926   // keep doubling the size of the integer elements up to that. We can
12927   // then shift the elements of the integer vector by whole multiples of
12928   // their width within the elements of the larger integer vector. Test each
12929   // multiple to see if we can find a match with the moved element indices
12930   // and that the shifted in elements are all zeroable.
12931   unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
12932   for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
12933     for (int Shift = 1; Shift != Scale; ++Shift)
12934       for (bool Left : {true, false})
12935         if (CheckZeros(Shift, Scale, Left)) {
12936           int ShiftAmt = MatchShift(Shift, Scale, Left);
12937           if (0 < ShiftAmt)
12938             return ShiftAmt;
12939         }
12940 
12941   // no match
12942   return -1;
12943 }
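// Illustrative example: the v4i32 shuffle [ zz, 0, zz, 2 ] (even lanes
// zeroable) matches with Scale = 2, Shift = 1, producing X86ISD::VSHLI of the
// input bitcast to v2i64 with a shift amount of 32 bits.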
12944 
12945 static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
12946                                    SDValue V2, ArrayRef<int> Mask,
12947                                    const APInt &Zeroable,
12948                                    const X86Subtarget &Subtarget,
12949                                    SelectionDAG &DAG) {
12950   int Size = Mask.size();
12951   assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12952 
12953   MVT ShiftVT;
12954   SDValue V = V1;
12955   unsigned Opcode;
12956 
12957   // Try to match shuffle against V1 shift.
12958   int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
12959                                      Mask, 0, Zeroable, Subtarget);
12960 
12961   // If V1 failed, try to match shuffle against V2 shift.
12962   if (ShiftAmt < 0) {
12963     ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
12964                                    Mask, Size, Zeroable, Subtarget);
12965     V = V2;
12966   }
12967 
12968   if (ShiftAmt < 0)
12969     return SDValue();
12970 
12971   assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
12972          "Illegal integer vector type");
12973   V = DAG.getBitcast(ShiftVT, V);
12974   V = DAG.getNode(Opcode, DL, ShiftVT, V,
12975                   DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
12976   return DAG.getBitcast(VT, V);
12977 }
12978 
12979 // EXTRQ: Extract Len elements from lower half of source, starting at Idx.
12980 // Remainder of lower half result is zero and upper half is all undef.
12981 static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
12982                                 ArrayRef<int> Mask, uint64_t &BitLen,
12983                                 uint64_t &BitIdx, const APInt &Zeroable) {
12984   int Size = Mask.size();
12985   int HalfSize = Size / 2;
12986   assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12987   assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask");
12988 
12989   // Upper half must be undefined.
12990   if (!isUndefUpperHalf(Mask))
12991     return false;
12992 
12993   // Determine the extraction length from the part of the
12994   // lower half that isn't zeroable.
12995   int Len = HalfSize;
12996   for (; Len > 0; --Len)
12997     if (!Zeroable[Len - 1])
12998       break;
12999   assert(Len > 0 && "Zeroable shuffle mask");
13000 
13001   // Attempt to match first Len sequential elements from the lower half.
13002   SDValue Src;
13003   int Idx = -1;
13004   for (int i = 0; i != Len; ++i) {
13005     int M = Mask[i];
13006     if (M == SM_SentinelUndef)
13007       continue;
13008     SDValue &V = (M < Size ? V1 : V2);
13009     M = M % Size;
13010 
13011     // The extracted elements must start at a valid index and all mask
13012     // elements must be in the lower half.
13013     if (i > M || M >= HalfSize)
13014       return false;
13015 
13016     if (Idx < 0 || (Src == V && Idx == (M - i))) {
13017       Src = V;
13018       Idx = M - i;
13019       continue;
13020     }
13021     return false;
13022   }
13023 
13024   if (!Src || Idx < 0)
13025     return false;
13026 
13027   assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
13028   BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
13029   BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
13030   V1 = Src;
13031   return true;
13032 }
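// Illustrative example: a v8i16 mask <2, 3, zz, zz, u, u, u, u> (upper half
// undef, lanes 2-3 zeroable) gives Len = 2 and Idx = 2, i.e. an EXTRQI with
// BitLen = 32 bits starting at BitIdx = 32.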
13033 
13034 // INSERTQ: Extract lowest Len elements from lower half of second source and
13035 // insert over first source, starting at Idx.
13036 // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
13037 static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
13038                                   ArrayRef<int> Mask, uint64_t &BitLen,
13039                                   uint64_t &BitIdx) {
13040   int Size = Mask.size();
13041   int HalfSize = Size / 2;
13042   assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
13043 
13044   // Upper half must be undefined.
13045   if (!isUndefUpperHalf(Mask))
13046     return false;
13047 
13048   for (int Idx = 0; Idx != HalfSize; ++Idx) {
13049     SDValue Base;
13050 
13051     // Attempt to match first source from mask before insertion point.
13052     if (isUndefInRange(Mask, 0, Idx)) {
13053       /* EMPTY */
13054     } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
13055       Base = V1;
13056     } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
13057       Base = V2;
13058     } else {
13059       continue;
13060     }
13061 
13062     // Extend the extraction length looking to match both the insertion of
13063     // the second source and the remaining elements of the first.
13064     for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
13065       SDValue Insert;
13066       int Len = Hi - Idx;
13067 
13068       // Match insertion.
13069       if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
13070         Insert = V1;
13071       } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
13072         Insert = V2;
13073       } else {
13074         continue;
13075       }
13076 
13077       // Match the remaining elements of the lower half.
13078       if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
13079         /* EMPTY */
13080       } else if ((!Base || (Base == V1)) &&
13081                  isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
13082         Base = V1;
13083       } else if ((!Base || (Base == V2)) &&
13084                  isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
13085                                             Size + Hi)) {
13086         Base = V2;
13087       } else {
13088         continue;
13089       }
13090 
13091       BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
13092       BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
13093       V1 = Base;
13094       V2 = Insert;
13095       return true;
13096     }
13097   }
13098 
13099   return false;
13100 }
13101 
13102 /// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
13103 static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
13104                                      SDValue V2, ArrayRef<int> Mask,
13105                                      const APInt &Zeroable, SelectionDAG &DAG) {
13106   uint64_t BitLen, BitIdx;
13107   if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
13108     return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
13109                        DAG.getTargetConstant(BitLen, DL, MVT::i8),
13110                        DAG.getTargetConstant(BitIdx, DL, MVT::i8));
13111 
13112   if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
13113     return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
13114                        V2 ? V2 : DAG.getUNDEF(VT),
13115                        DAG.getTargetConstant(BitLen, DL, MVT::i8),
13116                        DAG.getTargetConstant(BitIdx, DL, MVT::i8));
13117 
13118   return SDValue();
13119 }
13120 
13121 /// Lower a vector shuffle as a zero or any extension.
13122 ///
13123 /// Given a specific number of elements, element bit width, and extension
13124 /// stride, produce either a zero or any extension based on the available
13125 /// features of the subtarget. The extended elements are consecutive and
13126 /// can start from an offset element index in the input; to avoid excess
13127 /// shuffling, the offset must either be in the bottom lane or at the
13128 /// start of a higher lane. All extended elements must be from the same
13129 /// lane.
13130 static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(
13131     const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
13132     ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13133   assert(Scale > 1 && "Need a scale to extend.");
13134   int EltBits = VT.getScalarSizeInBits();
13135   int NumElements = VT.getVectorNumElements();
13136   int NumEltsPerLane = 128 / EltBits;
13137   int OffsetLane = Offset / NumEltsPerLane;
13138   assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
13139          "Only 8, 16, and 32 bit elements can be extended.");
13140   assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
13141   assert(0 <= Offset && "Extension offset must be positive.");
13142   assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
13143          "Extension offset must be in the first lane or start an upper lane.");
13144 
13145   // Check that an index is in same lane as the base offset.
13146   auto SafeOffset = [&](int Idx) {
13147     return OffsetLane == (Idx / NumEltsPerLane);
13148   };
13149 
13150   // Shift along an input so that the offset base moves to the first element.
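  // For example, with a v8i16 input, Scale == 2 and Offset == 2 this builds
  // the mask <2,3,4,5,u,u,u,u>.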
13151   auto ShuffleOffset = [&](SDValue V) {
13152     if (!Offset)
13153       return V;
13154 
13155     SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
13156     for (int i = 0; i * Scale < NumElements; ++i) {
13157       int SrcIdx = i + Offset;
13158       ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
13159     }
13160     return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
13161   };
13162 
13163   // Found a valid a/zext mask! Try various lowering strategies based on the
13164   // input type and available ISA extensions.
13165   if (Subtarget.hasSSE41()) {
13166     // Not worth offsetting 128-bit vectors if scale == 2; a pattern using
13167     // PUNPCK will catch this in a later shuffle match.
13168     if (Offset && Scale == 2 && VT.is128BitVector())
13169       return SDValue();
13170     MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
13171                                  NumElements / Scale);
13172     InputV = ShuffleOffset(InputV);
13173     InputV = getEXTEND_VECTOR_INREG(AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND,
13174                                     DL, ExtVT, InputV, DAG);
13175     return DAG.getBitcast(VT, InputV);
13176   }
13177 
13178   assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
13179 
13180   // For any extends we can cheat for larger element sizes and use shuffle
13181   // instructions that can fold with a load and/or copy.
13182   if (AnyExt && EltBits == 32) {
13183     int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
13184                          -1};
13185     return DAG.getBitcast(
13186         VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
13187                         DAG.getBitcast(MVT::v4i32, InputV),
13188                         getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
13189   }
13190   if (AnyExt && EltBits == 16 && Scale > 2) {
13191     int PSHUFDMask[4] = {Offset / 2, -1,
13192                          SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
13193     InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
13194                          DAG.getBitcast(MVT::v4i32, InputV),
13195                          getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
13196     int PSHUFWMask[4] = {1, -1, -1, -1};
13197     unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
13198     return DAG.getBitcast(
13199         VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
13200                         DAG.getBitcast(MVT::v8i16, InputV),
13201                         getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
13202   }
13203 
13204   // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
13205   // to 64-bits.
13206   if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
13207     assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
13208     assert(VT.is128BitVector() && "Unexpected vector width!");
13209 
13210     int LoIdx = Offset * EltBits;
13211     SDValue Lo = DAG.getBitcast(
13212         MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
13213                                 DAG.getTargetConstant(EltBits, DL, MVT::i8),
13214                                 DAG.getTargetConstant(LoIdx, DL, MVT::i8)));
13215 
13216     if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
13217       return DAG.getBitcast(VT, Lo);
13218 
13219     int HiIdx = (Offset + 1) * EltBits;
13220     SDValue Hi = DAG.getBitcast(
13221         MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
13222                                 DAG.getTargetConstant(EltBits, DL, MVT::i8),
13223                                 DAG.getTargetConstant(HiIdx, DL, MVT::i8)));
13224     return DAG.getBitcast(VT,
13225                           DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
13226   }
13227 
13228   // If this would require more than 2 unpack instructions to expand, use
13229   // pshufb when available. We can only use more than 2 unpack instructions
13230   // when zero extending i8 elements which also makes it easier to use pshufb.
13231   if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
13232     assert(NumElements == 16 && "Unexpected byte vector width!");
13233     SDValue PSHUFBMask[16];
13234     for (int i = 0; i < 16; ++i) {
13235       int Idx = Offset + (i / Scale);
13236       if ((i % Scale == 0 && SafeOffset(Idx))) {
13237         PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8);
13238         continue;
13239       }
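      // A PSHUFB mask byte with its top bit set zeroes that result byte, so
      // 0x80 provides the zero-extension; any-extended bytes can stay undef.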
13240       PSHUFBMask[i] =
13241           AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8);
13242     }
13243     InputV = DAG.getBitcast(MVT::v16i8, InputV);
13244     return DAG.getBitcast(
13245         VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
13246                         DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
13247   }
13248 
13249   // If we are extending from an offset, ensure we start on a boundary that
13250   // we can unpack from.
13251   int AlignToUnpack = Offset % (NumElements / Scale);
13252   if (AlignToUnpack) {
13253     SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
13254     for (int i = AlignToUnpack; i < NumElements; ++i)
13255       ShMask[i - AlignToUnpack] = i;
13256     InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
13257     Offset -= AlignToUnpack;
13258   }
13259 
13260   // Otherwise emit a sequence of unpacks.
13261   do {
13262     unsigned UnpackLoHi = X86ISD::UNPCKL;
13263     if (Offset >= (NumElements / 2)) {
13264       UnpackLoHi = X86ISD::UNPCKH;
13265       Offset -= (NumElements / 2);
13266     }
13267 
13268     MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
13269     SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
13270                          : getZeroVector(InputVT, Subtarget, DAG, DL);
13271     InputV = DAG.getBitcast(InputVT, InputV);
13272     InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
13273     Scale /= 2;
13274     EltBits *= 2;
13275     NumElements /= 2;
13276   } while (Scale > 1);
13277   return DAG.getBitcast(VT, InputV);
13278 }
13279 
13280 /// Try to lower a vector shuffle as a zero extension on any microarch.
13281 ///
13282 /// This routine will try to do everything in its power to cleverly lower
13283 /// a shuffle which happens to match the pattern of a zero extend. It doesn't
13284 /// check for the profitability of this lowering; it tries to aggressively
13285 /// match this pattern. It will use all of the micro-architectural details it
13286 /// can to emit an efficient lowering. It handles both blends with all-zero
13287 /// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
13288 /// masking out later).
13289 ///
13290 /// The reason we have dedicated lowering for zext-style shuffles is that they
13291 /// are both incredibly common and often quite performance sensitive.
13292 static SDValue lowerShuffleAsZeroOrAnyExtend(
13293     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13294     const APInt &Zeroable, const X86Subtarget &Subtarget,
13295     SelectionDAG &DAG) {
13296   int Bits = VT.getSizeInBits();
13297   int NumLanes = Bits / 128;
13298   int NumElements = VT.getVectorNumElements();
13299   int NumEltsPerLane = NumElements / NumLanes;
13300   assert(VT.getScalarSizeInBits() <= 32 &&
13301          "Exceeds 32-bit integer zero extension limit");
13302   assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
13303 
13304   // Define a helper function to check a particular ext-scale and lower to it if
13305   // valid.
13306   auto Lower = [&](int Scale) -> SDValue {
13307     SDValue InputV;
13308     bool AnyExt = true;
13309     int Offset = 0;
13310     int Matches = 0;
13311     for (int i = 0; i < NumElements; ++i) {
13312       int M = Mask[i];
13313       if (M < 0)
13314         continue; // Valid anywhere but doesn't tell us anything.
13315       if (i % Scale != 0) {
13316         // Each of the extended elements needs to be zeroable.
13317         if (!Zeroable[i])
13318           return SDValue();
13319 
13320         // We no longer are in the anyext case.
13321         AnyExt = false;
13322         continue;
13323       }
13324 
13325       // Each of the base elements needs to be consecutive indices into the
13326       // same input vector.
13327       SDValue V = M < NumElements ? V1 : V2;
13328       M = M % NumElements;
13329       if (!InputV) {
13330         InputV = V;
13331         Offset = M - (i / Scale);
13332       } else if (InputV != V)
13333         return SDValue(); // Flip-flopping inputs.
13334 
13335       // Offset must start in the lowest 128-bit lane or at the start of an
13336       // upper lane.
13337       // FIXME: Is it ever worth allowing a negative base offset?
13338       if (!((0 <= Offset && Offset < NumEltsPerLane) ||
13339             (Offset % NumEltsPerLane) == 0))
13340         return SDValue();
13341 
13342       // If we are offsetting, all referenced entries must come from the same
13343       // lane.
13344       if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
13345         return SDValue();
13346 
13347       if ((M % NumElements) != (Offset + (i / Scale)))
13348         return SDValue(); // Non-consecutive strided elements.
13349       Matches++;
13350     }
13351 
13352     // If we fail to find an input, we have a zero-shuffle which should always
13353     // have already been handled.
13354     // FIXME: Maybe handle this here in case during blending we end up with one?
13355     if (!InputV)
13356       return SDValue();
13357 
13358     // If we are offsetting, don't extend if we only match a single input; we
13359     // can always do better by using a basic PSHUF or PUNPCK.
13360     if (Offset != 0 && Matches < 2)
13361       return SDValue();
13362 
13363     return lowerShuffleAsSpecificZeroOrAnyExtend(DL, VT, Scale, Offset, AnyExt,
13364                                                  InputV, Mask, Subtarget, DAG);
13365   };
13366 
13367   // The widest scale possible for extending is to a 64-bit integer.
13368   assert(Bits % 64 == 0 &&
13369          "The number of bits in a vector must be divisible by 64 on x86!");
13370   int NumExtElements = Bits / 64;
13371 
13372   // Each iteration, try extending the elements half as much, but into twice as
13373   // many elements.
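  // For example, a 128-bit v16i8 shuffle tries Scale == 8 (i8 -> i64), then
  // Scale == 4 (i8 -> i32), then Scale == 2 (i8 -> i16).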
13374   for (; NumExtElements < NumElements; NumExtElements *= 2) {
13375     assert(NumElements % NumExtElements == 0 &&
13376            "The input vector size must be divisible by the extended size.");
13377     if (SDValue V = Lower(NumElements / NumExtElements))
13378       return V;
13379   }
13380 
13381   // General extends failed, but 128-bit vectors may be able to use MOVQ.
13382   if (Bits != 128)
13383     return SDValue();
13384 
13385   // Returns one of the source operands if the shuffle can be reduced to a
13386   // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
13387   auto CanZExtLowHalf = [&]() {
13388     for (int i = NumElements / 2; i != NumElements; ++i)
13389       if (!Zeroable[i])
13390         return SDValue();
13391     if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
13392       return V1;
13393     if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
13394       return V2;
13395     return SDValue();
13396   };
13397 
13398   if (SDValue V = CanZExtLowHalf()) {
13399     V = DAG.getBitcast(MVT::v2i64, V);
13400     V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
13401     return DAG.getBitcast(VT, V);
13402   }
13403 
13404   // No viable ext lowering found.
13405   return SDValue();
13406 }
13407 
13408 /// Try to get a scalar value for a specific element of a vector.
13409 ///
13410 /// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
13411 static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
13412                                               SelectionDAG &DAG) {
13413   MVT VT = V.getSimpleValueType();
13414   MVT EltVT = VT.getVectorElementType();
13415   V = peekThroughBitcasts(V);
13416 
13417   // If the bitcasts shift the element size, we can't extract an equivalent
13418   // element from it.
13419   MVT NewVT = V.getSimpleValueType();
13420   if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
13421     return SDValue();
13422 
13423   if (V.getOpcode() == ISD::BUILD_VECTOR ||
13424       (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
13425     // Ensure the scalar operand is the same size as the destination.
13426     // FIXME: Add support for scalar truncation where possible.
13427     SDValue S = V.getOperand(Idx);
13428     if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
13429       return DAG.getBitcast(EltVT, S);
13430   }
13431 
13432   return SDValue();
13433 }
13434 
13435 /// Helper to test for a load that can be folded with x86 shuffles.
13436 ///
13437 /// This is particularly important because the set of instructions varies
13438 /// significantly based on whether the operand is a load or not.
13439 static bool isShuffleFoldableLoad(SDValue V) {
13440   V = peekThroughBitcasts(V);
13441   return ISD::isNON_EXTLoad(V.getNode());
13442 }
13443 
13444 /// Try to lower insertion of a single element into a zero vector.
13445 ///
13446 /// This is a common pattern that we have especially efficient patterns to lower
13447 /// across all subtarget feature sets.
13448 static SDValue lowerShuffleAsElementInsertion(
13449     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13450     const APInt &Zeroable, const X86Subtarget &Subtarget,
13451     SelectionDAG &DAG) {
13452   MVT ExtVT = VT;
13453   MVT EltVT = VT.getVectorElementType();
13454 
13455   int V2Index =
13456       find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
13457       Mask.begin();
13458   bool IsV1Zeroable = true;
13459   for (int i = 0, Size = Mask.size(); i < Size; ++i)
13460     if (i != V2Index && !Zeroable[i]) {
13461       IsV1Zeroable = false;
13462       break;
13463     }
13464 
13465   // Check for a single input from a SCALAR_TO_VECTOR node.
13466   // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
13467   // all the smarts here sunk into that routine. However, the current
13468   // lowering of BUILD_VECTOR makes that nearly impossible until the old
13469   // vector shuffle lowering is dead.
13470   SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
13471                                                DAG);
13472   if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
13473     // We need to zext the scalar if it is smaller than an i32.
13474     V2S = DAG.getBitcast(EltVT, V2S);
13475     if (EltVT == MVT::i8 || EltVT == MVT::i16) {
13476       // Using zext to expand a narrow element won't work for non-zero
13477       // insertions.
13478       if (!IsV1Zeroable)
13479         return SDValue();
13480 
13481       // Zero-extend directly to i32.
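      // For example, both v16i8 and v8i16 become v4i32 here
      // (128 / 32 == 4 elements).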
13482       ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
13483       V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
13484     }
13485     V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
13486   } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
13487              EltVT == MVT::i16) {
13488     // Either not inserting from the low element of the input or the input
13489     // element size is too small to use VZEXT_MOVL to clear the high bits.
13490     return SDValue();
13491   }
13492 
13493   if (!IsV1Zeroable) {
13494     // If V1 can't be treated as a zero vector we have fewer options to lower
13495     // this. We can't support integer vectors or non-zero targets cheaply, and
13496     // the V1 elements can't be permuted in any way.
13497     assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
13498     if (!VT.isFloatingPoint() || V2Index != 0)
13499       return SDValue();
13500     SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
13501     V1Mask[V2Index] = -1;
13502     if (!isNoopShuffleMask(V1Mask))
13503       return SDValue();
13504     if (!VT.is128BitVector())
13505       return SDValue();
13506 
13507     // Otherwise, use MOVSD or MOVSS.
13508     assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
13509            "Only two types of floating point element types to handle!");
13510     return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
13511                        ExtVT, V1, V2);
13512   }
13513 
13514   // This lowering only works for the low element with floating point vectors.
13515   if (VT.isFloatingPoint() && V2Index != 0)
13516     return SDValue();
13517 
13518   V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
13519   if (ExtVT != VT)
13520     V2 = DAG.getBitcast(VT, V2);
13521 
13522   if (V2Index != 0) {
13523     // If we have 4 or fewer lanes we can cheaply shuffle the element into
13524     // the desired position. Otherwise it is more efficient to do a vector
13525     // shift left. We know that we can do a vector shift left because all
13526     // the inputs are zero.
13527     if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
13528       SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
13529       V2Shuffle[V2Index] = 0;
13530       V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
13531     } else {
13532       V2 = DAG.getBitcast(MVT::v16i8, V2);
13533       V2 = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
13534                        DAG.getTargetConstant(
13535                            V2Index * EltVT.getSizeInBits() / 8, DL, MVT::i8));
13536       V2 = DAG.getBitcast(VT, V2);
13537     }
13538   }
13539   return V2;
13540 }
13541 
13542 /// Try to lower broadcast of a single - truncated - integer element,
13543 /// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
13544 ///
13545 /// This assumes we have AVX2.
13546 static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0,
13547                                             int BroadcastIdx,
13548                                             const X86Subtarget &Subtarget,
13549                                             SelectionDAG &DAG) {
13550   assert(Subtarget.hasAVX2() &&
13551          "We can only lower integer broadcasts with AVX2!");
13552 
13553   MVT EltVT = VT.getVectorElementType();
13554   MVT V0VT = V0.getSimpleValueType();
13555 
13556   assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
13557   assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
13558 
13559   MVT V0EltVT = V0VT.getVectorElementType();
13560   if (!V0EltVT.isInteger())
13561     return SDValue();
13562 
13563   const unsigned EltSize = EltVT.getSizeInBits();
13564   const unsigned V0EltSize = V0EltVT.getSizeInBits();
13565 
13566   // This is only a truncation if the original element type is larger.
13567   if (V0EltSize <= EltSize)
13568     return SDValue();
13569 
13570   assert(((V0EltSize % EltSize) == 0) &&
13571          "Scalar type sizes must all be powers of 2 on x86!");
13572 
13573   const unsigned V0Opc = V0.getOpcode();
13574   const unsigned Scale = V0EltSize / EltSize;
13575   const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
13576 
13577   if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
13578       V0Opc != ISD::BUILD_VECTOR)
13579     return SDValue();
13580 
13581   SDValue Scalar = V0.getOperand(V0BroadcastIdx);
13582 
13583   // If we're extracting non-least-significant bits, shift so we can truncate.
13584   // Hopefully, we can fold away the trunc/srl/load into the broadcast.
13585   // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
13586   // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
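  // For example, broadcasting byte 3 of a v4i32 build_vector gives Scale == 4
  // and OffsetIdx == 3, so we shift operand 0 right by 24 bits before the
  // truncate and broadcast.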
13587   if (const int OffsetIdx = BroadcastIdx % Scale)
13588     Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
13589                          DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));
13590 
13591   return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
13592                      DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
13593 }
13594 
13595 /// Test whether this can be lowered with a single SHUFPS instruction.
13596 ///
13597 /// This is used to disable more specialized lowerings when the shufps lowering
13598 /// will happen to be efficient.
13599 static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
13600   // This routine only handles 128-bit shufps.
13601   assert(Mask.size() == 4 && "Unsupported mask size!");
13602   assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
13603   assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
13604   assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
13605   assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
13606 
13607   // To lower with a single SHUFPS we need to have the low half and high half
13608   // each requiring a single input.
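  // For example, <0,1,6,7> qualifies (low half from V1, high half from V2),
  // but <0,4,1,5> does not since each half mixes both inputs.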
13609   if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
13610     return false;
13611   if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
13612     return false;
13613 
13614   return true;
13615 }
13616 
13617 /// If we are extracting two 128-bit halves of a vector and shuffling the
13618 /// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
13619 /// multi-shuffle lowering.
13620 static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
13621                                              SDValue N1, ArrayRef<int> Mask,
13622                                              SelectionDAG &DAG) {
13623   MVT VT = N0.getSimpleValueType();
13624   assert((VT.is128BitVector() &&
13625           (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
13626          "VPERM* family of shuffles requires 32-bit or 64-bit elements");
13627 
13628   // Check that both sources are extracts of the same source vector.
13629   if (!N0.hasOneUse() || !N1.hasOneUse() ||
13630       N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
13631       N1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
13632       N0.getOperand(0) != N1.getOperand(0))
13633     return SDValue();
13634 
13635   SDValue WideVec = N0.getOperand(0);
13636   MVT WideVT = WideVec.getSimpleValueType();
13637   if (!WideVT.is256BitVector())
13638     return SDValue();
13639 
13640   // Match extracts of each half of the wide source vector. Commute the shuffle
13641   // if the extract of the low half is N1.
13642   unsigned NumElts = VT.getVectorNumElements();
13643   SmallVector<int, 4> NewMask(Mask.begin(), Mask.end());
13644   const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);
13645   const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);
13646   if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
13647     ShuffleVectorSDNode::commuteMask(NewMask);
13648   else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
13649     return SDValue();
13650 
13651   // Final bailout: if the mask is simple, we are better off using an extract
13652   // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
13653   // because that avoids a constant load from memory.
13654   if (NumElts == 4 &&
13655       (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask)))
13656     return SDValue();
13657 
13658   // Extend the shuffle mask with undef elements.
13659   NewMask.append(NumElts, -1);
13660 
13661   // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
13662   SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
13663                                       NewMask);
13664   // This is free: ymm -> xmm.
13665   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
13666                      DAG.getIntPtrConstant(0, DL));
13667 }
13668 
13669 /// Try to lower broadcast of a single element.
13670 ///
13671 /// For convenience, this code also bundles all of the subtarget feature set
13672 /// filtering. While a little annoying to re-dispatch on type here, there isn't
13673 /// a convenient way to factor it out.
13674 static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
13675                                        SDValue V2, ArrayRef<int> Mask,
13676                                        const X86Subtarget &Subtarget,
13677                                        SelectionDAG &DAG) {
13678   if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
13679         (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
13680         (Subtarget.hasAVX2() && VT.isInteger())))
13681     return SDValue();
13682 
13683   // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
13684   // we can only broadcast from a register with AVX2.
13685   unsigned NumEltBits = VT.getScalarSizeInBits();
13686   unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
13687                         ? X86ISD::MOVDDUP
13688                         : X86ISD::VBROADCAST;
13689   bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
13690 
13691   // Check that the mask is a broadcast.
13692   int BroadcastIdx = getSplatIndex(Mask);
13693   if (BroadcastIdx < 0)
13694     return SDValue();
13695   assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
13696                                             "a sorted mask where the broadcast "
13697                                             "comes from V1.");
13698 
13699   // Go up the chain of (vector) values to find a scalar load that we can
13700   // combine with the broadcast.
13701   // TODO: Combine this logic with findEltLoadSrc() used by
13702   //       EltsFromConsecutiveLoads().
13703   int BitOffset = BroadcastIdx * NumEltBits;
13704   SDValue V = V1;
13705   for (;;) {
13706     switch (V.getOpcode()) {
13707     case ISD::BITCAST: {
13708       V = V.getOperand(0);
13709       continue;
13710     }
13711     case ISD::CONCAT_VECTORS: {
13712       int OpBitWidth = V.getOperand(0).getValueSizeInBits();
13713       int OpIdx = BitOffset / OpBitWidth;
13714       V = V.getOperand(OpIdx);
13715       BitOffset %= OpBitWidth;
13716       continue;
13717     }
13718     case ISD::EXTRACT_SUBVECTOR: {
13719       // The extraction index adds to the existing offset.
13720       unsigned EltBitWidth = V.getScalarValueSizeInBits();
13721       unsigned Idx = V.getConstantOperandVal(1);
13722       unsigned BeginOffset = Idx * EltBitWidth;
13723       BitOffset += BeginOffset;
13724       V = V.getOperand(0);
13725       continue;
13726     }
13727     case ISD::INSERT_SUBVECTOR: {
13728       SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
13729       int EltBitWidth = VOuter.getScalarValueSizeInBits();
13730       int Idx = (int)V.getConstantOperandVal(2);
13731       int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
13732       int BeginOffset = Idx * EltBitWidth;
13733       int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
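      // Step into the inserted subvector only if the broadcast element's bit
      // range lies inside it; otherwise keep walking through the outer vector.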
13734       if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
13735         BitOffset -= BeginOffset;
13736         V = VInner;
13737       } else {
13738         V = VOuter;
13739       }
13740       continue;
13741     }
13742     }
13743     break;
13744   }
13745   assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
13746   BroadcastIdx = BitOffset / NumEltBits;
13747 
13748   // Do we need to bitcast the source to retrieve the original broadcast index?
13749   bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
13750 
13751   // Check if this is a broadcast of a scalar. We special case lowering
13752   // for scalars so that we can more effectively fold with loads.
13753   // If the original value has a larger element type than the shuffle, the
13754   // broadcast element is in essence truncated. Make that explicit to ease
13755   // folding.
13756   if (BitCastSrc && VT.isInteger())
13757     if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
13758             DL, VT, V, BroadcastIdx, Subtarget, DAG))
13759       return TruncBroadcast;
13760 
13761   // Also check the simpler case, where we can directly reuse the scalar.
13762   if (!BitCastSrc &&
13763       ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
13764        (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
13765     V = V.getOperand(BroadcastIdx);
13766 
13767     // If we can't broadcast from a register, check that the input is a load.
13768     if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
13769       return SDValue();
13770   } else if (ISD::isNormalLoad(V.getNode()) &&
13771              cast<LoadSDNode>(V)->isSimple()) {
13772     // We do not check for one-use of the vector load because a broadcast load
13773     // is expected to be a win for code size, register pressure, and possibly
13774     // uops even if the original vector load is not eliminated.
13775 
13776     // Reduce the vector load and shuffle to a broadcasted scalar load.
13777     LoadSDNode *Ld = cast<LoadSDNode>(V);
13778     SDValue BaseAddr = Ld->getOperand(1);
13779     MVT SVT = VT.getScalarType();
13780     unsigned Offset = BroadcastIdx * SVT.getStoreSize();
13781     assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
13782     SDValue NewAddr =
13783         DAG.getMemBasePlusOffset(BaseAddr, TypeSize::Fixed(Offset), DL);
13784 
13785     // Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather
13786     // than MOVDDUP.
13787     // FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX?
13788     if (Opcode == X86ISD::VBROADCAST) {
13789       SDVTList Tys = DAG.getVTList(VT, MVT::Other);
13790       SDValue Ops[] = {Ld->getChain(), NewAddr};
13791       V = DAG.getMemIntrinsicNode(
13792           X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SVT,
13793           DAG.getMachineFunction().getMachineMemOperand(
13794               Ld->getMemOperand(), Offset, SVT.getStoreSize()));
13795       DAG.makeEquivalentMemoryOrdering(Ld, V);
13796       return DAG.getBitcast(VT, V);
13797     }
13798     assert(SVT == MVT::f64 && "Unexpected VT!");
13799     V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
13800                     DAG.getMachineFunction().getMachineMemOperand(
13801                         Ld->getMemOperand(), Offset, SVT.getStoreSize()));
13802     DAG.makeEquivalentMemoryOrdering(Ld, V);
13803   } else if (!BroadcastFromReg) {
13804     // We can't broadcast from a vector register.
13805     return SDValue();
13806   } else if (BitOffset != 0) {
13807     // We can only broadcast from the zero-element of a vector register,
13808     // but it can be advantageous to broadcast from the zero-element of a
13809     // subvector.
13810     if (!VT.is256BitVector() && !VT.is512BitVector())
13811       return SDValue();
13812 
13813     // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
13814     if (VT == MVT::v4f64 || VT == MVT::v4i64)
13815       return SDValue();
13816 
13817     // Only broadcast the zero-element of a 128-bit subvector.
13818     if ((BitOffset % 128) != 0)
13819       return SDValue();
13820 
13821     assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
13822            "Unexpected bit-offset");
13823     assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
13824            "Unexpected vector size");
13825     unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
13826     V = extract128BitVector(V, ExtractIdx, DAG, DL);
13827   }
13828 
13829   // On AVX we can use VBROADCAST directly for scalar sources.
13830   if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) {
13831     V = DAG.getBitcast(MVT::f64, V);
13832     if (Subtarget.hasAVX()) {
13833       V = DAG.getNode(X86ISD::VBROADCAST, DL, MVT::v2f64, V);
13834       return DAG.getBitcast(VT, V);
13835     }
13836     V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V);
13837   }
13838 
13839   // If this is a scalar, do the broadcast on this type and bitcast.
13840   if (!V.getValueType().isVector()) {
13841     assert(V.getScalarValueSizeInBits() == NumEltBits &&
13842            "Unexpected scalar size");
13843     MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(),
13844                                        VT.getVectorNumElements());
13845     return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
13846   }
13847 
13848   // We only support broadcasting from 128-bit vectors to minimize the
13849   // number of patterns we need to deal with in isel. So extract down to
13850   // 128-bits, removing as many bitcasts as possible.
13851   if (V.getValueSizeInBits() > 128)
13852     V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
13853 
13854   // Otherwise cast V to a vector with the same element type as VT, but
13855   // possibly narrower than VT. Then perform the broadcast.
13856   unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
13857   MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), NumSrcElts);
13858   return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(CastVT, V));
13859 }
13860 
13861 // Check for whether we can use INSERTPS to perform the shuffle. We only use
13862 // INSERTPS when the V1 elements are already in the correct locations
13863 // because otherwise we can just always use two SHUFPS instructions which
13864 // are much smaller to encode than a SHUFPS and an INSERTPS. We can also
13865 // perform INSERTPS if a single V1 element is out of place and all V2
13866 // elements are zeroable.
13867 static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2,
13868                                    unsigned &InsertPSMask,
13869                                    const APInt &Zeroable,
13870                                    ArrayRef<int> Mask, SelectionDAG &DAG) {
13871   assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
13872   assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
13873   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13874 
13875   // Attempt to match INSERTPS with one element from VA or VB being
13876   // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
13877   // are updated.
13878   auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
13879                              ArrayRef<int> CandidateMask) {
13880     unsigned ZMask = 0;
13881     int VADstIndex = -1;
13882     int VBDstIndex = -1;
13883     bool VAUsedInPlace = false;
13884 
13885     for (int i = 0; i < 4; ++i) {
13886       // Synthesize a zero mask from the zeroable elements (includes undefs).
13887       if (Zeroable[i]) {
13888         ZMask |= 1 << i;
13889         continue;
13890       }
13891 
13892       // Flag if we use any VA inputs in place.
13893       if (i == CandidateMask[i]) {
13894         VAUsedInPlace = true;
13895         continue;
13896       }
13897 
13898       // We can only insert a single non-zeroable element.
13899       if (VADstIndex >= 0 || VBDstIndex >= 0)
13900         return false;
13901 
13902       if (CandidateMask[i] < 4) {
13903         // VA input out of place for insertion.
13904         VADstIndex = i;
13905       } else {
13906         // VB input for insertion.
13907         VBDstIndex = i;
13908       }
13909     }
13910 
13911     // Don't bother if we have no (non-zeroable) element for insertion.
13912     if (VADstIndex < 0 && VBDstIndex < 0)
13913       return false;
13914 
13915     // Determine element insertion src/dst indices. The src index is from the
13916     // start of the inserted vector, not the start of the concatenated vector.
13917     unsigned VBSrcIndex = 0;
13918     if (VADstIndex >= 0) {
13919       // If we have a VA input out of place, we use VA as the V2 element
13920       // insertion and don't use the original V2 at all.
13921       VBSrcIndex = CandidateMask[VADstIndex];
13922       VBDstIndex = VADstIndex;
13923       VB = VA;
13924     } else {
13925       VBSrcIndex = CandidateMask[VBDstIndex] - 4;
13926     }
13927 
13928     // If no V1 inputs are used in place, then the result is created only from
13929     // the zero mask and the V2 insertion - so remove V1 dependency.
13930     if (!VAUsedInPlace)
13931       VA = DAG.getUNDEF(MVT::v4f32);
13932 
13933     // Update V1, V2 and InsertPSMask accordingly.
13934     V1 = VA;
13935     V2 = VB;
13936 
13937     // Insert the V2 element into the desired position.
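    // The INSERTPS immediate packs the source element in bits [7:6], the
    // destination element in bits [5:4] and the zero mask in bits [3:0].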
13938     InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
13939     assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
13940     return true;
13941   };
13942 
13943   if (matchAsInsertPS(V1, V2, Mask))
13944     return true;
13945 
13946   // Commute and try again.
13947   SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
13948   ShuffleVectorSDNode::commuteMask(CommutedMask);
13949   if (matchAsInsertPS(V2, V1, CommutedMask))
13950     return true;
13951 
13952   return false;
13953 }
13954 
13955 static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
13956                                       ArrayRef<int> Mask, const APInt &Zeroable,
13957                                       SelectionDAG &DAG) {
13958   assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13959   assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13960 
13961   // Attempt to match the insertps pattern.
13962   unsigned InsertPSMask = 0;
13963   if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
13964     return SDValue();
13965 
13966   // Insert the V2 element into the desired position.
13967   return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
13968                      DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
13969 }
13970 
13971 /// Try to lower a shuffle as a permute of the inputs followed by an
13972 /// UNPCK instruction.
13973 ///
13974 /// This specifically targets cases where we end up with alternating between
13975 /// the two inputs, and so can permute them into something that feeds a single
13976 /// UNPCK instruction. Note that this routine only targets integer vectors
13977 /// because for floating point vectors we have a generalized SHUFPS lowering
13978 /// strategy that handles everything that doesn't *exactly* match an unpack,
13979 /// making this clever lowering unnecessary.
13980 static SDValue lowerShuffleAsPermuteAndUnpack(
13981     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13982     const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13983   assert(!VT.isFloatingPoint() &&
13984          "This routine only supports integer vectors.");
13985   assert(VT.is128BitVector() &&
13986          "This routine only works on 128-bit vectors.");
13987   assert(!V2.isUndef() &&
13988          "This routine should only be used when blending two inputs.");
13989   assert(Mask.size() >= 2 && "Single element masks are invalid.");
13990 
13991   int Size = Mask.size();
13992 
13993   int NumLoInputs =
13994       count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
13995   int NumHiInputs =
13996       count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
13997 
13998   bool UnpackLo = NumLoInputs >= NumHiInputs;
13999 
14000   auto TryUnpack = [&](int ScalarSize, int Scale) {
14001     SmallVector<int, 16> V1Mask((unsigned)Size, -1);
14002     SmallVector<int, 16> V2Mask((unsigned)Size, -1);
14003 
14004     for (int i = 0; i < Size; ++i) {
14005       if (Mask[i] < 0)
14006         continue;
14007 
14008       // Each element of the unpack contains Scale elements from this mask.
14009       int UnpackIdx = i / Scale;
14010 
14011       // We only handle the case where V1 feeds the first slots of the unpack.
14012       // We rely on canonicalization to ensure this is the case.
14013       if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
14014         return SDValue();
14015 
14016       // Setup the mask for this input. The indexing is tricky as we have to
14017       // handle the unpack stride.
14018       SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
14019       VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
14020           Mask[i] % Size;
14021     }
14022 
14023     // If we will have to shuffle both inputs to use the unpack, check whether
14024     // we can just unpack first and shuffle the result. If so, skip this unpack.
14025     if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
14026         !isNoopShuffleMask(V2Mask))
14027       return SDValue();
14028 
14029     // Shuffle the inputs into place.
14030     V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
14031     V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
14032 
14033     // Cast the inputs to the type we will use to unpack them.
14034     MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
14035     V1 = DAG.getBitcast(UnpackVT, V1);
14036     V2 = DAG.getBitcast(UnpackVT, V2);
14037 
14038     // Unpack the inputs and cast the result back to the desired type.
14039     return DAG.getBitcast(
14040         VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
14041                         UnpackVT, V1, V2));
14042   };
14043 
14044   // We try each unpack from the largest to the smallest to try and find one
14045   // that fits this mask.
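  // For example, for a v8i16 shuffle this tries unpacking as v2i64, then
  // v4i32, then v8i16 itself.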
14046   int OrigScalarSize = VT.getScalarSizeInBits();
14047   for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
14048     if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
14049       return Unpack;
14050 
14051   // If we're shuffling with a zero vector then we're better off not doing
14052   // VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.
14053   if (ISD::isBuildVectorAllZeros(V1.getNode()) ||
14054       ISD::isBuildVectorAllZeros(V2.getNode()))
14055     return SDValue();
14056 
14057   // If none of the unpack-rooted lowerings worked (or were profitable) try an
14058   // initial unpack.
14059   if (NumLoInputs == 0 || NumHiInputs == 0) {
14060     assert((NumLoInputs > 0 || NumHiInputs > 0) &&
14061            "We have to have *some* inputs!");
14062     int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
14063 
14064     // FIXME: We could consider the total complexity of the permute of each
14065     // possible unpacking. Or at the least we should consider how many
14066     // half-crossings are created.
14067     // FIXME: We could consider commuting the unpacks.
14068 
14069     SmallVector<int, 32> PermMask((unsigned)Size, -1);
14070     for (int i = 0; i < Size; ++i) {
14071       if (Mask[i] < 0)
14072         continue;
14073 
14074       assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
14075 
14076       PermMask[i] =
14077           2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
14078     }
14079     return DAG.getVectorShuffle(
14080         VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
14081                             DL, VT, V1, V2),
14082         DAG.getUNDEF(VT), PermMask);
14083   }
14084 
14085   return SDValue();
14086 }
14087 
14088 /// Handle lowering of 2-lane 64-bit floating point shuffles.
14089 ///
14090 /// This is the basis function for the 2-lane 64-bit shuffles as we have full
14091 /// support for floating point shuffles but not integer shuffles. These
14092 /// instructions will incur a domain crossing penalty on some chips though so
14093 /// it is better to avoid lowering through this for integer vectors where
14094 /// possible.
14095 static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14096                                  const APInt &Zeroable, SDValue V1, SDValue V2,
14097                                  const X86Subtarget &Subtarget,
14098                                  SelectionDAG &DAG) {
14099   assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
14100   assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
14101   assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
14102 
14103   if (V2.isUndef()) {
14104     // Check for being able to broadcast a single element.
14105     if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,
14106                                                     Mask, Subtarget, DAG))
14107       return Broadcast;
14108 
14109     // Straight shuffle of a single input vector. Simulate this by using the
14110     // single input as both of the "inputs" to this instruction.
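    // SHUFPD's immediate selects one element per result lane: bit 0 picks the
    // element of the first source for the low lane and bit 1 picks the element
    // of the second source for the high lane.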
14111     unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
14112 
14113     if (Subtarget.hasAVX()) {
14114       // If we have AVX, we can use VPERMILPS which will allow folding a load
14115       // into the shuffle.
14116       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
14117                          DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
14118     }
14119 
14120     return DAG.getNode(
14121         X86ISD::SHUFP, DL, MVT::v2f64,
14122         Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
14123         Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
14124         DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
14125   }
14126   assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
14127   assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
14128   assert(Mask[0] < 2 && "We sort V1 to be the first input.");
14129   assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
14130 
14131   if (Subtarget.hasAVX2())
14132     if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
14133       return Extract;
14134 
14135   // When loading a scalar and then shuffling it into a vector we can often do
14136   // the insertion cheaply.
14137   if (SDValue Insertion = lowerShuffleAsElementInsertion(
14138           DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
14139     return Insertion;
14140   // Try inverting the insertion since for v2 masks it is easy to do and we
14141   // can't reliably sort the mask one way or the other.
14142   int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
14143                         Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
14144   if (SDValue Insertion = lowerShuffleAsElementInsertion(
14145           DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
14146     return Insertion;
14147 
14148   // Try to use one of the special instruction patterns to handle two common
14149   // blend patterns if a zero-blend above didn't work.
14150   if (isShuffleEquivalent(Mask, {0, 3}, V1, V2) ||
14151       isShuffleEquivalent(Mask, {1, 3}, V1, V2))
14152     if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
14153       // We can either use a special instruction to load over the low double or
14154       // to move just the low double.
14155       return DAG.getNode(
14156           X86ISD::MOVSD, DL, MVT::v2f64, V2,
14157           DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
14158 
14159   if (Subtarget.hasSSE41())
14160     if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
14161                                             Zeroable, Subtarget, DAG))
14162       return Blend;
14163 
14164   // Use dedicated unpack instructions for masks that match their pattern.
14165   if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
14166     return V;
14167 
14168   unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
14169   return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
14170                      DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
14171 }
14172 
14173 /// Handle lowering of 2-lane 64-bit integer shuffles.
14174 ///
14175 /// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
14176 /// the integer unit to minimize domain crossing penalties. However, for blends
14177 /// it falls back to the floating point shuffle operation with appropriate bit
14178 /// casting.
14179 static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14180                                  const APInt &Zeroable, SDValue V1, SDValue V2,
14181                                  const X86Subtarget &Subtarget,
14182                                  SelectionDAG &DAG) {
14183   assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
14184   assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
14185   assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
14186 
14187   if (V2.isUndef()) {
14188     // Check for being able to broadcast a single element.
14189     if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,
14190                                                     Mask, Subtarget, DAG))
14191       return Broadcast;
14192 
14193     // Straight shuffle of a single input vector. For everything from SSE2
14194     // onward this has a single fast instruction with no scary immediates.
14195     // We have to map the mask as it is actually a v4i32 shuffle instruction.
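    // For example, the v2i64 mask <1,0> becomes the v4i32 PSHUFD mask
    // <2,3,0,1>.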
14196     V1 = DAG.getBitcast(MVT::v4i32, V1);
14197     int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
14198                           Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
14199                           Mask[1] < 0 ? -1 : (Mask[1] * 2),
14200                           Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
14201     return DAG.getBitcast(
14202         MVT::v2i64,
14203         DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
14204                     getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
14205   }
14206   assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
14207   assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
14208   assert(Mask[0] < 2 && "We sort V1 to be the first input.");
14209   assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
14210 
14211   if (Subtarget.hasAVX2())
14212     if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
14213       return Extract;
14214 
14215   // Try to use shift instructions.
14216   if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
14217                                           Zeroable, Subtarget, DAG))
14218     return Shift;
14219 
14220   // When loading a scalar and then shuffling it into a vector we can often do
14221   // the insertion cheaply.
14222   if (SDValue Insertion = lowerShuffleAsElementInsertion(
14223           DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
14224     return Insertion;
14225   // Try inverting the insertion since for v2 masks it is easy to do and we
14226   // can't reliably sort the mask one way or the other.
14227   int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
14228   if (SDValue Insertion = lowerShuffleAsElementInsertion(
14229           DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
14230     return Insertion;
14231 
14232   // We have different paths for blend lowering, but they all must use the
14233   // *exact* same predicate.
14234   bool IsBlendSupported = Subtarget.hasSSE41();
14235   if (IsBlendSupported)
14236     if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
14237                                             Zeroable, Subtarget, DAG))
14238       return Blend;
14239 
14240   // Use dedicated unpack instructions for masks that match their pattern.
14241   if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
14242     return V;
14243 
14244   // Try to use byte rotation instructions.
14245   // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
14246   if (Subtarget.hasSSSE3()) {
14247     if (Subtarget.hasVLX())
14248       if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask,
14249                                                 Subtarget, DAG))
14250         return Rotate;
14251 
14252     if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
14253                                                   Subtarget, DAG))
14254       return Rotate;
14255   }
14256 
14257   // If we have direct support for blends, we should lower by decomposing into
14258   // a permute. That will be faster than the domain cross.
14259   if (IsBlendSupported)
14260     return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask,
14261                                                 Subtarget, DAG);
14262 
14263   // We implement this with SHUFPD which is pretty lame because it will likely
14264   // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
14265   // However, all the alternatives are still more cycles and newer chips don't
14266   // have this problem. It would be really nice if x86 had better shuffles here.
14267   V1 = DAG.getBitcast(MVT::v2f64, V1);
14268   V2 = DAG.getBitcast(MVT::v2f64, V2);
14269   return DAG.getBitcast(MVT::v2i64,
14270                         DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
14271 }
14272 
14273 /// Lower a vector shuffle using the SHUFPS instruction.
14274 ///
14275 /// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
14276 /// It makes no assumptions about whether this is the *best* lowering, it simply
14277 /// uses it.
14278 static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
14279                                       ArrayRef<int> Mask, SDValue V1,
14280                                       SDValue V2, SelectionDAG &DAG) {
14281   SDValue LowV = V1, HighV = V2;
14282   SmallVector<int, 4> NewMask(Mask.begin(), Mask.end());
14283   int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
14284 
14285   if (NumV2Elements == 1) {
14286     int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
14287 
14288     // Compute the index adjacent to V2Index and in the same half by toggling
14289     // the low bit.
14290     int V2AdjIndex = V2Index ^ 1;
14291 
14292     if (Mask[V2AdjIndex] < 0) {
14293       // Handles all the cases where we have a single V2 element and an undef.
14294       // This will only ever happen in the high lanes because we commute the
14295       // vector otherwise.
14296       if (V2Index < 2)
14297         std::swap(LowV, HighV);
14298       NewMask[V2Index] -= 4;
14299     } else {
14300       // Handle the case where the V2 element ends up adjacent to a V1 element.
14301       // To make this work, blend them together as the first step.
14302       int V1Index = V2AdjIndex;
14303       int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
14304       V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
14305                        getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
14306 
14307       // Now proceed to reconstruct the final blend as we have the necessary
14308       // high or low half formed.
14309       if (V2Index < 2) {
14310         LowV = V2;
14311         HighV = V1;
14312       } else {
14313         HighV = V2;
14314       }
14315       NewMask[V1Index] = 2; // We put the V1 element in V2[2].
14316       NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
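      // For example, with Mask = <0, 5, 2, 3> we get V2Index == 1: the blend
      // produces <V2[1], V2[0], V1[0], V1[0]> and the final SHUFP then reads
      // lanes <2, 0> of that result and lanes <2, 3> of V1, giving
      // <V1[0], V2[1], V1[2], V1[3]>.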
14317     }
14318   } else if (NumV2Elements == 2) {
14319     if (Mask[0] < 4 && Mask[1] < 4) {
14320       // Handle the easy case where we have V1 in the low lanes and V2 in the
14321       // high lanes.
14322       NewMask[2] -= 4;
14323       NewMask[3] -= 4;
14324     } else if (Mask[2] < 4 && Mask[3] < 4) {
14325       // We also handle the reversed case because this utility may get called
14326       // when we detect a SHUFPS pattern but can't easily commute the shuffle to
14327       // arrange things in the right direction.
14328       NewMask[0] -= 4;
14329       NewMask[1] -= 4;
14330       HighV = V1;
14331       LowV = V2;
14332     } else {
14333       // We have a mixture of V1 and V2 in both low and high lanes. Rather than
14334       // trying to place elements directly, just blend them and set up the final
14335       // shuffle to place them.
14336 
14337       // The first two blend mask elements are for V1, the second two are for
14338       // V2.
14339       int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
14340                           Mask[2] < 4 ? Mask[2] : Mask[3],
14341                           (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
14342                           (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
14343       V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
14344                        getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
14345 
14346       // Now we do a normal shuffle of V1 by giving V1 as both operands to
14347       // a blend.
14348       LowV = HighV = V1;
14349       NewMask[0] = Mask[0] < 4 ? 0 : 2;
14350       NewMask[1] = Mask[0] < 4 ? 2 : 0;
14351       NewMask[2] = Mask[2] < 4 ? 1 : 3;
14352       NewMask[3] = Mask[2] < 4 ? 3 : 1;
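      // For example, Mask = <0, 4, 1, 5> first blends to <V1[0], V1[1], V2[0],
      // V2[1]> and then shuffles that result by <0, 2, 1, 3> to produce
      // <V1[0], V2[0], V1[1], V2[1]>.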
14353     }
14354   } else if (NumV2Elements == 3) {
14355     // Ideally canonicalizeShuffleMaskWithCommute should have caught this, but
14356     // we can get here via other paths (e.g. repeated mask matching) where we
14357     // don't want to do another round of lowerVECTOR_SHUFFLE.
14358     ShuffleVectorSDNode::commuteMask(NewMask);
14359     return lowerShuffleWithSHUFPS(DL, VT, NewMask, V2, V1, DAG);
14360   }
14361   return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
14362                      getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
14363 }
14364 
14365 /// Lower 4-lane 32-bit floating point shuffles.
14366 ///
14367 /// Uses instructions exclusively from the floating point unit to minimize
14368 /// domain crossing penalties, as these are sufficient to implement all v4f32
14369 /// shuffles.
14370 static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14371                                  const APInt &Zeroable, SDValue V1, SDValue V2,
14372                                  const X86Subtarget &Subtarget,
14373                                  SelectionDAG &DAG) {
14374   assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
14375   assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
14376   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
14377 
14378   int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
14379 
14380   if (NumV2Elements == 0) {
14381     // Check for being able to broadcast a single element.
14382     if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2,
14383                                                     Mask, Subtarget, DAG))
14384       return Broadcast;
14385 
14386     // Use even/odd duplicate instructions for masks that match their pattern.
14387     if (Subtarget.hasSSE3()) {
14388       if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
14389         return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
14390       if (isShuffleEquivalent(Mask, {1, 1, 3, 3}, V1, V2))
14391         return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
14392     }
14393 
14394     if (Subtarget.hasAVX()) {
14395       // If we have AVX, we can use VPERMILPS which will allow folding a load
14396       // into the shuffle.
14397       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
14398                          getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
14399     }
14400 
14401     // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
14402     // in SSE1 because otherwise they are widened to v2f64 and never get here.
14403     if (!Subtarget.hasSSE2()) {
14404       if (isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2))
14405         return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
14406       if (isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1, V2))
14407         return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
14408     }
14409 
14410     // Otherwise, use a straight shuffle of a single input vector. We pass the
14411     // input vector to both operands to simulate this with a SHUFPS.
14412     return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
14413                        getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
14414   }
14415 
14416   if (Subtarget.hasAVX2())
14417     if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
14418       return Extract;
14419 
14420   // There are special ways we can lower some single-element blends. However, we
14421   // also have custom lowerings for more complex single-element blends below that
14422   // we defer to if both this and BLENDPS fail to match, so restrict this to the
14423   // case where the V2 input targets element 0 of the mask -- that is the fast
14424   // case here.
14425   if (NumV2Elements == 1 && Mask[0] >= 4)
14426     if (SDValue V = lowerShuffleAsElementInsertion(
14427             DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
14428       return V;
14429 
14430   if (Subtarget.hasSSE41()) {
14431     if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
14432                                             Zeroable, Subtarget, DAG))
14433       return Blend;
14434 
14435     // Use INSERTPS if we can complete the shuffle efficiently.
14436     if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
14437       return V;
14438 
14439     if (!isSingleSHUFPSMask(Mask))
14440       if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,
14441                                                             V2, Mask, DAG))
14442         return BlendPerm;
14443   }
14444 
14445   // Use low/high mov instructions. These are only valid in SSE1 because
14446   // otherwise they are widened to v2f64 and never get here.
14447   if (!Subtarget.hasSSE2()) {
14448     if (isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2))
14449       return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
14450     if (isShuffleEquivalent(Mask, {2, 3, 6, 7}, V1, V2))
14451       return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
14452   }
14453 
14454   // Use dedicated unpack instructions for masks that match their pattern.
14455   if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
14456     return V;
14457 
14458   // Otherwise fall back to a SHUFPS lowering strategy.
14459   return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
14460 }
14461 
14462 /// Lower 4-lane i32 vector shuffles.
14463 ///
14464 /// We try to handle these with integer-domain shuffles where we can, but for
14465 /// blends we use the floating point domain blend instructions.
14466 static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14467                                  const APInt &Zeroable, SDValue V1, SDValue V2,
14468                                  const X86Subtarget &Subtarget,
14469                                  SelectionDAG &DAG) {
14470   assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
14471   assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
14472   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
14473 
14474   // Whenever we can lower this as a zext, that instruction is strictly faster
14475   // than any alternative. It also allows us to fold memory operands into the
14476   // shuffle in many cases.
14477   if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask,
14478                                                    Zeroable, Subtarget, DAG))
14479     return ZExt;
14480 
14481   int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
14482 
14483   if (NumV2Elements == 0) {
14484     // Try to use broadcast unless the mask only has one non-undef element.
14485     if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
14486       if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
14487                                                       Mask, Subtarget, DAG))
14488         return Broadcast;
14489     }
14490 
14491     // Straight shuffle of a single input vector. For everything from SSE2
14492     // onward this has a single fast instruction with no scary immediates.
14493     // We coerce the shuffle pattern to be compatible with UNPCK instructions
14494     // but we aren't actually going to use the UNPCK instruction because doing
14495     // so prevents folding a load into this instruction or making a copy.
14496     const int UnpackLoMask[] = {0, 0, 1, 1};
14497     const int UnpackHiMask[] = {2, 2, 3, 3};
14498     if (isShuffleEquivalent(Mask, {0, 0, 1, 1}, V1, V2))
14499       Mask = UnpackLoMask;
14500     else if (isShuffleEquivalent(Mask, {2, 2, 3, 3}, V1, V2))
14501       Mask = UnpackHiMask;
14502 
14503     return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
14504                        getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
14505   }
14506 
14507   if (Subtarget.hasAVX2())
14508     if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
14509       return Extract;
14510 
14511   // Try to use shift instructions.
14512   if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
14513                                           Zeroable, Subtarget, DAG))
14514     return Shift;
14515 
14516   // There are special ways we can lower some single-element blends.
14517   if (NumV2Elements == 1)
14518     if (SDValue V = lowerShuffleAsElementInsertion(
14519             DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
14520       return V;
14521 
14522   // We have different paths for blend lowering, but they all must use the
14523   // *exact* same predicate.
14524   bool IsBlendSupported = Subtarget.hasSSE41();
14525   if (IsBlendSupported)
14526     if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
14527                                             Zeroable, Subtarget, DAG))
14528       return Blend;
14529 
14530   if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
14531                                              Zeroable, Subtarget, DAG))
14532     return Masked;
14533 
14534   // Use dedicated unpack instructions for masks that match their pattern.
14535   if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
14536     return V;
14537 
14538   // Try to use byte rotation instructions.
14539   // It's more profitable for pre-SSSE3 targets to use shuffles/unpacks.
14540   if (Subtarget.hasSSSE3()) {
14541     if (Subtarget.hasVLX())
14542       if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
14543                                                 Subtarget, DAG))
14544         return Rotate;
14545 
14546     if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
14547                                                   Subtarget, DAG))
14548       return Rotate;
14549   }
14550 
14551   // Assume that a single SHUFPS is faster than an alternative sequence of
14552   // multiple instructions (even if the CPU has a domain penalty).
14553   // If some CPU is harmed by the domain switch, we can fix it in a later pass.
14554   if (!isSingleSHUFPSMask(Mask)) {
14555     // If we have direct support for blends, we should lower by decomposing into
14556     // a permute. That will be faster than the domain cross.
14557     if (IsBlendSupported)
14558       return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask,
14559                                                   Subtarget, DAG);
14560 
14561     // Try to lower by permuting the inputs into an unpack instruction.
14562     if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
14563                                                         Mask, Subtarget, DAG))
14564       return Unpack;
14565   }
14566 
14567   // We implement this with SHUFPS because it can blend from two vectors.
14568   // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
14569   // up the inputs, bypassing domain shift penalties that we would incur if we
14570   // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
14571   // relevant.
14572   SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
14573   SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
14574   SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
14575   return DAG.getBitcast(MVT::v4i32, ShufPS);
14576 }
14577 
14578 /// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
14579 /// shuffle lowering, and the most complex part.
14580 ///
14581 /// The lowering strategy is to try to form pairs of input lanes which are
14582 /// targeted at the same half of the final vector, and then use a dword shuffle
14583 /// to place them onto the right half, and finally unpack the paired lanes into
14584 /// their final position.
14585 ///
14586 /// The exact breakdown of how to form these dword pairs and align them on the
14587 /// correct sides is really tricky. See the comments within the function for
14588 /// more of the details.
14589 ///
14590 /// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
14591 /// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
14592 /// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
14593 /// vector, form the analogous 128-bit 8-element Mask.
14594 static SDValue lowerV8I16GeneralSingleInputShuffle(
14595     const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
14596     const X86Subtarget &Subtarget, SelectionDAG &DAG) {
14597   assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
14598   MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
14599 
14600   assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
14601   MutableArrayRef<int> LoMask = Mask.slice(0, 4);
14602   MutableArrayRef<int> HiMask = Mask.slice(4, 4);
14603 
14604   // Attempt to directly match PSHUFLW or PSHUFHW.
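  // E.g. <3, 2, 1, 0, 4, 5, 6, 7> is a PSHUFLW with immediate mask <3, 2, 1, 0>
  // and <0, 1, 2, 3, 7, 6, 5, 4> is a PSHUFHW once its high half is rebased to
  // <3, 2, 1, 0>.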
14605   if (isUndefOrInRange(LoMask, 0, 4) &&
14606       isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
14607     return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
14608                        getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
14609   }
14610   if (isUndefOrInRange(HiMask, 4, 8) &&
14611       isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
14612     for (int i = 0; i != 4; ++i)
14613       HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
14614     return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
14615                        getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
14616   }
14617 
14618   SmallVector<int, 4> LoInputs;
14619   copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
14620   array_pod_sort(LoInputs.begin(), LoInputs.end());
14621   LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
14622   SmallVector<int, 4> HiInputs;
14623   copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
14624   array_pod_sort(HiInputs.begin(), HiInputs.end());
14625   HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
14626   int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
14627   int NumHToL = LoInputs.size() - NumLToL;
14628   int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
14629   int NumHToH = HiInputs.size() - NumLToH;
14630   MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
14631   MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
14632   MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
14633   MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
14634 
14635   // If we are shuffling values from only one half, check how many different DWORD
14636   // pairs we need to create. If only 1 or 2 then we can perform this as a
14637   // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
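  // E.g. the mask <2, 3, 0, 1, 2, 3, 0, 1> can be handled here as PSHUFLW
  // <2, 3, 0, 1> followed by PSHUFD <0, 1, 0, 1>.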
14638   auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
14639                                ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
14640     V = DAG.getNode(ShufWOp, DL, VT, V,
14641                     getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
14642     V = DAG.getBitcast(PSHUFDVT, V);
14643     V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
14644                     getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
14645     return DAG.getBitcast(VT, V);
14646   };
14647 
14648   if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
14649     int PSHUFDMask[4] = { -1, -1, -1, -1 };
14650     SmallVector<std::pair<int, int>, 4> DWordPairs;
14651     int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
14652 
14653     // Collect the different DWORD pairs.
14654     for (int DWord = 0; DWord != 4; ++DWord) {
14655       int M0 = Mask[2 * DWord + 0];
14656       int M1 = Mask[2 * DWord + 1];
14657       M0 = (M0 >= 0 ? M0 % 4 : M0);
14658       M1 = (M1 >= 0 ? M1 % 4 : M1);
14659       if (M0 < 0 && M1 < 0)
14660         continue;
14661 
14662       bool Match = false;
14663       for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
14664         auto &DWordPair = DWordPairs[j];
14665         if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
14666             (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
14667           DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
14668           DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
14669           PSHUFDMask[DWord] = DOffset + j;
14670           Match = true;
14671           break;
14672         }
14673       }
14674       if (!Match) {
14675         PSHUFDMask[DWord] = DOffset + DWordPairs.size();
14676         DWordPairs.push_back(std::make_pair(M0, M1));
14677       }
14678     }
14679 
14680     if (DWordPairs.size() <= 2) {
14681       DWordPairs.resize(2, std::make_pair(-1, -1));
14682       int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
14683                               DWordPairs[1].first, DWordPairs[1].second};
14684       if ((NumHToL + NumHToH) == 0)
14685         return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
14686       if ((NumLToL + NumLToH) == 0)
14687         return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
14688     }
14689   }
14690 
14691   // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
14692   // such inputs we can swap two of the dwords across the half mark and end up
14693   // with <=2 inputs to each half in each half. Once there, we can fall through
14694   // to the generic code below. For example:
14695   //
14696   // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
14697   // Mask:  [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
14698   //
14699   // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
14700   // and an existing 2-into-2 on the other half. In this case we may have to
14701   // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
14702   // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
14703   // Fortunately, we don't have to handle anything but a 2-into-2 pattern
14704   // because any other situation (including a 3-into-1 or 1-into-3 in the other
14705   // half than the one we target for fixing) will be fixed when we re-enter this
14706   // path. We will also combine away any sequence of PSHUFD instructions that
14707   // result into a single instruction. Here is an example of the tricky case:
14708   //
14709   // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
14710   // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
14711   //
14712   // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
14713   //
14714   // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
14715   // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
14716   //
14717   // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
14718   // Mask:  [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
14719   //
14720   // The result is fine to be handled by the generic logic.
14721   auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
14722                           ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
14723                           int AOffset, int BOffset) {
14724     assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
14725            "Must call this with A having 3 or 1 inputs from the A half.");
14726     assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
14727            "Must call this with B having 1 or 3 inputs from the B half.");
14728     assert(AToAInputs.size() + BToAInputs.size() == 4 &&
14729            "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
14730 
14731     bool ThreeAInputs = AToAInputs.size() == 3;
14732 
14733     // Compute the index of dword with only one word among the three inputs in
14734     // a half by taking the sum of the half with three inputs and subtracting
14735     // the sum of the actual three inputs. The difference is the remaining
14736     // slot.
14737     int ADWord = 0, BDWord = 0;
14738     int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
14739     int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
14740     int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
14741     ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
14742     int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
14743     int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
14744     int TripleNonInputIdx =
14745         TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
14746     TripleDWord = TripleNonInputIdx / 2;
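    // E.g. with inputs {0, 1, 3} and an offset of 0, the four slots sum to 6
    // and the inputs sum to 4, so word 2 is the remaining non-input slot and
    // the dword holding just one of the three inputs is dword 1.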
14747 
14748     // We use xor with one to compute the adjacent DWord to whichever one the
14749     // OneInput is in.
14750     OneInputDWord = (OneInput / 2) ^ 1;
14751 
14752     // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
14753     // and BToA inputs. If there is also such a problem with the BToB and AToB
14754     // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
14755     // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
14756     // is essential that we don't *create* a 3<-1 as then we might oscillate.
14757     if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
14758       // Compute how many inputs will be flipped by swapping these DWords. We
14759       // need to balance this to ensure we don't form a 3-1 shuffle in the other
14760       // half.
14762       int NumFlippedAToBInputs =
14763           std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
14764           std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
14765       int NumFlippedBToBInputs =
14766           std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
14767           std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
14768       if ((NumFlippedAToBInputs == 1 &&
14769            (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
14770           (NumFlippedBToBInputs == 1 &&
14771            (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
14772         // We choose whether to fix the A half or B half based on whether that
14773         // half has zero flipped inputs. At zero, we may not be able to fix it
14774         // with that half. We also bias towards fixing the B half because that
14775         // will more commonly be the high half, and we have to bias one way.
14776         auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
14777                                                        ArrayRef<int> Inputs) {
14778           int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
14779           bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
14780           // Determine whether the free index is in the flipped dword or the
14781           // unflipped dword based on where the pinned index is. We use this bit
14782           // in an xor to conditionally select the adjacent dword.
14783           int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
14784           bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
14785           if (IsFixIdxInput == IsFixFreeIdxInput)
14786             FixFreeIdx += 1;
14787           IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
14788           assert(IsFixIdxInput != IsFixFreeIdxInput &&
14789                  "We need to be changing the number of flipped inputs!");
14790           int PSHUFHalfMask[] = {0, 1, 2, 3};
14791           std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
14792           V = DAG.getNode(
14793               FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
14794               MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
14795               getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
14796 
14797           for (int &M : Mask)
14798             if (M >= 0 && M == FixIdx)
14799               M = FixFreeIdx;
14800             else if (M >= 0 && M == FixFreeIdx)
14801               M = FixIdx;
14802         };
14803         if (NumFlippedBToBInputs != 0) {
14804           int BPinnedIdx =
14805               BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
14806           FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
14807         } else {
14808           assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
14809           int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
14810           FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
14811         }
14812       }
14813     }
14814 
14815     int PSHUFDMask[] = {0, 1, 2, 3};
14816     PSHUFDMask[ADWord] = BDWord;
14817     PSHUFDMask[BDWord] = ADWord;
14818     V = DAG.getBitcast(
14819         VT,
14820         DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
14821                     getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14822 
14823     // Adjust the mask to match the new locations of A and B.
14824     for (int &M : Mask)
14825       if (M >= 0 && M/2 == ADWord)
14826         M = 2 * BDWord + M % 2;
14827       else if (M >= 0 && M/2 == BDWord)
14828         M = 2 * ADWord + M % 2;
14829 
14830     // Recurse back into this routine to re-compute state now that this isn't
14831     // a 3 and 1 problem.
14832     return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);
14833   };
14834   if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
14835     return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
14836   if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
14837     return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
14838 
14839   // At this point there are at most two inputs to the low and high halves from
14840   // each half. That means the inputs can always be grouped into dwords and
14841   // those dwords can then be moved to the correct half with a dword shuffle.
14842   // We use at most one low and one high word shuffle to collect these paired
14843   // inputs into dwords, and finally a dword shuffle to place them.
14844   int PSHUFLMask[4] = {-1, -1, -1, -1};
14845   int PSHUFHMask[4] = {-1, -1, -1, -1};
14846   int PSHUFDMask[4] = {-1, -1, -1, -1};
14847 
14848   // First fix the masks for all the inputs that are staying in their
14849   // original halves. This will then dictate the targets of the cross-half
14850   // shuffles.
14851   auto fixInPlaceInputs =
14852       [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
14853                     MutableArrayRef<int> SourceHalfMask,
14854                     MutableArrayRef<int> HalfMask, int HalfOffset) {
14855     if (InPlaceInputs.empty())
14856       return;
14857     if (InPlaceInputs.size() == 1) {
14858       SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
14859           InPlaceInputs[0] - HalfOffset;
14860       PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
14861       return;
14862     }
14863     if (IncomingInputs.empty()) {
14864       // Just fix all of the in place inputs.
14865       for (int Input : InPlaceInputs) {
14866         SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
14867         PSHUFDMask[Input / 2] = Input / 2;
14868       }
14869       return;
14870     }
14871 
14872     assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
14873     SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
14874         InPlaceInputs[0] - HalfOffset;
14875     // Put the second input next to the first so that they are packed into
14876     // a dword. We find the adjacent index by toggling the low bit.
14877     int AdjIndex = InPlaceInputs[0] ^ 1;
14878     SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
14879     std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
14880     PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
14881   };
14882   fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
14883   fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
14884 
14885   // Now gather the cross-half inputs and place them into a free dword of
14886   // their target half.
14887   // FIXME: This operation could almost certainly be simplified dramatically to
14888   // look more like the 3-1 fixing operation.
14889   auto moveInputsToRightHalf = [&PSHUFDMask](
14890       MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
14891       MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
14892       MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
14893       int DestOffset) {
14894     auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
14895       return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
14896     };
14897     auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
14898                                                int Word) {
14899       int LowWord = Word & ~1;
14900       int HighWord = Word | 1;
14901       return isWordClobbered(SourceHalfMask, LowWord) ||
14902              isWordClobbered(SourceHalfMask, HighWord);
14903     };
14904 
14905     if (IncomingInputs.empty())
14906       return;
14907 
14908     if (ExistingInputs.empty()) {
14909       // Map any dwords with inputs from them into the right half.
14910       for (int Input : IncomingInputs) {
14911         // If the source half mask maps over the inputs, turn those into
14912         // swaps and use the swapped lane.
14913         if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
14914           if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
14915             SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
14916                 Input - SourceOffset;
14917             // We have to swap the uses in our half mask in one sweep.
14918             for (int &M : HalfMask)
14919               if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
14920                 M = Input;
14921               else if (M == Input)
14922                 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
14923           } else {
14924             assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
14925                        Input - SourceOffset &&
14926                    "Previous placement doesn't match!");
14927           }
14928           // Note that this correctly re-maps both when we do a swap and when
14929           // we observe the other side of the swap above. We rely on that to
14930           // avoid swapping the members of the input list directly.
14931           Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
14932         }
14933 
14934         // Map the input's dword into the correct half.
14935         if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
14936           PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
14937         else
14938           assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
14939                      Input / 2 &&
14940                  "Previous placement doesn't match!");
14941       }
14942 
14943       // And just directly shift any other-half mask elements to be same-half
14944       // as we will have mirrored the dword containing the element into the
14945       // same position within that half.
14946       for (int &M : HalfMask)
14947         if (M >= SourceOffset && M < SourceOffset + 4) {
14948           M = M - SourceOffset + DestOffset;
14949           assert(M >= 0 && "This should never wrap below zero!");
14950         }
14951       return;
14952     }
14953 
14954     // Ensure we have the input in a viable dword of its current half. This
14955     // is particularly tricky because the original position may be clobbered
14956     // by inputs being moved and *staying* in that half.
14957     if (IncomingInputs.size() == 1) {
14958       if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
14959         int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
14960                          SourceOffset;
14961         SourceHalfMask[InputFixed - SourceOffset] =
14962             IncomingInputs[0] - SourceOffset;
14963         std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
14964                      InputFixed);
14965         IncomingInputs[0] = InputFixed;
14966       }
14967     } else if (IncomingInputs.size() == 2) {
14968       if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
14969           isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
14970         // We have two non-adjacent or clobbered inputs we need to extract from
14971         // the source half. To do this, we need to map them into some adjacent
14972         // dword slot in the source mask.
14973         int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
14974                               IncomingInputs[1] - SourceOffset};
14975 
14976         // If there is a free slot in the source half mask adjacent to one of
14977         // the inputs, place the other input in it. We use (Index XOR 1) to
14978         // compute an adjacent index.
14979         if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
14980             SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
14981           SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
14982           SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
14983           InputsFixed[1] = InputsFixed[0] ^ 1;
14984         } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
14985                    SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
14986           SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
14987           SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
14988           InputsFixed[0] = InputsFixed[1] ^ 1;
14989         } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
14990                    SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
14991           // The two inputs are in the same DWord but it is clobbered and the
14992           // adjacent DWord isn't used at all. Move both inputs to the free
14993           // slot.
14994           SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
14995           SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
14996           InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
14997           InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
14998         } else {
14999           // The only way we hit this point is if there is no clobbering
15000           // (because there are no off-half inputs to this half) and there is no
15001           // free slot adjacent to one of the inputs. In this case, we have to
15002           // swap an input with a non-input.
15003           for (int i = 0; i < 4; ++i)
15004             assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
15005                    "We can't handle any clobbers here!");
15006           assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
15007                  "Cannot have adjacent inputs here!");
15008 
15009           SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
15010           SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
15011 
15012           // We also have to update the final source mask in this case because
15013           // it may need to undo the above swap.
15014           for (int &M : FinalSourceHalfMask)
15015             if (M == (InputsFixed[0] ^ 1) + SourceOffset)
15016               M = InputsFixed[1] + SourceOffset;
15017             else if (M == InputsFixed[1] + SourceOffset)
15018               M = (InputsFixed[0] ^ 1) + SourceOffset;
15019 
15020           InputsFixed[1] = InputsFixed[0] ^ 1;
15021         }
15022 
15023         // Point everything at the fixed inputs.
15024         for (int &M : HalfMask)
15025           if (M == IncomingInputs[0])
15026             M = InputsFixed[0] + SourceOffset;
15027           else if (M == IncomingInputs[1])
15028             M = InputsFixed[1] + SourceOffset;
15029 
15030         IncomingInputs[0] = InputsFixed[0] + SourceOffset;
15031         IncomingInputs[1] = InputsFixed[1] + SourceOffset;
15032       }
15033     } else {
15034       llvm_unreachable("Unhandled input size!");
15035     }
15036 
15037     // Now hoist the DWord down to the right half.
15038     int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
15039     assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
15040     PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
15041     for (int &M : HalfMask)
15042       for (int Input : IncomingInputs)
15043         if (M == Input)
15044           M = FreeDWord * 2 + Input % 2;
15045   };
15046   moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
15047                         /*SourceOffset*/ 4, /*DestOffset*/ 0);
15048   moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
15049                         /*SourceOffset*/ 0, /*DestOffset*/ 4);
15050 
15051   // Now enact all the shuffles we've computed to move the inputs into their
15052   // target half.
15053   if (!isNoopShuffleMask(PSHUFLMask))
15054     V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
15055                     getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
15056   if (!isNoopShuffleMask(PSHUFHMask))
15057     V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
15058                     getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
15059   if (!isNoopShuffleMask(PSHUFDMask))
15060     V = DAG.getBitcast(
15061         VT,
15062         DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
15063                     getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
15064 
15065   // At this point, each half should contain all its inputs, and we can then
15066   // just shuffle them into their final position.
15067   assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
15068          "Failed to lift all the high half inputs to the low mask!");
15069   assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
15070          "Failed to lift all the low half inputs to the high mask!");
15071 
15072   // Do a half shuffle for the low mask.
15073   if (!isNoopShuffleMask(LoMask))
15074     V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
15075                     getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
15076 
15077   // Do a half shuffle with the high mask after shifting its values down.
15078   for (int &M : HiMask)
15079     if (M >= 0)
15080       M -= 4;
15081   if (!isNoopShuffleMask(HiMask))
15082     V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
15083                     getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
15084 
15085   return V;
15086 }
15087 
15088 /// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
15089 /// blend if only one input is used.
15090 static SDValue lowerShuffleAsBlendOfPSHUFBs(
15091     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15092     const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
15093   assert(!is128BitLaneCrossingShuffleMask(VT, Mask) &&
15094          "Lane crossing shuffle masks not supported");
15095 
15096   int NumBytes = VT.getSizeInBits() / 8;
15097   int Size = Mask.size();
15098   int Scale = NumBytes / Size;
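  // E.g. a v8i16 shuffle lowered via v16i8 PSHUFBs has Scale == 2, so mask
  // element 9 expands to byte selectors <2, 3> for V2 and to <0x80, 0x80>
  // (zeroing selectors) for V1.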
15099 
15100   SmallVector<SDValue, 64> V1Mask(NumBytes, DAG.getUNDEF(MVT::i8));
15101   SmallVector<SDValue, 64> V2Mask(NumBytes, DAG.getUNDEF(MVT::i8));
15102   V1InUse = false;
15103   V2InUse = false;
15104 
15105   for (int i = 0; i < NumBytes; ++i) {
15106     int M = Mask[i / Scale];
15107     if (M < 0)
15108       continue;
15109 
15110     const int ZeroMask = 0x80;
15111     int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
15112     int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
15113     if (Zeroable[i / Scale])
15114       V1Idx = V2Idx = ZeroMask;
15115 
15116     V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
15117     V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
15118     V1InUse |= (ZeroMask != V1Idx);
15119     V2InUse |= (ZeroMask != V2Idx);
15120   }
15121 
15122   MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes);
15123   if (V1InUse)
15124     V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1),
15125                      DAG.getBuildVector(ShufVT, DL, V1Mask));
15126   if (V2InUse)
15127     V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2),
15128                      DAG.getBuildVector(ShufVT, DL, V2Mask));
15129 
15130   // If we need shuffled inputs from both, blend the two.
15131   SDValue V;
15132   if (V1InUse && V2InUse)
15133     V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2);
15134   else
15135     V = V1InUse ? V1 : V2;
15136 
15137   // Cast the result back to the correct type.
15138   return DAG.getBitcast(VT, V);
15139 }
15140 
15141 /// Generic lowering of 8-lane i16 shuffles.
15142 ///
15143 /// This handles both single-input shuffles and combined shuffle/blends with
15144 /// two inputs. The single input shuffles are immediately delegated to
15145 /// a dedicated lowering routine.
15146 ///
15147 /// The blends are lowered in one of three fundamental ways. If there are few
15148 /// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
15149 /// of the input is significantly cheaper when lowered as an interleaving of
15150 /// the two inputs, try to interleave them. Otherwise, blend the low and high
15151 /// halves of the inputs separately (making them have relatively few inputs)
15152 /// and then concatenate them.
15153 static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15154                                  const APInt &Zeroable, SDValue V1, SDValue V2,
15155                                  const X86Subtarget &Subtarget,
15156                                  SelectionDAG &DAG) {
15157   assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
15158   assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
15159   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
15160 
15161   // Whenever we can lower this as a zext, that instruction is strictly faster
15162   // than any alternative.
15163   if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask,
15164                                                    Zeroable, Subtarget, DAG))
15165     return ZExt;
15166 
15167   // Try to lower using a truncation.
15168   if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
15169                                         Subtarget, DAG))
15170     return V;
15171 
15172   int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
15173 
15174   if (NumV2Inputs == 0) {
15175     // Try to use shift instructions.
15176     if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
15177                                             Zeroable, Subtarget, DAG))
15178       return Shift;
15179 
15180     // Check for being able to broadcast a single element.
15181     if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
15182                                                     Mask, Subtarget, DAG))
15183       return Broadcast;
15184 
15185     // Try to use bit rotation instructions.
15186     if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,
15187                                                  Subtarget, DAG))
15188       return Rotate;
15189 
15190     // Use dedicated unpack instructions for masks that match their pattern.
15191     if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
15192       return V;
15193 
15194     // Use dedicated pack instructions for masks that match their pattern.
15195     if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
15196                                          Subtarget))
15197       return V;
15198 
15199     // Try to use byte rotation instructions.
15200     if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask,
15201                                                   Subtarget, DAG))
15202       return Rotate;
15203 
15204     // Make a copy of the mask so it can be modified.
15205     SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
15206     return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask,
15207                                                Subtarget, DAG);
15208   }
15209 
15210   assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
15211          "All single-input shuffles should be canonicalized to be V1-input "
15212          "shuffles.");
15213 
15214   // Try to use shift instructions.
15215   if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
15216                                           Zeroable, Subtarget, DAG))
15217     return Shift;
15218 
15219   // See if we can use SSE4A Extraction / Insertion.
15220   if (Subtarget.hasSSE4A())
15221     if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
15222                                           Zeroable, DAG))
15223       return V;
15224 
15225   // There are special ways we can lower some single-element blends.
15226   if (NumV2Inputs == 1)
15227     if (SDValue V = lowerShuffleAsElementInsertion(
15228             DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
15229       return V;
15230 
15231   // We have different paths for blend lowering, but they all must use the
15232   // *exact* same predicate.
15233   bool IsBlendSupported = Subtarget.hasSSE41();
15234   if (IsBlendSupported)
15235     if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
15236                                             Zeroable, Subtarget, DAG))
15237       return Blend;
15238 
15239   if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
15240                                              Zeroable, Subtarget, DAG))
15241     return Masked;
15242 
15243   // Use dedicated unpack instructions for masks that match their pattern.
15244   if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
15245     return V;
15246 
15247   // Use dedicated pack instructions for masks that match their pattern.
15248   if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
15249                                        Subtarget))
15250     return V;
15251 
15252   // Try to lower using a truncation.
15253   if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
15254                                        Subtarget, DAG))
15255     return V;
15256 
15257   // Try to use byte rotation instructions.
15258   if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
15259                                                 Subtarget, DAG))
15260     return Rotate;
15261 
15262   if (SDValue BitBlend =
15263           lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
15264     return BitBlend;
15265 
15266   // Try to use byte shift instructions to mask.
15267   if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask,
15268                                               Zeroable, Subtarget, DAG))
15269     return V;
15270 
15271   // Attempt to lower using compaction; SSE41 is necessary for PACKUSDW.
15272   // We could use SIGN_EXTEND_INREG+PACKSSDW for older targets but this seems to
15273   // be slower than a PSHUFLW+PSHUFHW+PSHUFD chain.
15274   int NumEvenDrops = canLowerByDroppingEvenElements(Mask, false);
15275   if ((NumEvenDrops == 1 || NumEvenDrops == 2) && Subtarget.hasSSE41() &&
15276       !Subtarget.hasVLX()) {
15277     SmallVector<SDValue, 8> DWordClearOps(4, DAG.getConstant(0, DL, MVT::i32));
15278     for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
15279       DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32);
15280     SDValue DWordClearMask = DAG.getBuildVector(MVT::v4i32, DL, DWordClearOps);
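    // E.g. with NumEvenDrops == 1 every dword keeps only its low 16 bits, so
    // the PACKUS below gathers the even words of V1 and V2 without saturating.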
15281     V1 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V1),
15282                      DWordClearMask);
15283     V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2),
15284                      DWordClearMask);
15285     // Now pack things back together.
15286     SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, V1, V2);
15287     if (NumEvenDrops == 2) {
15288       Result = DAG.getBitcast(MVT::v4i32, Result);
15289       Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, Result, Result);
15290     }
15291     return Result;
15292   }
15293 
15294   // Try to lower by permuting the inputs into an unpack instruction.
15295   if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
15296                                                       Mask, Subtarget, DAG))
15297     return Unpack;
15298 
15299   // If we can't directly blend but can use PSHUFB, that will be better as it
15300   // can both shuffle and set up the inefficient blend.
15301   if (!IsBlendSupported && Subtarget.hasSSSE3()) {
15302     bool V1InUse, V2InUse;
15303     return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
15304                                         Zeroable, DAG, V1InUse, V2InUse);
15305   }
15306 
15307   // We can always bit-blend if we have to so the fallback strategy is to
15308   // decompose into single-input permutes and blends/unpacks.
15309   return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2,
15310                                               Mask, Subtarget, DAG);
15311 }
15312 
15313 // Lowers a unary/binary shuffle as VPERMV/VPERMV3. For non-VLX targets,
15314 // sub-512-bit shuffles are padded to 512 bits for the shuffle and then
15315 // the active subvector is extracted.
15316 static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
15317                                      ArrayRef<int> Mask, SDValue V1, SDValue V2,
15318                                      const X86Subtarget &Subtarget,
15319                                      SelectionDAG &DAG) {
15320   MVT MaskVT = VT.changeTypeToInteger();
15321   SDValue MaskNode;
15322   MVT ShuffleVT = VT;
15323   if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
15324     V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512);
15325     V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512);
15326     ShuffleVT = V1.getSimpleValueType();
15327 
15328     // Adjust mask to correct indices for the second input.
15329     int NumElts = VT.getVectorNumElements();
15330     unsigned Scale = 512 / VT.getSizeInBits();
15331     SmallVector<int, 32> AdjustedMask(Mask.begin(), Mask.end());
15332     for (int &M : AdjustedMask)
15333       if (NumElts <= M)
15334         M += (Scale - 1) * NumElts;
15335     MaskNode = getConstVector(AdjustedMask, MaskVT, DAG, DL, true);
15336     MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512);
15337   } else {
15338     MaskNode = getConstVector(Mask, MaskVT, DAG, DL, true);
15339   }
15340 
15341   SDValue Result;
15342   if (V2.isUndef())
15343     Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1);
15344   else
15345     Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2);
15346 
15347   if (VT != ShuffleVT)
15348     Result = extractSubVector(Result, 0, DAG, DL, VT.getSizeInBits());
15349 
15350   return Result;
15351 }
15352 
15353 /// Generic lowering of v16i8 shuffles.
15354 ///
15355 /// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
15356 /// detect any complexity reducing interleaving. If that doesn't help, it uses
15357 /// UNPCK to spread the i8 elements across two i16-element vectors, and uses
15358 /// the existing lowering for v8i16 blends on each half, finally PACK-ing them
15359 /// back together.
15360 static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15361                                  const APInt &Zeroable, SDValue V1, SDValue V2,
15362                                  const X86Subtarget &Subtarget,
15363                                  SelectionDAG &DAG) {
15364   assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
15365   assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
15366   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
15367 
15368   // Try to use shift instructions.
15369   if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
15370                                           Zeroable, Subtarget, DAG))
15371     return Shift;
15372 
15373   // Try to use byte rotation instructions.
15374   if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
15375                                                 Subtarget, DAG))
15376     return Rotate;
15377 
15378   // Use dedicated pack instructions for masks that match their pattern.
15379   if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG,
15380                                        Subtarget))
15381     return V;
15382 
15383   // Try to use a zext lowering.
15384   if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
15385                                                    Zeroable, Subtarget, DAG))
15386     return ZExt;
15387 
15388   // Try to lower using a truncation.
15389   if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
15390                                         Subtarget, DAG))
15391     return V;
15392 
15393   if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
15394                                        Subtarget, DAG))
15395     return V;
15396 
15397   // See if we can use SSE4A Extraction / Insertion.
15398   if (Subtarget.hasSSE4A())
15399     if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
15400                                           Zeroable, DAG))
15401       return V;
15402 
15403   int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
15404 
15405   // For single-input shuffles, there are some nicer lowering tricks we can use.
15406   if (NumV2Elements == 0) {
15407     // Check for being able to broadcast a single element.
15408     if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
15409                                                     Mask, Subtarget, DAG))
15410       return Broadcast;
15411 
15412     // Try to use bit rotation instructions.
15413     if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,
15414                                                  Subtarget, DAG))
15415       return Rotate;
15416 
15417     if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
15418       return V;
15419 
15420     // Check whether we can widen this to an i16 shuffle by duplicating bytes.
15421     // Notably, this handles splat and partial-splat shuffles more efficiently.
15422     // However, it only makes sense if the pre-duplication shuffle simplifies
15423     // things significantly. Currently, this means we need to be able to
15424     // express the pre-duplication shuffle as an i16 shuffle.
15425     //
15426     // FIXME: We should check for other patterns which can be widened into an
15427     // i16 shuffle as well.
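    // e.g. a splat of byte 5 passes this test: the pre-duplication i16 shuffle
    // keeps word 2 (bytes 4 and 5) in place, UNPCKL of V1 with itself then
    // duplicates each byte, and a final word broadcast completes the splat.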
15428     auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
15429       for (int i = 0; i < 16; i += 2)
15430         if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
15431           return false;
15432 
15433       return true;
15434     };
15435     auto tryToWidenViaDuplication = [&]() -> SDValue {
15436       if (!canWidenViaDuplication(Mask))
15437         return SDValue();
15438       SmallVector<int, 4> LoInputs;
15439       copy_if(Mask, std::back_inserter(LoInputs),
15440               [](int M) { return M >= 0 && M < 8; });
15441       array_pod_sort(LoInputs.begin(), LoInputs.end());
15442       LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
15443                      LoInputs.end());
15444       SmallVector<int, 4> HiInputs;
15445       copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
15446       array_pod_sort(HiInputs.begin(), HiInputs.end());
15447       HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
15448                      HiInputs.end());
15449 
15450       bool TargetLo = LoInputs.size() >= HiInputs.size();
15451       ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
15452       ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
15453 
15454       int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
15455       SmallDenseMap<int, int, 8> LaneMap;
15456       for (int I : InPlaceInputs) {
15457         PreDupI16Shuffle[I/2] = I/2;
15458         LaneMap[I] = I;
15459       }
15460       int j = TargetLo ? 0 : 4, je = j + 4;
15461       for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
15462         // Check if j is already a shuffle of this input. This happens when
15463         // there are two adjacent bytes after we move the low one.
15464         if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
15465           // If we haven't yet mapped the input, search for a slot into which
15466           // we can map it.
15467           while (j < je && PreDupI16Shuffle[j] >= 0)
15468             ++j;
15469 
15470           if (j == je)
15471             // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
15472             return SDValue();
15473 
15474           // Map this input with the i16 shuffle.
15475           PreDupI16Shuffle[j] = MovingInputs[i] / 2;
15476         }
15477 
15478         // Update the lane map based on the mapping we ended up with.
15479         LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
15480       }
15481       V1 = DAG.getBitcast(
15482           MVT::v16i8,
15483           DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
15484                                DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
15485 
15486       // Unpack the bytes to form the i16s that will be shuffled into place.
15487       bool EvenInUse = false, OddInUse = false;
15488       for (int i = 0; i < 16; i += 2) {
15489         EvenInUse |= (Mask[i + 0] >= 0);
15490         OddInUse |= (Mask[i + 1] >= 0);
15491         if (EvenInUse && OddInUse)
15492           break;
15493       }
15494       V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
15495                        MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
15496                        OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));
15497 
15498       int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
15499       for (int i = 0; i < 16; ++i)
15500         if (Mask[i] >= 0) {
15501           int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
15502           assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
15503           if (PostDupI16Shuffle[i / 2] < 0)
15504             PostDupI16Shuffle[i / 2] = MappedMask;
15505           else
15506             assert(PostDupI16Shuffle[i / 2] == MappedMask &&
15507                    "Conflicting entries in the original shuffle!");
15508         }
15509       return DAG.getBitcast(
15510           MVT::v16i8,
15511           DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
15512                                DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
15513     };
15514     if (SDValue V = tryToWidenViaDuplication())
15515       return V;
15516   }
15517 
15518   if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
15519                                              Zeroable, Subtarget, DAG))
15520     return Masked;
15521 
15522   // Use dedicated unpack instructions for masks that match their pattern.
15523   if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
15524     return V;
15525 
15526   // Try to use byte shift instructions to mask.
15527   if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask,
15528                                               Zeroable, Subtarget, DAG))
15529     return V;
15530 
15531   // Check for compaction patterns.
15532   bool IsSingleInput = V2.isUndef();
15533   int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput);
15534 
15535   // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
15536   // with PSHUFB. It is important to do this before we attempt to generate any
15537   // blends but after all of the single-input lowerings. If the single input
15538   // lowerings can find an instruction sequence that is faster than a PSHUFB, we
15539   // want to preserve that and we can DAG combine any longer sequences into
15540   // a PSHUFB in the end. But once we start blending from multiple inputs,
15541   // the complexity of DAG combining bad patterns back into PSHUFB is too high,
15542   // and there are *very* few patterns that would actually be faster than the
15543   // PSHUFB approach because of its ability to zero lanes.
15544   //
15545   // If the mask is a binary compaction, we can more efficiently perform this
15546   // as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).
15547   //
15548   // FIXME: The only exceptions to the above are blends which are exact
15549   // interleavings with direct instructions supporting them. We currently don't
15550   // handle those well here.
15551   if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
15552     bool V1InUse = false;
15553     bool V2InUse = false;
15554 
15555     SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs(
15556         DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
15557 
15558     // If both V1 and V2 are in use and we can use a direct blend or an unpack,
15559     // do so. This avoids using them to handle blends-with-zero which is
15560     // important as a single pshufb is significantly faster for that.
15561     if (V1InUse && V2InUse) {
15562       if (Subtarget.hasSSE41())
15563         if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
15564                                                 Zeroable, Subtarget, DAG))
15565           return Blend;
15566 
15567       // We can use an unpack to do the blending rather than an or in some
15568       // cases. Even though the or may be (very slightly) more efficient, we
15569       // prefer this lowering because there are common cases where part of
15570       // the complexity of the shuffles goes away when we do the final blend as
15571       // an unpack.
15572       // FIXME: It might be worth trying to detect if the unpack-feeding
15573       // shuffles will both be pshufb, in which case we shouldn't bother with
15574       // this.
15575       if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(
15576               DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
15577         return Unpack;
15578 
15579       // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
15580       if (Subtarget.hasVBMI())
15581         return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, Subtarget,
15582                                      DAG);
15583 
15584       // If we have XOP we can use one VPPERM instead of multiple PSHUFBs.
15585       if (Subtarget.hasXOP()) {
15586         SDValue MaskNode = getConstVector(Mask, MVT::v16i8, DAG, DL, true);
15587         return DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, V1, V2, MaskNode);
15588       }
15589 
15590       // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
15591       // PALIGNR will be cheaper than the second PSHUFB+OR.
15592       if (SDValue V = lowerShuffleAsByteRotateAndPermute(
15593               DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
15594         return V;
15595     }
15596 
15597     return PSHUFB;
15598   }
15599 
15600   // There are special ways we can lower some single-element blends.
15601   if (NumV2Elements == 1)
15602     if (SDValue V = lowerShuffleAsElementInsertion(
15603             DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
15604       return V;
15605 
15606   if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
15607     return Blend;
15608 
15609   // Check whether a compaction lowering can be done. This handles shuffles
15610   // which take every Nth element for some even N. See the helper function for
15611   // details.
15612   //
15613   // We special case these as they can be particularly efficiently handled with
15614   // the PACKUSWB instruction on x86 and they show up in common patterns of
15615   // rearranging bytes to truncate wide elements.
15616   if (NumEvenDrops) {
15617     // NumEvenDrops is the log2 of the stride between the elements we keep.
15618     // Another way of thinking about it is that we need to drop the even
15619     // elements this many times to get the original input.
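    // e.g. for Mask = <0,2,4,...,30> (NumEvenDrops == 1) every word of both
    // inputs is ANDed with 0x00FF and a single PACKUSWB keeps the low byte of
    // each word, i.e. exactly the even-indexed source bytes.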
15620 
15621     // First we need to zero all the dropped bytes.
15622     assert(NumEvenDrops <= 3 &&
15623            "No support for dropping even elements more than 3 times.");
15624     SmallVector<SDValue, 8> WordClearOps(8, DAG.getConstant(0, DL, MVT::i16));
15625     for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
15626       WordClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i16);
15627     SDValue WordClearMask = DAG.getBuildVector(MVT::v8i16, DL, WordClearOps);
15628     V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V1),
15629                      WordClearMask);
15630     if (!IsSingleInput)
15631       V2 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V2),
15632                        WordClearMask);
15633 
15634     // Now pack things back together.
15635     SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
15636                                  IsSingleInput ? V1 : V2);
15637     for (int i = 1; i < NumEvenDrops; ++i) {
15638       Result = DAG.getBitcast(MVT::v8i16, Result);
15639       Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
15640     }
15641     return Result;
15642   }
15643 
15644   // Handle multi-input cases by blending/unpacking single-input shuffles.
15645   if (NumV2Elements > 0)
15646     return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask,
15647                                                 Subtarget, DAG);
15648 
15649   // The fallback path for single-input shuffles widens this into two v8i16
15650   // vectors with unpacks, shuffles those, and then pulls them back together
15651   // with a pack.
15652   SDValue V = V1;
15653 
15654   std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
15655   std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
15656   for (int i = 0; i < 16; ++i)
15657     if (Mask[i] >= 0)
15658       (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
15659 
15660   SDValue VLoHalf, VHiHalf;
15661   // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
15662   // them out and avoid using UNPCK{L,H} to extract the elements of V as
15663   // i16s.
15664   if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
15665       none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
15666     // Use a mask to drop the high bytes.
15667     VLoHalf = DAG.getBitcast(MVT::v8i16, V);
15668     VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
15669                           DAG.getConstant(0x00FF, DL, MVT::v8i16));
15670 
15671     // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
15672     VHiHalf = DAG.getUNDEF(MVT::v8i16);
15673 
15674     // Squash the masks to point directly into VLoHalf.
15675     for (int &M : LoBlendMask)
15676       if (M >= 0)
15677         M /= 2;
15678     for (int &M : HiBlendMask)
15679       if (M >= 0)
15680         M /= 2;
15681   } else {
15682     // Otherwise just unpack the low half of V into VLoHalf and the high half into
15683     // VHiHalf so that we can blend them as i16s.
15684     SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
15685 
15686     VLoHalf = DAG.getBitcast(
15687         MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
15688     VHiHalf = DAG.getBitcast(
15689         MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
15690   }
15691 
15692   SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
15693   SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
15694 
15695   return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
15696 }
15697 
15698 /// Dispatching routine to lower various 128-bit x86 vector shuffles.
15699 ///
15700 /// This routine breaks down the specific type of 128-bit shuffle and
15701 /// dispatches to the lowering routines accordingly.
15702 static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
15703                                   MVT VT, SDValue V1, SDValue V2,
15704                                   const APInt &Zeroable,
15705                                   const X86Subtarget &Subtarget,
15706                                   SelectionDAG &DAG) {
15707   switch (VT.SimpleTy) {
15708   case MVT::v2i64:
15709     return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15710   case MVT::v2f64:
15711     return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15712   case MVT::v4i32:
15713     return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15714   case MVT::v4f32:
15715     return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15716   case MVT::v8i16:
15717     return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15718   case MVT::v16i8:
15719     return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15720 
15721   default:
15722     llvm_unreachable("Unimplemented!");
15723   }
15724 }
15725 
15726 /// Generic routine to split vector shuffle into half-sized shuffles.
15727 ///
15728 /// This routine just extracts two subvectors, shuffles them independently, and
15729 /// then concatenates them back together. This should work effectively with all
15730 /// AVX vector shuffle types.
15731 static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
15732                                     SDValue V2, ArrayRef<int> Mask,
15733                                     SelectionDAG &DAG) {
15734   assert(VT.getSizeInBits() >= 256 &&
15735          "Only for 256-bit or wider vector shuffles!");
15736   assert(V1.getSimpleValueType() == VT && "Bad operand type!");
15737   assert(V2.getSimpleValueType() == VT && "Bad operand type!");
15738 
15739   ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
15740   ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
15741 
15742   int NumElements = VT.getVectorNumElements();
15743   int SplitNumElements = NumElements / 2;
15744   MVT ScalarVT = VT.getVectorElementType();
15745   MVT SplitVT = MVT::getVectorVT(ScalarVT, SplitNumElements);
15746 
15747   // Use splitVector/extractSubVector so that split build-vectors just build two
15748   // narrower build vectors. This helps shuffling with splats and zeros.
15749   auto SplitVector = [&](SDValue V) {
15750     SDValue LoV, HiV;
15751     std::tie(LoV, HiV) = splitVector(peekThroughBitcasts(V), DAG, DL);
15752     return std::make_pair(DAG.getBitcast(SplitVT, LoV),
15753                           DAG.getBitcast(SplitVT, HiV));
15754   };
15755 
15756   SDValue LoV1, HiV1, LoV2, HiV2;
15757   std::tie(LoV1, HiV1) = SplitVector(V1);
15758   std::tie(LoV2, HiV2) = SplitVector(V2);
15759 
15760   // Now create two 4-way blends of these half-width vectors.
15761   auto HalfBlend = [&](ArrayRef<int> HalfMask) {
15762     bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
15763     SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
15764     SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
15765     SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
15766     for (int i = 0; i < SplitNumElements; ++i) {
15767       int M = HalfMask[i];
15768       if (M >= NumElements) {
15769         if (M >= NumElements + SplitNumElements)
15770           UseHiV2 = true;
15771         else
15772           UseLoV2 = true;
15773         V2BlendMask[i] = M - NumElements;
15774         BlendMask[i] = SplitNumElements + i;
15775       } else if (M >= 0) {
15776         if (M >= SplitNumElements)
15777           UseHiV1 = true;
15778         else
15779           UseLoV1 = true;
15780         V1BlendMask[i] = M;
15781         BlendMask[i] = i;
15782       }
15783     }
15784 
15785     // Because the lowering happens after all combining takes place, we need to
15786     // manually combine these blend masks as much as possible so that we create
15787     // a minimal number of high-level vector shuffle nodes.
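    // e.g. for v8f32 a half-mask of <0,1,12,13> only touches LoV1 and HiV2, so
    // the remapping below collapses it to a single
    // shuffle(LoV1, HiV2, <0,1,4,5>) instead of two blends plus a shuffle.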
15788 
15789     // First try just blending the halves of V1 or V2.
15790     if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
15791       return DAG.getUNDEF(SplitVT);
15792     if (!UseLoV2 && !UseHiV2)
15793       return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
15794     if (!UseLoV1 && !UseHiV1)
15795       return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
15796 
15797     SDValue V1Blend, V2Blend;
15798     if (UseLoV1 && UseHiV1) {
15799       V1Blend =
15800         DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
15801     } else {
15802       // We only use half of V1 so map the usage down into the final blend mask.
15803       V1Blend = UseLoV1 ? LoV1 : HiV1;
15804       for (int i = 0; i < SplitNumElements; ++i)
15805         if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
15806           BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
15807     }
15808     if (UseLoV2 && UseHiV2) {
15809       V2Blend =
15810         DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
15811     } else {
15812       // We only use half of V2 so map the usage down into the final blend mask.
15813       V2Blend = UseLoV2 ? LoV2 : HiV2;
15814       for (int i = 0; i < SplitNumElements; ++i)
15815         if (BlendMask[i] >= SplitNumElements)
15816           BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
15817     }
15818     return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
15819   };
15820   SDValue Lo = HalfBlend(LoMask);
15821   SDValue Hi = HalfBlend(HiMask);
15822   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
15823 }
15824 
15825 /// Either split a vector in halves or decompose the shuffles and the
15826 /// blend/unpack.
15827 ///
15828 /// This is provided as a good fallback for many lowerings of non-single-input
15829 /// shuffles with more than one 128-bit lane. In those cases, we want to select
15830 /// between splitting the shuffle into 128-bit components and stitching those
15831 /// back together vs. extracting the single-input shuffles and blending those
15832 /// results.
15833 static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
15834                                           SDValue V2, ArrayRef<int> Mask,
15835                                           const X86Subtarget &Subtarget,
15836                                           SelectionDAG &DAG) {
15837   assert(!V2.isUndef() && "This routine must not be used to lower single-input "
15838          "shuffles as it could then recurse on itself.");
15839   int Size = Mask.size();
15840 
15841   // If this can be modeled as a broadcast of two elements followed by a blend,
15842   // prefer that lowering. This is especially important because broadcasts can
15843   // often fold with memory operands.
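  // e.g. a v4f64 mask of <0,6,0,6> broadcasts element 0 of V1 and element 2 of
  // V2, so decomposing into two broadcasts plus a blend is the better choice.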
15844   auto DoBothBroadcast = [&] {
15845     int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
15846     for (int M : Mask)
15847       if (M >= Size) {
15848         if (V2BroadcastIdx < 0)
15849           V2BroadcastIdx = M - Size;
15850         else if (M - Size != V2BroadcastIdx)
15851           return false;
15852       } else if (M >= 0) {
15853         if (V1BroadcastIdx < 0)
15854           V1BroadcastIdx = M;
15855         else if (M != V1BroadcastIdx)
15856           return false;
15857       }
15858     return true;
15859   };
15860   if (DoBothBroadcast())
15861     return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
15862                                                 DAG);
15863 
15864   // If the inputs all stem from a single 128-bit lane of each input, then we
15865   // split them rather than blending because the split will decompose to
15866   // unusually few instructions.
15867   int LaneCount = VT.getSizeInBits() / 128;
15868   int LaneSize = Size / LaneCount;
15869   SmallBitVector LaneInputs[2];
15870   LaneInputs[0].resize(LaneCount, false);
15871   LaneInputs[1].resize(LaneCount, false);
15872   for (int i = 0; i < Size; ++i)
15873     if (Mask[i] >= 0)
15874       LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
15875   if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
15876     return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
15877 
15878   // Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
15879   // requires that the decomposed single-input shuffles don't end up here.
15880   return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
15881                                               DAG);
15882 }
15883 
15884 // Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
15885 // TODO: Extend to support v8f32 (+ 512-bit shuffles).
15886 static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT,
15887                                                  SDValue V1, SDValue V2,
15888                                                  ArrayRef<int> Mask,
15889                                                  SelectionDAG &DAG) {
15890   assert(VT == MVT::v4f64 && "Only for v4f64 shuffles");
15891 
15892   int LHSMask[4] = {-1, -1, -1, -1};
15893   int RHSMask[4] = {-1, -1, -1, -1};
15894   unsigned SHUFPMask = 0;
15895 
15896   // As SHUFPD uses a single LHS/RHS element per lane, we can always
15897   // perform the shuffle once the lanes have been shuffled in place.
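  // e.g. for Mask = <1,4,3,6> the lane shuffles become LHS = <-1,1,-1,3> and
  // RHS = <4,-1,6,-1>, and SHUFPD with immediate 0b0101 picks the odd element
  // of each LHS lane and the even element of each RHS lane.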
15898   for (int i = 0; i != 4; ++i) {
15899     int M = Mask[i];
15900     if (M < 0)
15901       continue;
15902     int LaneBase = i & ~1;
15903     auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
15904     LaneMask[LaneBase + (M & 1)] = M;
15905     SHUFPMask |= (M & 1) << i;
15906   }
15907 
15908   SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);
15909   SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);
15910   return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
15911                      DAG.getTargetConstant(SHUFPMask, DL, MVT::i8));
15912 }
15913 
15914 /// Lower a vector shuffle crossing multiple 128-bit lanes as
15915 /// a lane permutation followed by a per-lane permutation.
15916 ///
15917 /// This is mainly for cases where we can have non-repeating permutes
15918 /// in each lane.
15919 ///
15920 /// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask,
15921 /// we should investigate merging them.
15922 static SDValue lowerShuffleAsLanePermuteAndPermute(
15923     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15924     SelectionDAG &DAG, const X86Subtarget &Subtarget) {
15925   int NumElts = VT.getVectorNumElements();
15926   int NumLanes = VT.getSizeInBits() / 128;
15927   int NumEltsPerLane = NumElts / NumLanes;
15928   bool CanUseSublanes = Subtarget.hasAVX2() && V2.isUndef();
15929 
15930   /// Attempts to find a sublane permute with the given size
15931   /// that gets all elements into their target lanes.
15932   ///
15933   /// Returns the lowered shuffle on success, or an empty SDValue if no such
15934   /// sublane permute exists.
15935   auto getSublanePermute = [&](int NumSublanes) -> SDValue {
15936     int NumSublanesPerLane = NumSublanes / NumLanes;
15937     int NumEltsPerSublane = NumElts / NumSublanes;
15938 
15939     SmallVector<int, 16> CrossLaneMask;
15940     SmallVector<int, 16> InLaneMask(NumElts, SM_SentinelUndef);
15941     // CrossLaneMask but one entry == one sublane.
15942     SmallVector<int, 16> CrossLaneMaskLarge(NumSublanes, SM_SentinelUndef);
15943 
15944     for (int i = 0; i != NumElts; ++i) {
15945       int M = Mask[i];
15946       if (M < 0)
15947         continue;
15948 
15949       int SrcSublane = M / NumEltsPerSublane;
15950       int DstLane = i / NumEltsPerLane;
15951 
15952       // We only need to get the elements into the right lane, not sublane.
15953       // So search all sublanes that make up the destination lane.
15954       bool Found = false;
15955       int DstSubStart = DstLane * NumSublanesPerLane;
15956       int DstSubEnd = DstSubStart + NumSublanesPerLane;
15957       for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) {
15958         if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane))
15959           continue;
15960 
15961         Found = true;
15962         CrossLaneMaskLarge[DstSublane] = SrcSublane;
15963         int DstSublaneOffset = DstSublane * NumEltsPerSublane;
15964         InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane;
15965         break;
15966       }
15967       if (!Found)
15968         return SDValue();
15969     }
15970 
15971     // Fill CrossLaneMask using CrossLaneMaskLarge.
15972     narrowShuffleMaskElts(NumEltsPerSublane, CrossLaneMaskLarge, CrossLaneMask);
15973 
15974     if (!CanUseSublanes) {
15975       // If we're only shuffling a single lowest lane and the rest are identity
15976       // then don't bother.
15977       // TODO - isShuffleMaskInputInPlace could be extended to something like
15978       // this.
15979       int NumIdentityLanes = 0;
15980       bool OnlyShuffleLowestLane = true;
15981       for (int i = 0; i != NumLanes; ++i) {
15982         int LaneOffset = i * NumEltsPerLane;
15983         if (isSequentialOrUndefInRange(InLaneMask, LaneOffset, NumEltsPerLane,
15984                                        i * NumEltsPerLane))
15985           NumIdentityLanes++;
15986         else if (CrossLaneMask[LaneOffset] != 0)
15987           OnlyShuffleLowestLane = false;
15988       }
15989       if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
15990         return SDValue();
15991     }
15992 
15993     SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);
15994     return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT),
15995                                 InLaneMask);
15996   };
15997 
15998   // First attempt a solution with full lanes.
15999   if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes))
16000     return V;
16001 
16002   // The rest of the solutions use sublanes.
16003   if (!CanUseSublanes)
16004     return SDValue();
16005 
16006   // Then attempt a solution with 64-bit sublanes (vpermq).
16007   if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes * 2))
16008     return V;
16009 
16010   // If that doesn't work and we have fast variable cross-lane shuffle,
16011   // attempt 32-bit sublanes (vpermd).
16012   if (!Subtarget.hasFastVariableCrossLaneShuffle())
16013     return SDValue();
16014 
16015   return getSublanePermute(/*NumSublanes=*/NumLanes * 4);
16016 }
16017 
16018 /// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
16019 /// source with a lane permutation.
16020 ///
16021 /// This lowering strategy results in four instructions in the worst case for
16022 /// a single-input cross-lane shuffle, which is fewer than any other fully
16023 /// general cross-lane shuffle strategy I'm aware of. Special cases for each
16024 /// particular shuffle pattern should be handled prior to trying this lowering.
16025 static SDValue lowerShuffleAsLanePermuteAndShuffle(
16026     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16027     SelectionDAG &DAG, const X86Subtarget &Subtarget) {
16028   // FIXME: This should probably be generalized for 512-bit vectors as well.
16029   assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
16030   int Size = Mask.size();
16031   int LaneSize = Size / 2;
16032 
16033   // Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
16034   // Only do this if the elements aren't all from the lower lane,
16035   // otherwise we're (probably) better off doing a split.
16036   if (VT == MVT::v4f64 &&
16037       !all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
16038     if (SDValue V =
16039             lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG))
16040       return V;
16041 
16042   // If there are only inputs from one 128-bit lane, splitting will in fact be
16043   // less expensive. The flags track whether the given lane contains an element
16044   // that crosses to another lane.
16045   if (!Subtarget.hasAVX2()) {
16046     bool LaneCrossing[2] = {false, false};
16047     for (int i = 0; i < Size; ++i)
16048       if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
16049         LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
16050     if (!LaneCrossing[0] || !LaneCrossing[1])
16051       return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
16052   } else {
16053     bool LaneUsed[2] = {false, false};
16054     for (int i = 0; i < Size; ++i)
16055       if (Mask[i] >= 0)
16056         LaneUsed[(Mask[i] % Size) / LaneSize] = true;
16057     if (!LaneUsed[0] || !LaneUsed[1])
16058       return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
16059   }
16060 
16061   // TODO - we could support shuffling V2 in the Flipped input.
16062   assert(V2.isUndef() &&
16063          "This last part of this routine only works on single input shuffles");
16064 
16065   SmallVector<int, 32> InLaneMask(Mask.begin(), Mask.end());
16066   for (int i = 0; i < Size; ++i) {
16067     int &M = InLaneMask[i];
16068     if (M < 0)
16069       continue;
16070     if (((M % Size) / LaneSize) != (i / LaneSize))
16071       M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
16072   }
16073   assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
16074          "In-lane shuffle mask expected");
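  // e.g. for a v8f32 reverse <7,6,5,4,3,2,1,0> every element crosses lanes, so
  // InLaneMask becomes <11,10,9,8,15,14,13,12>, reading everything from the
  // lane-flipped copy of V1 built below.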
16075 
16076   // Flip the lanes, and shuffle the results which should now be in-lane.
16077   MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
16078   SDValue Flipped = DAG.getBitcast(PVT, V1);
16079   Flipped =
16080       DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1});
16081   Flipped = DAG.getBitcast(VT, Flipped);
16082   return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask);
16083 }
16084 
16085 /// Handle lowering 2-lane 128-bit shuffles.
16086 static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
16087                                   SDValue V2, ArrayRef<int> Mask,
16088                                   const APInt &Zeroable,
16089                                   const X86Subtarget &Subtarget,
16090                                   SelectionDAG &DAG) {
16091   if (V2.isUndef()) {
16092     // Attempt to match VBROADCAST*128 subvector broadcast load.
16093     bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1);
16094     bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1);
16095     if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() && V1.hasOneUse() &&
16096         MayFoldLoad(peekThroughOneUseBitcasts(V1))) {
16097       auto *Ld = cast<LoadSDNode>(peekThroughOneUseBitcasts(V1));
16098       if (!Ld->isNonTemporal()) {
16099         MVT MemVT = VT.getHalfNumVectorElementsVT();
16100         unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
16101         SDVTList Tys = DAG.getVTList(VT, MVT::Other);
16102         SDValue Ptr = DAG.getMemBasePlusOffset(Ld->getBasePtr(),
16103                                                TypeSize::Fixed(Ofs), DL);
16104         SDValue Ops[] = {Ld->getChain(), Ptr};
16105         SDValue BcastLd = DAG.getMemIntrinsicNode(
16106             X86ISD::SUBV_BROADCAST_LOAD, DL, Tys, Ops, MemVT,
16107             DAG.getMachineFunction().getMachineMemOperand(
16108                 Ld->getMemOperand(), Ofs, MemVT.getStoreSize()));
16109         DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), BcastLd.getValue(1));
16110         return BcastLd;
16111       }
16112     }
16113 
16114     // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
16115     if (Subtarget.hasAVX2())
16116       return SDValue();
16117   }
16118 
16119   bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
16120 
16121   SmallVector<int, 4> WidenedMask;
16122   if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
16123     return SDValue();
16124 
16125   bool IsLowZero = (Zeroable & 0x3) == 0x3;
16126   bool IsHighZero = (Zeroable & 0xc) == 0xc;
16127 
16128   // Try to use an insert into a zero vector.
16129   if (WidenedMask[0] == 0 && IsHighZero) {
16130     MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
16131     SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
16132                               DAG.getIntPtrConstant(0, DL));
16133     return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
16134                        getZeroVector(VT, Subtarget, DAG, DL), LoV,
16135                        DAG.getIntPtrConstant(0, DL));
16136   }
16137 
16138   // TODO: If minimizing size and one of the inputs is a zero vector and the
16139   // zero vector has only one use, we could use a VPERM2X128 to save the
16140   // instruction bytes needed to explicitly generate the zero vector.
16141 
16142   // Blends are faster and handle all the non-lane-crossing cases.
16143   if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
16144                                           Subtarget, DAG))
16145     return Blend;
16146 
16147   // If either input operand is a zero vector, use VPERM2X128 because its mask
16148   // allows us to replace the zero input with an implicit zero.
16149   if (!IsLowZero && !IsHighZero) {
16150     // Check for patterns which can be matched with a single insert of a 128-bit
16151     // subvector.
16152     bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2);
16153     if (OnlyUsesV1 || isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2)) {
16154 
16155       // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
16156       // this will likely become vinsertf128 which can't fold a 256-bit memop.
16157       if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
16158         MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
16159         SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
16160                                      OnlyUsesV1 ? V1 : V2,
16161                                      DAG.getIntPtrConstant(0, DL));
16162         return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
16163                            DAG.getIntPtrConstant(2, DL));
16164       }
16165     }
16166 
16167     // Try to use SHUF128 if possible.
16168     if (Subtarget.hasVLX()) {
16169       if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
16170         unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
16171                             ((WidenedMask[1] % 2) << 1);
16172         return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
16173                            DAG.getTargetConstant(PermMask, DL, MVT::i8));
16174       }
16175     }
16176   }
16177 
16178   // Otherwise form a 128-bit permutation. After accounting for undefs,
16179   // convert the 64-bit shuffle mask selection values into 128-bit
16180   // selection bits by dividing the indexes by 2 and shifting into positions
16181   // defined by a vperm2*128 instruction's immediate control byte.
16182 
16183   // The immediate permute control byte looks like this:
16184   //    [1:0] - select 128 bits from sources for low half of destination
16185   //    [2]   - ignore
16186   //    [3]   - zero low half of destination
16187   //    [5:4] - select 128 bits from sources for high half of destination
16188   //    [6]   - ignore
16189   //    [7]   - zero high half of destination
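  // e.g. a v4f64 mask of <2,3,6,7> widens to <1,3>, giving
  // PermMask = (1 << 0) | (3 << 4) = 0x31, i.e. the upper halves of V1 and V2.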
16190 
16191   assert((WidenedMask[0] >= 0 || IsLowZero) &&
16192          (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
16193 
16194   unsigned PermMask = 0;
16195   PermMask |= IsLowZero  ? 0x08 : (WidenedMask[0] << 0);
16196   PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
16197 
16198   // Check the immediate mask and replace unused sources with undef.
16199   if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
16200     V1 = DAG.getUNDEF(VT);
16201   if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
16202     V2 = DAG.getUNDEF(VT);
16203 
16204   return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
16205                      DAG.getTargetConstant(PermMask, DL, MVT::i8));
16206 }
16207 
16208 /// Lower a vector shuffle by first fixing the 128-bit lanes and then
16209 /// shuffling each lane.
16210 ///
16211 /// This attempts to create a repeated lane shuffle where each lane uses one
16212 /// or two of the lanes of the inputs. The lanes of the input vectors are
16213 /// shuffled in one or two independent shuffles to get the lanes into the
16214 /// position needed by the final shuffle.
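/// e.g. a v8f32 mask of <0,8,1,9,12,4,13,5> is not lane-repeated as-is, but
/// with NewV1 = <V1.lo,V2.hi> and NewV2 = <V2.lo,V1.hi> it becomes the repeated
/// mask <0,8,1,9,4,12,5,13>, which each 128-bit lane can lower as an unpack.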
16215 static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
16216     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16217     const X86Subtarget &Subtarget, SelectionDAG &DAG) {
16218   assert(!V2.isUndef() && "This is only useful with multiple inputs.");
16219 
16220   if (is128BitLaneRepeatedShuffleMask(VT, Mask))
16221     return SDValue();
16222 
16223   int NumElts = Mask.size();
16224   int NumLanes = VT.getSizeInBits() / 128;
16225   int NumLaneElts = 128 / VT.getScalarSizeInBits();
16226   SmallVector<int, 16> RepeatMask(NumLaneElts, -1);
16227   SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});
16228 
16229   // First pass will try to fill in the RepeatMask from lanes that need two
16230   // sources.
16231   for (int Lane = 0; Lane != NumLanes; ++Lane) {
16232     int Srcs[2] = {-1, -1};
16233     SmallVector<int, 16> InLaneMask(NumLaneElts, -1);
16234     for (int i = 0; i != NumLaneElts; ++i) {
16235       int M = Mask[(Lane * NumLaneElts) + i];
16236       if (M < 0)
16237         continue;
16238       // Determine which of the possible input lanes (NumLanes from each source)
16239       // this element comes from. Assign that as one of the sources for this
16240       // lane. We can assign up to 2 sources for this lane. If we run out of
16241       // sources we can't do anything.
16242       int LaneSrc = M / NumLaneElts;
16243       int Src;
16244       if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
16245         Src = 0;
16246       else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
16247         Src = 1;
16248       else
16249         return SDValue();
16250 
16251       Srcs[Src] = LaneSrc;
16252       InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
16253     }
16254 
16255     // If this lane has two sources, see if it fits with the repeat mask so far.
16256     if (Srcs[1] < 0)
16257       continue;
16258 
16259     LaneSrcs[Lane][0] = Srcs[0];
16260     LaneSrcs[Lane][1] = Srcs[1];
16261 
16262     auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
16263       assert(M1.size() == M2.size() && "Unexpected mask size");
16264       for (int i = 0, e = M1.size(); i != e; ++i)
16265         if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
16266           return false;
16267       return true;
16268     };
16269 
16270     auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
16271       assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
16272       for (int i = 0, e = MergedMask.size(); i != e; ++i) {
16273         int M = Mask[i];
16274         if (M < 0)
16275           continue;
16276         assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
16277                "Unexpected mask element");
16278         MergedMask[i] = M;
16279       }
16280     };
16281 
16282     if (MatchMasks(InLaneMask, RepeatMask)) {
16283       // Merge this lane mask into the final repeat mask.
16284       MergeMasks(InLaneMask, RepeatMask);
16285       continue;
16286     }
16287 
16288     // Didn't find a match. Swap the operands and try again.
16289     std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
16290     ShuffleVectorSDNode::commuteMask(InLaneMask);
16291 
16292     if (MatchMasks(InLaneMask, RepeatMask)) {
16293       // Merge this lane mask into the final repeat mask.
16294       MergeMasks(InLaneMask, RepeatMask);
16295       continue;
16296     }
16297 
16298     // Couldn't find a match with the operands in either order.
16299     return SDValue();
16300   }
16301 
16302   // Now handle any lanes with only one source.
16303   for (int Lane = 0; Lane != NumLanes; ++Lane) {
16304     // If this lane has already been processed, skip it.
16305     if (LaneSrcs[Lane][0] >= 0)
16306       continue;
16307 
16308     for (int i = 0; i != NumLaneElts; ++i) {
16309       int M = Mask[(Lane * NumLaneElts) + i];
16310       if (M < 0)
16311         continue;
16312 
16313       // If RepeatMask isn't defined yet we can define it ourselves.
16314       if (RepeatMask[i] < 0)
16315         RepeatMask[i] = M % NumLaneElts;
16316 
16317       if (RepeatMask[i] < NumElts) {
16318         if (RepeatMask[i] != M % NumLaneElts)
16319           return SDValue();
16320         LaneSrcs[Lane][0] = M / NumLaneElts;
16321       } else {
16322         if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
16323           return SDValue();
16324         LaneSrcs[Lane][1] = M / NumLaneElts;
16325       }
16326     }
16327 
16328     if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
16329       return SDValue();
16330   }
16331 
16332   SmallVector<int, 16> NewMask(NumElts, -1);
16333   for (int Lane = 0; Lane != NumLanes; ++Lane) {
16334     int Src = LaneSrcs[Lane][0];
16335     for (int i = 0; i != NumLaneElts; ++i) {
16336       int M = -1;
16337       if (Src >= 0)
16338         M = Src * NumLaneElts + i;
16339       NewMask[Lane * NumLaneElts + i] = M;
16340     }
16341   }
16342   SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
16343   // Ensure we didn't get back the shuffle we started with.
16344   // FIXME: This is a hack to make up for some splat handling code in
16345   // getVectorShuffle.
16346   if (isa<ShuffleVectorSDNode>(NewV1) &&
16347       cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
16348     return SDValue();
16349 
16350   for (int Lane = 0; Lane != NumLanes; ++Lane) {
16351     int Src = LaneSrcs[Lane][1];
16352     for (int i = 0; i != NumLaneElts; ++i) {
16353       int M = -1;
16354       if (Src >= 0)
16355         M = Src * NumLaneElts + i;
16356       NewMask[Lane * NumLaneElts + i] = M;
16357     }
16358   }
16359   SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
16360   // Ensure we didn't get back the shuffle we started with.
16361   // FIXME: This is a hack to make up for some splat handling code in
16362   // getVectorShuffle.
16363   if (isa<ShuffleVectorSDNode>(NewV2) &&
16364       cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
16365     return SDValue();
16366 
16367   for (int i = 0; i != NumElts; ++i) {
16368     NewMask[i] = RepeatMask[i % NumLaneElts];
16369     if (NewMask[i] < 0)
16370       continue;
16371 
16372     NewMask[i] += (i / NumLaneElts) * NumLaneElts;
16373   }
16374   return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
16375 }
16376 
16377 /// If the input shuffle mask results in a vector that is undefined in all upper
16378 /// or lower half elements and that mask accesses only 2 halves of the
16379 /// shuffle's operands, return true. A mask of half the width with mask indexes
16380 /// adjusted to access the extracted halves of the original shuffle operands is
16381 /// returned in HalfMask. HalfIdx1 and HalfIdx2 return whether the upper or
16382 /// lower half of each input operand is accessed.
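/// e.g. for a v8i32 mask <u,u,u,u,8,9,12,13> the lower half of the result is
/// undef, HalfMask becomes <0,1,4,5>, HalfIdx1 = 2 (lower half of V2) and
/// HalfIdx2 = 3 (upper half of V2).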
16383 static bool
16384 getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
16385                    int &HalfIdx1, int &HalfIdx2) {
16386   assert((Mask.size() == HalfMask.size() * 2) &&
16387          "Expected input mask to be twice as long as output");
16388 
16389   // Exactly one half of the result must be undef to allow narrowing.
16390   bool UndefLower = isUndefLowerHalf(Mask);
16391   bool UndefUpper = isUndefUpperHalf(Mask);
16392   if (UndefLower == UndefUpper)
16393     return false;
16394 
16395   unsigned HalfNumElts = HalfMask.size();
16396   unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
16397   HalfIdx1 = -1;
16398   HalfIdx2 = -1;
16399   for (unsigned i = 0; i != HalfNumElts; ++i) {
16400     int M = Mask[i + MaskIndexOffset];
16401     if (M < 0) {
16402       HalfMask[i] = M;
16403       continue;
16404     }
16405 
16406     // Determine which of the 4 half vectors this element is from.
16407     // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
16408     int HalfIdx = M / HalfNumElts;
16409 
16410     // Determine the element index into its half vector source.
16411     int HalfElt = M % HalfNumElts;
16412 
16413     // We can shuffle with up to 2 half vectors, set the new 'half'
16414     // shuffle mask accordingly.
16415     if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
16416       HalfMask[i] = HalfElt;
16417       HalfIdx1 = HalfIdx;
16418       continue;
16419     }
16420     if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
16421       HalfMask[i] = HalfElt + HalfNumElts;
16422       HalfIdx2 = HalfIdx;
16423       continue;
16424     }
16425 
16426     // Too many half vectors referenced.
16427     return false;
16428   }
16429 
16430   return true;
16431 }
16432 
16433 /// Given the output values from getHalfShuffleMask(), create a half width
16434 /// shuffle of extracted vectors followed by an insert back to full width.
16435 static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
16436                                      ArrayRef<int> HalfMask, int HalfIdx1,
16437                                      int HalfIdx2, bool UndefLower,
16438                                      SelectionDAG &DAG, bool UseConcat = false) {
16439   assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
16440   assert(V1.getValueType().isSimple() && "Expecting only simple types");
16441 
16442   MVT VT = V1.getSimpleValueType();
16443   MVT HalfVT = VT.getHalfNumVectorElementsVT();
16444   unsigned HalfNumElts = HalfVT.getVectorNumElements();
16445 
16446   auto getHalfVector = [&](int HalfIdx) {
16447     if (HalfIdx < 0)
16448       return DAG.getUNDEF(HalfVT);
16449     SDValue V = (HalfIdx < 2 ? V1 : V2);
16450     HalfIdx = (HalfIdx % 2) * HalfNumElts;
16451     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
16452                        DAG.getIntPtrConstant(HalfIdx, DL));
16453   };
16454 
16455   // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
16456   SDValue Half1 = getHalfVector(HalfIdx1);
16457   SDValue Half2 = getHalfVector(HalfIdx2);
16458   SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
16459   if (UseConcat) {
16460     SDValue Op0 = V;
16461     SDValue Op1 = DAG.getUNDEF(HalfVT);
16462     if (UndefLower)
16463       std::swap(Op0, Op1);
16464     return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1);
16465   }
16466 
16467   unsigned Offset = UndefLower ? HalfNumElts : 0;
16468   return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
16469                      DAG.getIntPtrConstant(Offset, DL));
16470 }
16471 
16472 /// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
16473 /// This allows for fast cases such as subvector extraction/insertion
16474 /// or shuffling smaller vector types which can lower more efficiently.
16475 static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
16476                                          SDValue V2, ArrayRef<int> Mask,
16477                                          const X86Subtarget &Subtarget,
16478                                          SelectionDAG &DAG) {
16479   assert((VT.is256BitVector() || VT.is512BitVector()) &&
16480          "Expected 256-bit or 512-bit vector");
16481 
16482   bool UndefLower = isUndefLowerHalf(Mask);
16483   if (!UndefLower && !isUndefUpperHalf(Mask))
16484     return SDValue();
16485 
16486   assert((!UndefLower || !isUndefUpperHalf(Mask)) &&
16487          "Completely undef shuffle mask should have been simplified already");
16488 
16489   // Upper half is undef and lower half is whole upper subvector.
16490   // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
16491   MVT HalfVT = VT.getHalfNumVectorElementsVT();
16492   unsigned HalfNumElts = HalfVT.getVectorNumElements();
16493   if (!UndefLower &&
16494       isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
16495     SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
16496                              DAG.getIntPtrConstant(HalfNumElts, DL));
16497     return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
16498                        DAG.getIntPtrConstant(0, DL));
16499   }
16500 
16501   // Lower half is undef and upper half is whole lower subvector.
16502   // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
16503   if (UndefLower &&
16504       isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
16505     SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
16506                              DAG.getIntPtrConstant(0, DL));
16507     return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
16508                        DAG.getIntPtrConstant(HalfNumElts, DL));
16509   }
16510 
16511   int HalfIdx1, HalfIdx2;
16512   SmallVector<int, 8> HalfMask(HalfNumElts);
16513   if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
16514     return SDValue();
16515 
16516   assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
16517 
16518   // Only shuffle the halves of the inputs when useful.
16519   unsigned NumLowerHalves =
16520       (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
16521   unsigned NumUpperHalves =
16522       (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
16523   assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");
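  // Worked example (illustrative): HalfIdx1 == 0 and HalfIdx2 == 3 reference
  // the lower half of V1 and the upper half of V2, giving NumLowerHalves == 1
  // and NumUpperHalves == 1.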
16524 
16525   // Determine the larger pattern of undef/halves, then decide if it's worth
16526   // splitting the shuffle based on subtarget capabilities and types.
16527   unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
16528   if (!UndefLower) {
16529     // XXXXuuuu: no insert is needed.
16530     // Always extract lowers when setting lower - these are all free subreg ops.
16531     if (NumUpperHalves == 0)
16532       return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16533                                    UndefLower, DAG);
16534 
16535     if (NumUpperHalves == 1) {
16536       // AVX2 has efficient 32/64-bit element cross-lane shuffles.
16537       if (Subtarget.hasAVX2()) {
        // extract128 + vunpckhps/vshufps is better than vblend + vpermps.
16539         if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
16540             !is128BitUnpackShuffleMask(HalfMask) &&
16541             (!isSingleSHUFPSMask(HalfMask) ||
16542              Subtarget.hasFastVariableCrossLaneShuffle()))
16543           return SDValue();
16544         // If this is a unary shuffle (assume that the 2nd operand is
16545         // canonicalized to undef), then we can use vpermpd. Otherwise, we
16546         // are better off extracting the upper half of 1 operand and using a
16547         // narrow shuffle.
16548         if (EltWidth == 64 && V2.isUndef())
16549           return SDValue();
16550       }
16551       // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
16552       if (Subtarget.hasAVX512() && VT.is512BitVector())
16553         return SDValue();
16554       // Extract + narrow shuffle is better than the wide alternative.
16555       return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16556                                    UndefLower, DAG);
16557     }
16558 
16559     // Don't extract both uppers, instead shuffle and then extract.
16560     assert(NumUpperHalves == 2 && "Half vector count went wrong");
16561     return SDValue();
16562   }
16563 
16564   // UndefLower - uuuuXXXX: an insert to high half is required if we split this.
16565   if (NumUpperHalves == 0) {
16566     // AVX2 has efficient 64-bit element cross-lane shuffles.
16567     // TODO: Refine to account for unary shuffle, splat, and other masks?
16568     if (Subtarget.hasAVX2() && EltWidth == 64)
16569       return SDValue();
16570     // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
16571     if (Subtarget.hasAVX512() && VT.is512BitVector())
16572       return SDValue();
16573     // Narrow shuffle + insert is better than the wide alternative.
16574     return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16575                                  UndefLower, DAG);
16576   }
16577 
16578   // NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.
16579   return SDValue();
16580 }
16581 
16582 /// Test whether the specified input (0 or 1) is in-place blended by the
16583 /// given mask.
16584 ///
16585 /// This returns true if the elements from a particular input are already in the
16586 /// slot required by the given mask and require no permutation.
static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
16588   assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
16589   int Size = Mask.size();
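  // Illustrative example (not from the original source): for two v4 inputs,
  // Mask = <6, 7, 2, 3> keeps input 0 in place (elements 2 and 3 stay in
  // slots 2 and 3) but not input 1 (element 6 lands in slot 0, not slot 2).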
16590   for (int i = 0; i < Size; ++i)
16591     if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
16592       return false;
16593 
16594   return true;
16595 }
16596 
16597 /// Handle case where shuffle sources are coming from the same 128-bit lane and
16598 /// every lane can be represented as the same repeating mask - allowing us to
16599 /// shuffle the sources with the repeating shuffle and then permute the result
16600 /// to the destination lanes.
static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
16604   int NumElts = VT.getVectorNumElements();
16605   int NumLanes = VT.getSizeInBits() / 128;
16606   int NumLaneElts = NumElts / NumLanes;
16607 
16608   // On AVX2 we may be able to just shuffle the lowest elements and then
16609   // broadcast the result.
16610   if (Subtarget.hasAVX2()) {
16611     for (unsigned BroadcastSize : {16, 32, 64}) {
16612       if (BroadcastSize <= VT.getScalarSizeInBits())
16613         continue;
16614       int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
16615 
      // Attempt to match a repeating pattern every NumBroadcastElts,
      // accounting for UNDEFs, but only referencing the lowest 128-bit
      // lane of the inputs.
16619       auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
16620         for (int i = 0; i != NumElts; i += NumBroadcastElts)
16621           for (int j = 0; j != NumBroadcastElts; ++j) {
16622             int M = Mask[i + j];
16623             if (M < 0)
16624               continue;
16625             int &R = RepeatMask[j];
16626             if (0 != ((M % NumElts) / NumLaneElts))
16627               return false;
16628             if (0 <= R && R != M)
16629               return false;
16630             R = M;
16631           }
16632         return true;
16633       };
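      // Illustrative example (assumption, not from the source): for v8i32 with
      // BroadcastSize == 64, NumBroadcastElts == 2 and a mask such as
      // <0,1, 0,1, 0,1, 0,1> yields RepeatMask <0,1,u,u,u,u,u,u>, which is
      // then shuffled in place and broadcast below.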
16634 
16635       SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
16636       if (!FindRepeatingBroadcastMask(RepeatMask))
16637         continue;
16638 
16639       // Shuffle the (lowest) repeated elements in place for broadcast.
16640       SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
16641 
16642       // Shuffle the actual broadcast.
16643       SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
16644       for (int i = 0; i != NumElts; i += NumBroadcastElts)
16645         for (int j = 0; j != NumBroadcastElts; ++j)
16646           BroadcastMask[i + j] = j;
16647       return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
16648                                   BroadcastMask);
16649     }
16650   }
16651 
16652   // Bail if the shuffle mask doesn't cross 128-bit lanes.
16653   if (!is128BitLaneCrossingShuffleMask(VT, Mask))
16654     return SDValue();
16655 
16656   // Bail if we already have a repeated lane shuffle mask.
16657   SmallVector<int, 8> RepeatedShuffleMask;
16658   if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
16659     return SDValue();
16660 
16661   // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
16662   // (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.
16663   int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
16664   int NumSubLanes = NumLanes * SubLaneScale;
16665   int NumSubLaneElts = NumLaneElts / SubLaneScale;
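  // E.g. (illustrative): for v8f32 on AVX2, NumLanes == 2, NumLaneElts == 4
  // and SubLaneScale == 2, so NumSubLanes == 4 and NumSubLaneElts == 2, i.e.
  // the final permute works on 64-bit sub-lanes.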
16666 
16667   // Check that all the sources are coming from the same lane and see if we can
16668   // form a repeating shuffle mask (local to each sub-lane). At the same time,
16669   // determine the source sub-lane for each destination sub-lane.
16670   int TopSrcSubLane = -1;
16671   SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
16672   SmallVector<int, 8> RepeatedSubLaneMasks[2] = {
16673       SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef),
16674       SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)};
16675 
16676   for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
16677     // Extract the sub-lane mask, check that it all comes from the same lane
16678     // and normalize the mask entries to come from the first lane.
16679     int SrcLane = -1;
16680     SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
16681     for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
16682       int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
16683       if (M < 0)
16684         continue;
16685       int Lane = (M % NumElts) / NumLaneElts;
16686       if ((0 <= SrcLane) && (SrcLane != Lane))
16687         return SDValue();
16688       SrcLane = Lane;
16689       int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
16690       SubLaneMask[Elt] = LocalM;
16691     }
16692 
16693     // Whole sub-lane is UNDEF.
16694     if (SrcLane < 0)
16695       continue;
16696 
16697     // Attempt to match against the candidate repeated sub-lane masks.
16698     for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
16699       auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
16700         for (int i = 0; i != NumSubLaneElts; ++i) {
16701           if (M1[i] < 0 || M2[i] < 0)
16702             continue;
16703           if (M1[i] != M2[i])
16704             return false;
16705         }
16706         return true;
16707       };
16708 
16709       auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
16710       if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
16711         continue;
16712 
16713       // Merge the sub-lane mask into the matching repeated sub-lane mask.
16714       for (int i = 0; i != NumSubLaneElts; ++i) {
16715         int M = SubLaneMask[i];
16716         if (M < 0)
16717           continue;
16718         assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
16719                "Unexpected mask element");
16720         RepeatedSubLaneMask[i] = M;
16721       }
16722 
16723       // Track the top most source sub-lane - by setting the remaining to UNDEF
16724       // we can greatly simplify shuffle matching.
16725       int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
16726       TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
16727       Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
16728       break;
16729     }
16730 
16731     // Bail if we failed to find a matching repeated sub-lane mask.
16732     if (Dst2SrcSubLanes[DstSubLane] < 0)
16733       return SDValue();
16734   }
16735   assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
16736          "Unexpected source lane");
16737 
16738   // Create a repeating shuffle mask for the entire vector.
16739   SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
16740   for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
16741     int Lane = SubLane / SubLaneScale;
16742     auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
16743     for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
16744       int M = RepeatedSubLaneMask[Elt];
16745       if (M < 0)
16746         continue;
16747       int Idx = (SubLane * NumSubLaneElts) + Elt;
16748       RepeatedMask[Idx] = M + (Lane * NumLaneElts);
16749     }
16750   }
16751   SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
16752 
16753   // Shuffle each source sub-lane to its destination.
16754   SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
16755   for (int i = 0; i != NumElts; i += NumSubLaneElts) {
16756     int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
16757     if (SrcSubLane < 0)
16758       continue;
16759     for (int j = 0; j != NumSubLaneElts; ++j)
16760       SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
16761   }
16762 
16763   return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
16764                               SubLaneMask);
16765 }
16766 
static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
                                   bool &ForceV1Zero, bool &ForceV2Zero,
                                   unsigned &ShuffleImm, ArrayRef<int> Mask,
                                   const APInt &Zeroable) {
16771   int NumElts = VT.getVectorNumElements();
16772   assert(VT.getScalarSizeInBits() == 64 &&
16773          (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
16774          "Unexpected data type for VSHUFPD");
16775   assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&
16776          "Illegal shuffle mask");
16777 
16778   bool ZeroLane[2] = { true, true };
16779   for (int i = 0; i < NumElts; ++i)
16780     ZeroLane[i & 1] &= Zeroable[i];
16781 
  // Mask for V8F64: 0/1,  8/9,  2/3,  10/11, 4/5, ..
  // Mask for V4F64: 0/1,  4/5,  2/3,  6/7, ..
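  // Worked example (illustrative): for v4f64 and Mask <1, 5, 2, 7>, each
  // element contributes (Mask[i] % 2) << i, giving ShuffleImm == 0b1011, i.e.
  // vshufpd $11 selecting { V1[1], V2[1], V1[2], V2[3] }.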
16784   ShuffleImm = 0;
16785   bool ShufpdMask = true;
16786   bool CommutableMask = true;
16787   for (int i = 0; i < NumElts; ++i) {
16788     if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])
16789       continue;
16790     if (Mask[i] < 0)
16791       return false;
16792     int Val = (i & 6) + NumElts * (i & 1);
16793     int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
16794     if (Mask[i] < Val || Mask[i] > Val + 1)
16795       ShufpdMask = false;
16796     if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
16797       CommutableMask = false;
16798     ShuffleImm |= (Mask[i] % 2) << i;
16799   }
16800 
16801   if (!ShufpdMask && !CommutableMask)
16802     return false;
16803 
16804   if (!ShufpdMask && CommutableMask)
16805     std::swap(V1, V2);
16806 
16807   ForceV1Zero = ZeroLane[0];
16808   ForceV2Zero = ZeroLane[1];
16809   return true;
16810 }
16811 
static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1,
                                      SDValue V2, ArrayRef<int> Mask,
                                      const APInt &Zeroable,
                                      const X86Subtarget &Subtarget,
                                      SelectionDAG &DAG) {
16817   assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
16818          "Unexpected data type for VSHUFPD");
16819 
16820   unsigned Immediate = 0;
16821   bool ForceV1Zero = false, ForceV2Zero = false;
16822   if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate,
16823                               Mask, Zeroable))
16824     return SDValue();
16825 
16826   // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
16827   if (ForceV1Zero)
16828     V1 = getZeroVector(VT, Subtarget, DAG, DL);
16829   if (ForceV2Zero)
16830     V2 = getZeroVector(VT, Subtarget, DAG, DL);
16831 
16832   return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
16833                      DAG.getTargetConstant(Immediate, DL, MVT::i8));
16834 }
16835 
// Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
// by zeroable elements in the remaining 24 elements. Turn this into two
// vmovqb instructions shuffled together.
static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
                                             SDValue V1, SDValue V2,
                                             ArrayRef<int> Mask,
                                             const APInt &Zeroable,
                                             SelectionDAG &DAG) {
16844   assert(VT == MVT::v32i8 && "Unexpected type!");
16845 
16846   // The first 8 indices should be every 8th element.
16847   if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8))
16848     return SDValue();
16849 
16850   // Remaining elements need to be zeroable.
16851   if (Zeroable.countLeadingOnes() < (Mask.size() - 8))
16852     return SDValue();
16853 
16854   V1 = DAG.getBitcast(MVT::v4i64, V1);
16855   V2 = DAG.getBitcast(MVT::v4i64, V2);
16856 
16857   V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
16858   V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);
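  // Illustrative note: each VTRUNC keeps only the low byte of every i64, so V1
  // now holds its original bytes 0, 8, 16 and 24 in the low four lanes with
  // zeros above (and likewise for V2).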
16859 
16860   // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
16861   // the upper bits of the result using an unpckldq.
16862   SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
16863                                         { 0, 1, 2, 3, 16, 17, 18, 19,
16864                                           4, 5, 6, 7, 20, 21, 22, 23 });
16865   // Insert the unpckldq into a zero vector to widen to v32i8.
16866   return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
16867                      DAG.getConstant(0, DL, MVT::v32i8), Unpack,
16868                      DAG.getIntPtrConstant(0, DL));
16869 }
16870 
16871 
16872 /// Handle lowering of 4-lane 64-bit floating point shuffles.
16873 ///
16874 /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
16875 /// isn't available.
static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                 const APInt &Zeroable, SDValue V1, SDValue V2,
                                 const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
16880   assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
16881   assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
16882   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
16883 
16884   if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
16885                                      Subtarget, DAG))
16886     return V;
16887 
16888   if (V2.isUndef()) {
16889     // Check for being able to broadcast a single element.
16890     if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
16891                                                     Mask, Subtarget, DAG))
16892       return Broadcast;
16893 
16894     // Use low duplicate instructions for masks that match their pattern.
16895     if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
16896       return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
16897 
16898     if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
16899       // Non-half-crossing single input shuffles can be lowered with an
16900       // interleaved permutation.
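      // E.g. (illustrative): Mask <1, 0, 3, 2> gives VPERMILPMask == 0b0101,
      // i.e. swap the two doubles within each 128-bit lane.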
16901       unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
16902                               ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
16903       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
16904                          DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
16905     }
16906 
16907     // With AVX2 we have direct support for this permutation.
16908     if (Subtarget.hasAVX2())
16909       return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
16910                          getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
16911 
16912     // Try to create an in-lane repeating shuffle mask and then shuffle the
16913     // results into the target lanes.
16914     if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16915             DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16916       return V;
16917 
16918     // Try to permute the lanes and then use a per-lane permute.
16919     if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
16920                                                         Mask, DAG, Subtarget))
16921       return V;
16922 
16923     // Otherwise, fall back.
16924     return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,
16925                                                DAG, Subtarget);
16926   }
16927 
16928   // Use dedicated unpack instructions for masks that match their pattern.
16929   if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
16930     return V;
16931 
16932   if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
16933                                           Zeroable, Subtarget, DAG))
16934     return Blend;
16935 
16936   // Check if the blend happens to exactly fit that of SHUFPD.
16937   if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,
16938                                           Zeroable, Subtarget, DAG))
16939     return Op;
16940 
16941   // If we have lane crossing shuffles AND they don't all come from the lower
16942   // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
16943   // TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently
16944   // canonicalize to a blend of splat which isn't necessary for this combine.
16945   if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
16946       !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
16947       (V1.getOpcode() != ISD::BUILD_VECTOR) &&
16948       (V2.getOpcode() != ISD::BUILD_VECTOR))
16949     if (SDValue Op = lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2,
16950                                                        Mask, DAG))
16951       return Op;
16952 
16953   // If we have one input in place, then we can permute the other input and
16954   // blend the result.
16955   if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask))
16956     return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
16957                                                 Subtarget, DAG);
16958 
16959   // Try to create an in-lane repeating shuffle mask and then shuffle the
16960   // results into the target lanes.
16961   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16962           DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16963     return V;
16964 
  // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle. However, if we have AVX2 and either input is already in place,
  // we will be able to shuffle the other input, even across lanes, in a
  // single instruction, so skip this pattern.
16969   if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
16970                                 isShuffleMaskInputInPlace(1, Mask))))
16971     if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask(
16972             DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16973       return V;
16974 
16975   // If we have VLX support, we can use VEXPAND.
16976   if (Subtarget.hasVLX())
16977     if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask, V1, V2,
16978                                          DAG, Subtarget))
16979       return V;
16980 
  // If we have AVX2 then we always want to lower with a blend because at v4 we
  // can fully permute the elements.
16983   if (Subtarget.hasAVX2())
16984     return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
16985                                                 Subtarget, DAG);
16986 
16987   // Otherwise fall back on generic lowering.
16988   return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask,
16989                                     Subtarget, DAG);
16990 }
16991 
16992 /// Handle lowering of 4-lane 64-bit integer shuffles.
16993 ///
16994 /// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v4i64 shuffling.
static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                 const APInt &Zeroable, SDValue V1, SDValue V2,
                                 const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
17000   assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
17001   assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
17002   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
17003   assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
17004 
17005   if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
17006                                      Subtarget, DAG))
17007     return V;
17008 
17009   if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
17010                                           Zeroable, Subtarget, DAG))
17011     return Blend;
17012 
17013   // Check for being able to broadcast a single element.
17014   if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
17015                                                   Subtarget, DAG))
17016     return Broadcast;
17017 
17018   if (V2.isUndef()) {
17019     // When the shuffle is mirrored between the 128-bit lanes of the unit, we
17020     // can use lower latency instructions that will operate on both lanes.
17021     SmallVector<int, 2> RepeatedMask;
17022     if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
17023       SmallVector<int, 4> PSHUFDMask;
17024       narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask);
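      // E.g. (illustrative): RepeatedMask <1, 0> becomes the 32-bit PSHUFDMask
      // <2, 3, 0, 1>, swapping the two i64 elements within each 128-bit lane.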
17025       return DAG.getBitcast(
17026           MVT::v4i64,
17027           DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
17028                       DAG.getBitcast(MVT::v8i32, V1),
17029                       getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
17030     }
17031 
17032     // AVX2 provides a direct instruction for permuting a single input across
17033     // lanes.
17034     return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
17035                        getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
17036   }
17037 
17038   // Try to use shift instructions.
17039   if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
17040                                           Zeroable, Subtarget, DAG))
17041     return Shift;
17042 
17043   // If we have VLX support, we can use VALIGN or VEXPAND.
17044   if (Subtarget.hasVLX()) {
17045     if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask,
17046                                               Subtarget, DAG))
17047       return Rotate;
17048 
17049     if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask, V1, V2,
17050                                          DAG, Subtarget))
17051       return V;
17052   }
17053 
17054   // Try to use PALIGNR.
17055   if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
17056                                                 Subtarget, DAG))
17057     return Rotate;
17058 
17059   // Use dedicated unpack instructions for masks that match their pattern.
17060   if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
17061     return V;
17062 
17063   // If we have one input in place, then we can permute the other input and
17064   // blend the result.
17065   if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask))
17066     return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
17067                                                 Subtarget, DAG);
17068 
17069   // Try to create an in-lane repeating shuffle mask and then shuffle the
17070   // results into the target lanes.
17071   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17072           DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
17073     return V;
17074 
  // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle. However, if we have AVX2 and either input is already in place,
  // we will be able to shuffle the other input, even across lanes, in a
  // single instruction, so skip this pattern.
17079   if (!isShuffleMaskInputInPlace(0, Mask) &&
17080       !isShuffleMaskInputInPlace(1, Mask))
17081     if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17082             DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
17083       return Result;
17084 
17085   // Otherwise fall back on generic blend lowering.
17086   return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
17087                                               Subtarget, DAG);
17088 }
17089 
17090 /// Handle lowering of 8-lane 32-bit floating point shuffles.
17091 ///
17092 /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
17093 /// isn't available.
static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                 const APInt &Zeroable, SDValue V1, SDValue V2,
                                 const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
17098   assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
17099   assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
17100   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17101 
17102   if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
17103                                           Zeroable, Subtarget, DAG))
17104     return Blend;
17105 
17106   // Check for being able to broadcast a single element.
17107   if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
17108                                                   Subtarget, DAG))
17109     return Broadcast;
17110 
17111   // If the shuffle mask is repeated in each 128-bit lane, we have many more
17112   // options to efficiently lower the shuffle.
17113   SmallVector<int, 4> RepeatedMask;
17114   if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
17115     assert(RepeatedMask.size() == 4 &&
17116            "Repeated masks must be half the mask width!");
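    // E.g. (illustrative): Mask <0,0,2,2,4,4,6,6> repeats <0,0,2,2> in each
    // 128-bit lane and is matched by the MOVSLDUP pattern below.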
17117 
17118     // Use even/odd duplicate instructions for masks that match their pattern.
17119     if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
17120       return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
17121     if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
17122       return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
17123 
17124     if (V2.isUndef())
17125       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
17126                          getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17127 
17128     // Use dedicated unpack instructions for masks that match their pattern.
17129     if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
17130       return V;
17131 
17132     // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
17133     // have already handled any direct blends.
17134     return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
17135   }
17136 
17137   // Try to create an in-lane repeating shuffle mask and then shuffle the
17138   // results into the target lanes.
17139   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17140           DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
17141     return V;
17142 
  // If we have a single-input shuffle with different shuffle patterns in the
  // two 128-bit lanes, use a variable mask with VPERMILPS.
17145   if (V2.isUndef()) {
17146     if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) {
17147       SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
17148       return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
17149     }
17150     if (Subtarget.hasAVX2()) {
17151       SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
17152       return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
17153     }
17154     // Otherwise, fall back.
17155     return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
17156                                                DAG, Subtarget);
17157   }
17158 
17159   // Try to simplify this by merging 128-bit lanes to enable a lane-based
17160   // shuffle.
17161   if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17162           DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
17163     return Result;
17164 
17165   // If we have VLX support, we can use VEXPAND.
17166   if (Subtarget.hasVLX())
17167     if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask, V1, V2,
17168                                          DAG, Subtarget))
17169       return V;
17170 
  // For non-AVX512, if the mask is of 16-bit elements within each lane, try to
  // split, since after splitting we get more efficient code using vpunpcklwd
  // and vpunpckhwd instructions instead of vblend.
17174   if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))
17175     return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Subtarget,
17176                                       DAG);
17177 
17178   // If we have AVX2 then we always want to lower with a blend because at v8 we
17179   // can fully permute the elements.
17180   if (Subtarget.hasAVX2())
17181     return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8f32, V1, V2, Mask,
17182                                                 Subtarget, DAG);
17183 
17184   // Otherwise fall back on generic lowering.
17185   return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask,
17186                                     Subtarget, DAG);
17187 }
17188 
17189 /// Handle lowering of 8-lane 32-bit integer shuffles.
17190 ///
17191 /// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v8i32 shuffling.
static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                 const APInt &Zeroable, SDValue V1, SDValue V2,
                                 const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
17197   assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
17198   assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
17199   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17200   assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
17201 
17202   // Whenever we can lower this as a zext, that instruction is strictly faster
17203   // than any alternative. It also allows us to fold memory operands into the
17204   // shuffle in many cases.
17205   if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
17206                                                    Zeroable, Subtarget, DAG))
17207     return ZExt;
17208 
  // For non-AVX512, if the mask is of 16-bit elements within each lane, try to
  // split, since after splitting we get more efficient code than vblend by
  // using vpunpcklwd and vpunpckhwd instructions.
17212   if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() &&
17213       !Subtarget.hasAVX512())
17214     return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Subtarget,
17215                                       DAG);
17216 
17217   if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
17218                                           Zeroable, Subtarget, DAG))
17219     return Blend;
17220 
17221   // Check for being able to broadcast a single element.
17222   if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
17223                                                   Subtarget, DAG))
17224     return Broadcast;
17225 
17226   // If the shuffle mask is repeated in each 128-bit lane we can use more
17227   // efficient instructions that mirror the shuffles across the two 128-bit
17228   // lanes.
17229   SmallVector<int, 4> RepeatedMask;
17230   bool Is128BitLaneRepeatedShuffle =
17231       is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
17232   if (Is128BitLaneRepeatedShuffle) {
17233     assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
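    // E.g. (illustrative): Mask <3,2,1,0,7,6,5,4> has RepeatedMask <3,2,1,0>,
    // so a single PSHUFD with immediate 0x1B reverses both 128-bit lanes when
    // V2 is undef.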
17234     if (V2.isUndef())
17235       return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
17236                          getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17237 
17238     // Use dedicated unpack instructions for masks that match their pattern.
17239     if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
17240       return V;
17241   }
17242 
17243   // Try to use shift instructions.
17244   if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
17245                                           Zeroable, Subtarget, DAG))
17246     return Shift;
17247 
17248   // If we have VLX support, we can use VALIGN or EXPAND.
17249   if (Subtarget.hasVLX()) {
17250     if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask,
17251                                               Subtarget, DAG))
17252       return Rotate;
17253 
17254     if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask, V1, V2,
17255                                          DAG, Subtarget))
17256       return V;
17257   }
17258 
17259   // Try to use byte rotation instructions.
17260   if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,
17261                                                 Subtarget, DAG))
17262     return Rotate;
17263 
17264   // Try to create an in-lane repeating shuffle mask and then shuffle the
17265   // results into the target lanes.
17266   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17267           DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
17268     return V;
17269 
17270   if (V2.isUndef()) {
17271     // Try to produce a fixed cross-128-bit lane permute followed by unpack
17272     // because that should be faster than the variable permute alternatives.
17273     if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, Mask, V1, V2, DAG))
17274       return V;
17275 
17276     // If the shuffle patterns aren't repeated but it's a single input, directly
17277     // generate a cross-lane VPERMD instruction.
17278     SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
17279     return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
17280   }
17281 
17282   // Assume that a single SHUFPS is faster than an alternative sequence of
17283   // multiple instructions (even if the CPU has a domain penalty).
17284   // If some CPU is harmed by the domain switch, we can fix it in a later pass.
17285   if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
17286     SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
17287     SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
17288     SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
17289                                             CastV1, CastV2, DAG);
17290     return DAG.getBitcast(MVT::v8i32, ShufPS);
17291   }
17292 
17293   // Try to simplify this by merging 128-bit lanes to enable a lane-based
17294   // shuffle.
17295   if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17296           DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
17297     return Result;
17298 
17299   // Otherwise fall back on generic blend lowering.
17300   return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask,
17301                                               Subtarget, DAG);
17302 }
17303 
17304 /// Handle lowering of 16-lane 16-bit integer shuffles.
17305 ///
17306 /// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v16i16 shuffling.
static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                  const APInt &Zeroable, SDValue V1, SDValue V2,
                                  const X86Subtarget &Subtarget,
                                  SelectionDAG &DAG) {
17312   assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
17313   assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
17314   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
17315   assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
17316 
17317   // Whenever we can lower this as a zext, that instruction is strictly faster
17318   // than any alternative. It also allows us to fold memory operands into the
17319   // shuffle in many cases.
17320   if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17321           DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
17322     return ZExt;
17323 
17324   // Check for being able to broadcast a single element.
17325   if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,
17326                                                   Subtarget, DAG))
17327     return Broadcast;
17328 
17329   if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
17330                                           Zeroable, Subtarget, DAG))
17331     return Blend;
17332 
17333   // Use dedicated unpack instructions for masks that match their pattern.
17334   if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
17335     return V;
17336 
17337   // Use dedicated pack instructions for masks that match their pattern.
17338   if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG,
17339                                        Subtarget))
17340     return V;
17341 
  // Try to lower using a truncation.
17343   if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
17344                                        Subtarget, DAG))
17345     return V;
17346 
17347   // Try to use shift instructions.
17348   if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
17349                                           Zeroable, Subtarget, DAG))
17350     return Shift;
17351 
17352   // Try to use byte rotation instructions.
17353   if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
17354                                                 Subtarget, DAG))
17355     return Rotate;
17356 
17357   // Try to create an in-lane repeating shuffle mask and then shuffle the
17358   // results into the target lanes.
17359   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17360           DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
17361     return V;
17362 
17363   if (V2.isUndef()) {
17364     // Try to use bit rotation instructions.
17365     if (SDValue Rotate =
17366             lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask, Subtarget, DAG))
17367       return Rotate;
17368 
17369     // Try to produce a fixed cross-128-bit lane permute followed by unpack
17370     // because that should be faster than the variable permute alternatives.
17371     if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, Mask, V1, V2, DAG))
17372       return V;
17373 
17374     // There are no generalized cross-lane shuffle operations available on i16
17375     // element types.
17376     if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
17377       if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17378               DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
17379         return V;
17380 
17381       return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask,
17382                                                  DAG, Subtarget);
17383     }
17384 
17385     SmallVector<int, 8> RepeatedMask;
17386     if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
17387       // As this is a single-input shuffle, the repeated mask should be
17388       // a strictly valid v8i16 mask that we can pass through to the v8i16
17389       // lowering to handle even the v16 case.
17390       return lowerV8I16GeneralSingleInputShuffle(
17391           DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
17392     }
17393   }
17394 
17395   if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2,
17396                                               Zeroable, Subtarget, DAG))
17397     return PSHUFB;
17398 
17399   // AVX512BW can lower to VPERMW (non-VLX will pad to v32i16).
17400   if (Subtarget.hasBWI())
17401     return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, Subtarget, DAG);
17402 
17403   // Try to simplify this by merging 128-bit lanes to enable a lane-based
17404   // shuffle.
17405   if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17406           DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
17407     return Result;
17408 
17409   // Try to permute the lanes and then use a per-lane permute.
17410   if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17411           DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
17412     return V;
17413 
17414   // Otherwise fall back on generic lowering.
17415   return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask,
17416                                     Subtarget, DAG);
17417 }
17418 
17419 /// Handle lowering of 32-lane 8-bit integer shuffles.
17420 ///
17421 /// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v32i8 shuffling.
static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                 const APInt &Zeroable, SDValue V1, SDValue V2,
                                 const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
17427   assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
17428   assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
17429   assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
17430   assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
17431 
17432   // Whenever we can lower this as a zext, that instruction is strictly faster
17433   // than any alternative. It also allows us to fold memory operands into the
17434   // shuffle in many cases.
17435   if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask,
17436                                                    Zeroable, Subtarget, DAG))
17437     return ZExt;
17438 
17439   // Check for being able to broadcast a single element.
17440   if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
17441                                                   Subtarget, DAG))
17442     return Broadcast;
17443 
17444   if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
17445                                           Zeroable, Subtarget, DAG))
17446     return Blend;
17447 
17448   // Use dedicated unpack instructions for masks that match their pattern.
17449   if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
17450     return V;
17451 
17452   // Use dedicated pack instructions for masks that match their pattern.
17453   if (SDValue V = lowerShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG,
17454                                        Subtarget))
17455     return V;
17456 
  // Try to lower using a truncation.
17458   if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
17459                                        Subtarget, DAG))
17460     return V;
17461 
17462   // Try to use shift instructions.
17463   if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
17464                                           Zeroable, Subtarget, DAG))
17465     return Shift;
17466 
17467   // Try to use byte rotation instructions.
17468   if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
17469                                                 Subtarget, DAG))
17470     return Rotate;
17471 
17472   // Try to use bit rotation instructions.
17473   if (V2.isUndef())
17474     if (SDValue Rotate =
17475             lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask, Subtarget, DAG))
17476       return Rotate;
17477 
17478   // Try to create an in-lane repeating shuffle mask and then shuffle the
17479   // results into the target lanes.
17480   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17481           DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17482     return V;
17483 
17484   // There are no generalized cross-lane shuffle operations available on i8
17485   // element types.
17486   if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
17487     // Try to produce a fixed cross-128-bit lane permute followed by unpack
17488     // because that should be faster than the variable permute alternatives.
17489     if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, Mask, V1, V2, DAG))
17490       return V;
17491 
17492     if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17493             DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
17494       return V;
17495 
17496     return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
17497                                                DAG, Subtarget);
17498   }
17499 
17500   if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
17501                                               Zeroable, Subtarget, DAG))
17502     return PSHUFB;
17503 
17504   // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
17505   if (Subtarget.hasVBMI())
17506     return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, Subtarget, DAG);
17507 
17508   // Try to simplify this by merging 128-bit lanes to enable a lane-based
17509   // shuffle.
17510   if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17511           DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17512     return Result;
17513 
17514   // Try to permute the lanes and then use a per-lane permute.
17515   if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17516           DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
17517     return V;
17518 
  // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
  // by zeroable elements in the remaining 24 elements. Turn this into two
  // vmovqb instructions shuffled together.
17522   if (Subtarget.hasVLX())
17523     if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
17524                                                   Mask, Zeroable, DAG))
17525       return V;
17526 
17527   // Otherwise fall back on generic lowering.
17528   return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask,
17529                                     Subtarget, DAG);
17530 }
17531 
17532 /// High-level routine to lower various 256-bit x86 vector shuffles.
17533 ///
17534 /// This routine either breaks down the specific type of a 256-bit x86 vector
17535 /// shuffle or splits it into two 128-bit shuffles and fuses the results back
17536 /// together based on the available instructions.
static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
                                  SDValue V1, SDValue V2, const APInt &Zeroable,
                                  const X86Subtarget &Subtarget,
                                  SelectionDAG &DAG) {
17541   // If we have a single input to the zero element, insert that into V1 if we
17542   // can do so cheaply.
17543   int NumElts = VT.getVectorNumElements();
17544   int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
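  // E.g. (illustrative): for v8i32, Mask <8, 1, 2, 3, 4, 5, 6, 7> has
  // NumV2Elements == 1 with the V2 element at slot 0, which is the pattern the
  // element-insertion lowering below looks for.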
17545 
17546   if (NumV2Elements == 1 && Mask[0] >= NumElts)
17547     if (SDValue Insertion = lowerShuffleAsElementInsertion(
17548             DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
17549       return Insertion;
17550 
17551   // Handle special cases where the lower or upper half is UNDEF.
17552   if (SDValue V =
17553           lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
17554     return V;
17555 
17556   // There is a really nice hard cut-over between AVX1 and AVX2 that means we
17557   // can check for those subtargets here and avoid much of the subtarget
17558   // querying in the per-vector-type lowering routines. With AVX1 we have
17559   // essentially *zero* ability to manipulate a 256-bit vector with integer
17560   // types. Since we'll use floating point types there eventually, just
17561   // immediately cast everything to a float and operate entirely in that domain.
17562   if (VT.isInteger() && !Subtarget.hasAVX2()) {
17563     int ElementBits = VT.getScalarSizeInBits();
17564     if (ElementBits < 32) {
17565       // No floating point type available, if we can't use the bit operations
17566       // for masking/blending then decompose into 128-bit vectors.
17567       if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
17568                                             Subtarget, DAG))
17569         return V;
17570       if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
17571         return V;
17572       return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
17573     }
17574 
17575     MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
17576                                 VT.getVectorNumElements());
17577     V1 = DAG.getBitcast(FpVT, V1);
17578     V2 = DAG.getBitcast(FpVT, V2);
17579     return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
17580   }
17581 
17582   switch (VT.SimpleTy) {
17583   case MVT::v4f64:
17584     return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17585   case MVT::v4i64:
17586     return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17587   case MVT::v8f32:
17588     return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17589   case MVT::v8i32:
17590     return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17591   case MVT::v16i16:
17592     return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17593   case MVT::v32i8:
17594     return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17595 
17596   default:
17597     llvm_unreachable("Not a valid 256-bit x86 vector type!");
17598   }
17599 }
17600 
/// Try to lower a vector shuffle as 128-bit shuffles.
static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
                                  const APInt &Zeroable, SDValue V1, SDValue V2,
                                  const X86Subtarget &Subtarget,
                                  SelectionDAG &DAG) {
17606   assert(VT.getScalarSizeInBits() == 64 &&
17607          "Unexpected element type size for 128bit shuffle.");
17608 
17609   // Handling a 256-bit vector requires VLX, and lowerV2X128VectorShuffle()
17610   // is most probably the better solution for that case.
17611   assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
17612 
17613   // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
17614   SmallVector<int, 4> Widened128Mask;
17615   if (!canWidenShuffleElements(Mask, Widened128Mask))
17616     return SDValue();
17617   assert(Widened128Mask.size() == 4 && "Shuffle widening mismatch");
17618 
17619   // Try to use an insert into a zero vector.
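        // Zeroable has one bit per 64-bit element, so (Zeroable & 0xf0) covers
        // elements 4-7 (the upper 256 bits) and (Zeroable & 0x0c) covers
        // elements 2-3.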
17620   if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
17621       (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
17622     unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
17623     MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
17624     SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
17625                               DAG.getIntPtrConstant(0, DL));
17626     return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
17627                        getZeroVector(VT, Subtarget, DAG, DL), LoV,
17628                        DAG.getIntPtrConstant(0, DL));
17629   }
17630 
17631   // Check for patterns which can be matched with a single insert of a 256-bit
17632   // subvector.
17633   bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}, V1, V2);
17634   if (OnlyUsesV1 ||
17635       isShuffleEquivalent(Mask, {0, 1, 2, 3, 8, 9, 10, 11}, V1, V2)) {
17636     MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
17637     SDValue SubVec =
17638         DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
17639                     DAG.getIntPtrConstant(0, DL));
17640     return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
17641                        DAG.getIntPtrConstant(4, DL));
17642   }
17643 
17644   // See if this is an insertion of the lower 128-bits of V2 into V1.
17645   bool IsInsert = true;
17646   int V2Index = -1;
17647   for (int i = 0; i < 4; ++i) {
17648     assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
17649     if (Widened128Mask[i] < 0)
17650       continue;
17651 
17652     // Make sure all V1 subvectors are in place.
17653     if (Widened128Mask[i] < 4) {
17654       if (Widened128Mask[i] != i) {
17655         IsInsert = false;
17656         break;
17657       }
17658     } else {
17659       // Make sure we only have a single V2 index and it's the lowest 128 bits.
17660       if (V2Index >= 0 || Widened128Mask[i] != 4) {
17661         IsInsert = false;
17662         break;
17663       }
17664       V2Index = i;
17665     }
17666   }
17667   if (IsInsert && V2Index >= 0) {
17668     MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
17669     SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
17670                                  DAG.getIntPtrConstant(0, DL));
17671     return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
17672   }
17673 
17674   // See if we can widen to a 256-bit lane shuffle; we're going to lose 128-bit
17675   // lane UNDEF info by lowering to X86ISD::SHUF128 anyway, so by widening where
17676   // possible we at least ensure the lanes stay sequential to help later
17677   // combines.
17678   SmallVector<int, 2> Widened256Mask;
17679   if (canWidenShuffleElements(Widened128Mask, Widened256Mask)) {
17680     Widened128Mask.clear();
17681     narrowShuffleMaskElts(2, Widened256Mask, Widened128Mask);
17682   }
17683 
17684   // Try to lower to vshuf64x2/vshuf32x4.
17685   SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
17686   unsigned PermMask = 0;
17687   // Ensure elements come from the same Op.
17688   for (int i = 0; i < 4; ++i) {
17689     assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
17690     if (Widened128Mask[i] < 0)
17691       continue;
17692 
17693     SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;
17694     unsigned OpIndex = i / 2;
17695     if (Ops[OpIndex].isUndef())
17696       Ops[OpIndex] = Op;
17697     else if (Ops[OpIndex] != Op)
17698       return SDValue();
17699 
17700     // Convert the 128-bit shuffle mask selection values into 128-bit selection
17701     // bits defined by a vshuf64x2 instruction's immediate control byte.
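          // For example, the widened mask <0, 2, 4, 6> selects 128-bit lanes 0
          // and 2 of V1 into the low half and lanes 0 and 2 of V2 into the high
          // half, producing PermMask 0b10'00'10'00 (0x88).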
17702     PermMask |= (Widened128Mask[i] % 4) << (i * 2);
17703   }
17704 
17705   return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
17706                      DAG.getTargetConstant(PermMask, DL, MVT::i8));
17707 }
17708 
17709 /// Handle lowering of 8-lane 64-bit floating point shuffles.
17710 static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17711                                  const APInt &Zeroable, SDValue V1, SDValue V2,
17712                                  const X86Subtarget &Subtarget,
17713                                  SelectionDAG &DAG) {
17714   assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
17715   assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
17716   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17717 
17718   if (V2.isUndef()) {
17719     // Use low duplicate instructions for masks that match their pattern.
17720     if (isShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1, V2))
17721       return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
17722 
17723     if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
17724       // Non-half-crossing single input shuffles can be lowered with an
17725       // interleaved permutation.
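            // Each immediate bit i is set when result element i takes the odd
            // element of its 128-bit pair; e.g. the in-lane swap mask
            // <1, 0, 3, 2, 5, 4, 7, 6> produces the immediate 0b01010101 (0x55).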
17726       unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
17727                               ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
17728                               ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
17729                               ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
17730       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
17731                          DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
17732     }
17733 
17734     SmallVector<int, 4> RepeatedMask;
17735     if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
17736       return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
17737                          getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17738   }
17739 
17740   if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
17741                                            V2, Subtarget, DAG))
17742     return Shuf128;
17743 
17744   if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
17745     return Unpck;
17746 
17747   // Check if the blend happens to exactly fit the pattern of SHUFPD.
17748   if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask,
17749                                           Zeroable, Subtarget, DAG))
17750     return Op;
17751 
17752   if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, V2,
17753                                        DAG, Subtarget))
17754     return V;
17755 
17756   if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
17757                                           Zeroable, Subtarget, DAG))
17758     return Blend;
17759 
17760   return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, Subtarget, DAG);
17761 }
17762 
17763 /// Handle lowering of 16-lane 32-bit floating point shuffles.
17764 static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17765                                   const APInt &Zeroable, SDValue V1, SDValue V2,
17766                                   const X86Subtarget &Subtarget,
17767                                   SelectionDAG &DAG) {
17768   assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
17769   assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
17770   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
17771 
17772   // If the shuffle mask is repeated in each 128-bit lane, we have many more
17773   // options to efficiently lower the shuffle.
17774   SmallVector<int, 4> RepeatedMask;
17775   if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
17776     assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
17777 
17778     // Use even/odd duplicate instructions for masks that match their pattern.
17779     if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
17780       return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
17781     if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
17782       return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
17783 
17784     if (V2.isUndef())
17785       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
17786                          getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17787 
17788     // Use dedicated unpack instructions for masks that match their pattern.
17789     if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
17790       return V;
17791 
17792     if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
17793                                             Zeroable, Subtarget, DAG))
17794       return Blend;
17795 
17796     // Otherwise, fall back to a SHUFPS sequence.
17797     return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
17798   }
17799 
17800   // Try to create an in-lane repeating shuffle mask and then shuffle the
17801   // results into the target lanes.
17802   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17803           DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG))
17804     return V;
17805 
17806   // If we have a single-input shuffle with different shuffle patterns in the
17807   // 128-bit lanes that doesn't cross lanes, use a variable-mask VPERMILPS.
17808   if (V2.isUndef() &&
17809       !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
17810     SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
17811     return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
17812   }
17813 
17814   // If we have AVX512F support, we can use VEXPAND.
17815   if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask, V1,
17816                                        V2, DAG, Subtarget))
17817     return V;
17818 
17819   return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG);
17820 }
17821 
17822 /// Handle lowering of 8-lane 64-bit integer shuffles.
17823 static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17824                                  const APInt &Zeroable, SDValue V1, SDValue V2,
17825                                  const X86Subtarget &Subtarget,
17826                                  SelectionDAG &DAG) {
17827   assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
17828   assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
17829   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17830 
17831   if (V2.isUndef()) {
17832     // When the shuffle is mirrored between the 128-bit lanes, we can use
17833     // lower-latency instructions that will operate on all four
17834     // 128-bit lanes.
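          // For example, the v8i64 mask <1, 0, 3, 2, 5, 4, 7, 6> repeats <1, 0>
          // in every 128-bit lane; narrowed to 32-bit elements it becomes the
          // PSHUFD mask <2, 3, 0, 1> applied to the v16i32 bitcast of V1.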
17835     SmallVector<int, 2> Repeated128Mask;
17836     if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
17837       SmallVector<int, 4> PSHUFDMask;
17838       narrowShuffleMaskElts(2, Repeated128Mask, PSHUFDMask);
17839       return DAG.getBitcast(
17840           MVT::v8i64,
17841           DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
17842                       DAG.getBitcast(MVT::v16i32, V1),
17843                       getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
17844     }
17845 
17846     SmallVector<int, 4> Repeated256Mask;
17847     if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
17848       return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
17849                          getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
17850   }
17851 
17852   if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,
17853                                            V2, Subtarget, DAG))
17854     return Shuf128;
17855 
17856   // Try to use shift instructions.
17857   if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
17858                                           Zeroable, Subtarget, DAG))
17859     return Shift;
17860 
17861   // Try to use VALIGN.
17862   if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask,
17863                                             Subtarget, DAG))
17864     return Rotate;
17865 
17866   // Try to use PALIGNR.
17867   if (Subtarget.hasBWI())
17868     if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
17869                                                   Subtarget, DAG))
17870       return Rotate;
17871 
17872   if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
17873     return Unpck;
17874 
17875   // If we have AVX512F support, we can use VEXPAND.
17876   if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1, V2,
17877                                        DAG, Subtarget))
17878     return V;
17879 
17880   if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
17881                                           Zeroable, Subtarget, DAG))
17882     return Blend;
17883 
17884   return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, Subtarget, DAG);
17885 }
17886 
17887 /// Handle lowering of 16-lane 32-bit integer shuffles.
17888 static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17889                                   const APInt &Zeroable, SDValue V1, SDValue V2,
17890                                   const X86Subtarget &Subtarget,
17891                                   SelectionDAG &DAG) {
17892   assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
17893   assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
17894   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
17895 
17896   // Whenever we can lower this as a zext, that instruction is strictly faster
17897   // than any alternative. It also allows us to fold memory operands into the
17898   // shuffle in many cases.
17899   if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17900           DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
17901     return ZExt;
17902 
17903   // If the shuffle mask is repeated in each 128-bit lane we can use more
17904   // efficient instructions that mirror the shuffles across the four 128-bit
17905   // lanes.
17906   SmallVector<int, 4> RepeatedMask;
17907   bool Is128BitLaneRepeatedShuffle =
17908       is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
17909   if (Is128BitLaneRepeatedShuffle) {
17910     assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
17911     if (V2.isUndef())
17912       return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
17913                          getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17914 
17915     // Use dedicated unpack instructions for masks that match their pattern.
17916     if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
17917       return V;
17918   }
17919 
17920   // Try to use shift instructions.
17921   if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
17922                                           Zeroable, Subtarget, DAG))
17923     return Shift;
17924 
17925   // Try to use VALIGN.
17926   if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask,
17927                                             Subtarget, DAG))
17928     return Rotate;
17929 
17930   // Try to use byte rotation instructions.
17931   if (Subtarget.hasBWI())
17932     if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask,
17933                                                   Subtarget, DAG))
17934       return Rotate;
17935 
17936   // Assume that a single SHUFPS is faster than using a permv shuffle.
17937   // If some CPU is harmed by the domain switch, we can fix it in a later pass.
17938   if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
17939     SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
17940     SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
17941     SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
17942                                             CastV1, CastV2, DAG);
17943     return DAG.getBitcast(MVT::v16i32, ShufPS);
17944   }
17945 
17946   // Try to create an in-lane repeating shuffle mask and then shuffle the
17947   // results into the target lanes.
17948   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17949           DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
17950     return V;
17951 
17952   // If we have AVX512F support, we can use VEXPAND.
17953   if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, V1, V2,
17954                                        DAG, Subtarget))
17955     return V;
17956 
17957   if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
17958                                           Zeroable, Subtarget, DAG))
17959     return Blend;
17960 
17961   return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, Subtarget, DAG);
17962 }
17963 
17964 /// Handle lowering of 32-lane 16-bit integer shuffles.
17965 static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17966                                   const APInt &Zeroable, SDValue V1, SDValue V2,
17967                                   const X86Subtarget &Subtarget,
17968                                   SelectionDAG &DAG) {
17969   assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
17970   assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
17971   assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
17972   assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
17973 
17974   // Whenever we can lower this as a zext, that instruction is strictly faster
17975   // than any alternative. It also allows us to fold memory operands into the
17976   // shuffle in many cases.
17977   if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17978           DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
17979     return ZExt;
17980 
17981   // Use dedicated unpack instructions for masks that match their pattern.
17982   if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
17983     return V;
17984 
17985   // Use dedicated pack instructions for masks that match their pattern.
17986   if (SDValue V =
17987           lowerShuffleWithPACK(DL, MVT::v32i16, Mask, V1, V2, DAG, Subtarget))
17988     return V;
17989 
17990   // Try to use shift instructions.
17991   if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
17992                                           Zeroable, Subtarget, DAG))
17993     return Shift;
17994 
17995   // Try to use byte rotation instructions.
17996   if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask,
17997                                                 Subtarget, DAG))
17998     return Rotate;
17999 
18000   if (V2.isUndef()) {
18001     // Try to use bit rotation instructions.
18002     if (SDValue Rotate =
18003             lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG))
18004       return Rotate;
18005 
18006     SmallVector<int, 8> RepeatedMask;
18007     if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
18008       // As this is a single-input shuffle, the repeated mask should be
18009       // a strictly valid v8i16 mask that we can pass through to the v8i16
18010       // lowering to handle even the v32 case.
18011       return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1,
18012                                                  RepeatedMask, Subtarget, DAG);
18013     }
18014   }
18015 
18016   if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
18017                                           Zeroable, Subtarget, DAG))
18018     return Blend;
18019 
18020   if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
18021                                               Zeroable, Subtarget, DAG))
18022     return PSHUFB;
18023 
18024   return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG);
18025 }
18026 
18027 /// Handle lowering of 64-lane 8-bit integer shuffles.
18028 static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18029                                  const APInt &Zeroable, SDValue V1, SDValue V2,
18030                                  const X86Subtarget &Subtarget,
18031                                  SelectionDAG &DAG) {
18032   assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
18033   assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
18034   assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
18035   assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
18036 
18037   // Whenever we can lower this as a zext, that instruction is strictly faster
18038   // than any alternative. It also allows us to fold memory operands into the
18039   // shuffle in many cases.
18040   if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
18041           DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
18042     return ZExt;
18043 
18044   // Use dedicated unpack instructions for masks that match their pattern.
18045   if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
18046     return V;
18047 
18048   // Use dedicated pack instructions for masks that match their pattern.
18049   if (SDValue V = lowerShuffleWithPACK(DL, MVT::v64i8, Mask, V1, V2, DAG,
18050                                        Subtarget))
18051     return V;
18052 
18053   // Try to use shift instructions.
18054   if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
18055                                           Zeroable, Subtarget, DAG))
18056     return Shift;
18057 
18058   // Try to use byte rotation instructions.
18059   if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask,
18060                                                 Subtarget, DAG))
18061     return Rotate;
18062 
18063   // Try to use bit rotation instructions.
18064   if (V2.isUndef())
18065     if (SDValue Rotate =
18066             lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG))
18067       return Rotate;
18068 
18069   // Lower as AND if possible.
18070   if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v64i8, V1, V2, Mask,
18071                                              Zeroable, Subtarget, DAG))
18072     return Masked;
18073 
18074   if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
18075                                               Zeroable, Subtarget, DAG))
18076     return PSHUFB;
18077 
18078   // VBMI can use VPERMV/VPERMV3 byte shuffles.
18079   if (Subtarget.hasVBMI())
18080     return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
18081 
18082   // Try to create an in-lane repeating shuffle mask and then shuffle the
18083   // results into the target lanes.
18084   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18085           DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
18086     return V;
18087 
18088   if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
18089                                           Zeroable, Subtarget, DAG))
18090     return Blend;
18091 
18092   // Try to simplify this by merging 128-bit lanes to enable a lane-based
18093   // shuffle.
18094   if (!V2.isUndef())
18095     if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
18096             DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
18097       return Result;
18098 
18099   // FIXME: Implement direct support for this type!
18100   return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
18101 }
18102 
18103 /// High-level routine to lower various 512-bit x86 vector shuffles.
18104 ///
18105 /// This routine either breaks down the specific type of a 512-bit x86 vector
18106 /// shuffle or splits it into two 256-bit shuffles and fuses the results back
18107 /// together based on the available instructions.
18108 static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
18109                                   MVT VT, SDValue V1, SDValue V2,
18110                                   const APInt &Zeroable,
18111                                   const X86Subtarget &Subtarget,
18112                                   SelectionDAG &DAG) {
18113   assert(Subtarget.hasAVX512() &&
18114          "Cannot lower 512-bit vectors w/ basic ISA!");
18115 
18116   // If we have a single input to the zero element, insert that into V1 if we
18117   // can do so cheaply.
18118   int NumElts = Mask.size();
18119   int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
18120 
18121   if (NumV2Elements == 1 && Mask[0] >= NumElts)
18122     if (SDValue Insertion = lowerShuffleAsElementInsertion(
18123             DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
18124       return Insertion;
18125 
18126   // Handle special cases where the lower or upper half is UNDEF.
18127   if (SDValue V =
18128           lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
18129     return V;
18130 
18131   // Check for being able to broadcast a single element.
18132   if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
18133                                                   Subtarget, DAG))
18134     return Broadcast;
18135 
18136   if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
18137     // Try using bit ops for masking and blending before falling back to
18138     // splitting.
18139     if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
18140                                           Subtarget, DAG))
18141       return V;
18142     if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
18143       return V;
18144 
18145     return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
18146   }
18147 
18148   // Dispatch to each element type for lowering. If we don't have support for
18149   // specific element type shuffles at 512 bits, immediately split them and
18150   // lower them. Each lowering routine of a given type is allowed to assume that
18151   // the requisite ISA extensions for that element type are available.
18152   switch (VT.SimpleTy) {
18153   case MVT::v8f64:
18154     return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18155   case MVT::v16f32:
18156     return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18157   case MVT::v8i64:
18158     return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18159   case MVT::v16i32:
18160     return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18161   case MVT::v32i16:
18162     return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18163   case MVT::v64i8:
18164     return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18165 
18166   default:
18167     llvm_unreachable("Not a valid 512-bit x86 vector type!");
18168   }
18169 }
18170 
18171 static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef<int> Mask,
18172                                          MVT VT, SDValue V1, SDValue V2,
18173                                          const X86Subtarget &Subtarget,
18174                                          SelectionDAG &DAG) {
18175   // Shuffle should be unary.
18176   if (!V2.isUndef())
18177     return SDValue();
18178 
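        // Every defined mask element must read from its own position plus the
        // same positive offset; e.g. the v8i1 mask <2, 3, 4, 5, 6, 7, -1, -1>
        // matches a KSHIFTR with a shift amount of 2.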
18179   int ShiftAmt = -1;
18180   int NumElts = Mask.size();
18181   for (int i = 0; i != NumElts; ++i) {
18182     int M = Mask[i];
18183     assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) &&
18184            "Unexpected mask index.");
18185     if (M < 0)
18186       continue;
18187 
18188     // The first non-undef element determines our shift amount.
18189     if (ShiftAmt < 0) {
18190       ShiftAmt = M - i;
18191       // Need to be shifting right.
18192       if (ShiftAmt <= 0)
18193         return SDValue();
18194     }
18195     // All non-undef elements must shift by the same amount.
18196     if (ShiftAmt != M - i)
18197       return SDValue();
18198   }
18199   assert(ShiftAmt >= 0 && "All undef?");
18200 
18201   // Great, we found a shift right.
18202   MVT WideVT = VT;
18203   if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
18204     WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
18205   SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
18206                             DAG.getUNDEF(WideVT), V1,
18207                             DAG.getIntPtrConstant(0, DL));
18208   Res = DAG.getNode(X86ISD::KSHIFTR, DL, WideVT, Res,
18209                     DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
18210   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
18211                      DAG.getIntPtrConstant(0, DL));
18212 }
18213 
18214 // Determine if this shuffle can be implemented with a KSHIFT instruction.
18215 // Returns the shift amount if possible or -1 if not. This is a simplified
18216 // version of matchShuffleAsShift.
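      // For example, with MaskOffset 0 the v8i1 mask <z, z, 0, 1, 2, 3, 4, 5>
      // (low two elements zeroable) matches KSHIFTL by 2, while
      // <2, 3, 4, 5, 6, 7, z, z> (top two elements zeroable) matches KSHIFTR by 2.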
18217 static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
18218                                     int MaskOffset, const APInt &Zeroable) {
18219   int Size = Mask.size();
18220 
18221   auto CheckZeros = [&](int Shift, bool Left) {
18222     for (int j = 0; j < Shift; ++j)
18223       if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
18224         return false;
18225 
18226     return true;
18227   };
18228 
18229   auto MatchShift = [&](int Shift, bool Left) {
18230     unsigned Pos = Left ? Shift : 0;
18231     unsigned Low = Left ? 0 : Shift;
18232     unsigned Len = Size - Shift;
18233     return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset);
18234   };
18235 
18236   for (int Shift = 1; Shift != Size; ++Shift)
18237     for (bool Left : {true, false})
18238       if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {
18239         Opcode = Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR;
18240         return Shift;
18241       }
18242 
18243   return -1;
18244 }
18245 
18246 
18247 // Lower vXi1 vector shuffles.
18248 // There is no dedicated instruction on AVX-512 that shuffles the masks.
18249 // The only way to shuffle bits is to sign-extend the mask vector to a SIMD
18250 // vector, shuffle, and then truncate it back.
18251 static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
18252                                 MVT VT, SDValue V1, SDValue V2,
18253                                 const APInt &Zeroable,
18254                                 const X86Subtarget &Subtarget,
18255                                 SelectionDAG &DAG) {
18256   assert(Subtarget.hasAVX512() &&
18257          "Cannot lower 512-bit vectors w/o basic ISA!");
18258 
18259   int NumElts = Mask.size();
18260 
18261   // Try to recognize shuffles that are just padding a subvector with zeros.
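        // For example, the v8i1 mask <0, 1, 2, 3, 8, 9, 10, 11> with an
        // all-zeros V2 keeps only the low v4i1 of V1; we extract that subvector
        // and insert it into a zero vector.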
18262   int SubvecElts = 0;
18263   int Src = -1;
18264   for (int i = 0; i != NumElts; ++i) {
18265     if (Mask[i] >= 0) {
18266       // Grab the source from the first valid mask. All subsequent elements need
18267       // to use this same source.
18268       if (Src < 0)
18269         Src = Mask[i] / NumElts;
18270       if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
18271         break;
18272     }
18273 
18274     ++SubvecElts;
18275   }
18276   assert(SubvecElts != NumElts && "Identity shuffle?");
18277 
18278   // Clip to a power of 2.
18279   SubvecElts = PowerOf2Floor(SubvecElts);
18280 
18281   // Make sure the number of zeroable bits in the top at least covers the bits
18282   // not covered by the subvector.
18283   if ((int)Zeroable.countLeadingOnes() >= (NumElts - SubvecElts)) {
18284     assert(Src >= 0 && "Expected a source!");
18285     MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
18286     SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT,
18287                                   Src == 0 ? V1 : V2,
18288                                   DAG.getIntPtrConstant(0, DL));
18289     return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
18290                        DAG.getConstant(0, DL, VT),
18291                        Extract, DAG.getIntPtrConstant(0, DL));
18292   }
18293 
18294   // Try a simple shift right with undef elements. Later we'll try with zeros.
18295   if (SDValue Shift = lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget,
18296                                                 DAG))
18297     return Shift;
18298 
18299   // Try to match KSHIFTs.
18300   unsigned Offset = 0;
18301   for (SDValue V : { V1, V2 }) {
18302     unsigned Opcode;
18303     int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
18304     if (ShiftAmt >= 0) {
18305       MVT WideVT = VT;
18306       if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
18307         WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
18308       SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
18309                                 DAG.getUNDEF(WideVT), V,
18310                                 DAG.getIntPtrConstant(0, DL));
18311       // Widened right shifts need two shifts to ensure we shift in zeroes.
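            // For example, without DQI a v4i1 shifted right by 1 is widened to
            // v16i1: it is first shifted left by 12 to place the vector in the
            // top four bits, then shifted right by 13.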
18312       if (Opcode == X86ISD::KSHIFTR && WideVT != VT) {
18313         int WideElts = WideVT.getVectorNumElements();
18314         // Shift left to put the original vector in the MSBs of the new size.
18315         Res = DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res,
18316                           DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8));
18317         // Increase the shift amount to account for the left shift.
18318         ShiftAmt += WideElts - NumElts;
18319       }
18320 
18321       Res = DAG.getNode(Opcode, DL, WideVT, Res,
18322                         DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
18323       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
18324                          DAG.getIntPtrConstant(0, DL));
18325     }
18326     Offset += NumElts; // Increment for next iteration.
18327   }
18328 
18329 
18330 
18331   MVT ExtVT;
18332   switch (VT.SimpleTy) {
18333   default:
18334     llvm_unreachable("Expected a vector of i1 elements");
18335   case MVT::v2i1:
18336     ExtVT = MVT::v2i64;
18337     break;
18338   case MVT::v4i1:
18339     ExtVT = MVT::v4i32;
18340     break;
18341   case MVT::v8i1:
18342     // Take a 512-bit type; there are more shuffle options on KNL. If we have
18343     // VLX, use a 256-bit shuffle.
18344     ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
18345     break;
18346   case MVT::v16i1:
18347     // Take 512-bit type, unless we are avoiding 512-bit types and have the
18348     // 256-bit operation available.
18349     ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
18350     break;
18351   case MVT::v32i1:
18352     // Take 512-bit type, unless we are avoiding 512-bit types and have the
18353     // 256-bit operation available.
18354     assert(Subtarget.hasBWI() && "Expected AVX512BW support");
18355     ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
18356     break;
18357   case MVT::v64i1:
18358     // Fall back to scalarization. FIXME: We can do better if the shuffle
18359     // can be partitioned cleanly.
18360     if (!Subtarget.useBWIRegs())
18361       return SDValue();
18362     ExtVT = MVT::v64i8;
18363     break;
18364   }
18365 
18366   V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
18367   V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
18368 
18369   SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
18370   // Since i1 was sign-extended we can use X86ISD::CVT2MASK.
18371   int NumElems = VT.getVectorNumElements();
18372   if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
18373       (Subtarget.hasDQI() && (NumElems < 32)))
18374     return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
18375                        Shuffle, ISD::SETGT);
18376 
18377   return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
18378 }
18379 
18380 /// Helper function that returns true if the shuffle mask should be
18381 /// commuted to improve canonicalization.
18382 static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
18383   int NumElements = Mask.size();
18384 
18385   int NumV1Elements = 0, NumV2Elements = 0;
18386   for (int M : Mask)
18387     if (M < 0)
18388       continue;
18389     else if (M < NumElements)
18390       ++NumV1Elements;
18391     else
18392       ++NumV2Elements;
18393 
18394   // Commute the shuffle as needed such that more elements come from V1 than
18395   // V2. This allows us to match the shuffle pattern strictly on how many
18396   // elements come from V1 without handling the symmetric cases.
18397   if (NumV2Elements > NumV1Elements)
18398     return true;
18399 
18400   assert(NumV1Elements > 0 && "No V1 indices");
18401 
18402   if (NumV2Elements == 0)
18403     return false;
18404 
18405   // When the numbers of V1 and V2 elements are the same, try to minimize the
18406   // number of uses of V2 in the low half of the vector. When that is tied,
18407   // ensure that the sum of indices for V1 is equal to or lower than the sum of
18408   // indices for V2. When those are equal, try to ensure that the number of odd
18409   // indices for V1 is lower than the number of odd indices for V2.
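        // For example, the four-element mask <4, 5, 2, 3> uses two elements from
        // each input, but both low-half elements come from V2, so we return true
        // and the shuffle is commuted to <0, 1, 6, 7> with the operands swapped.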
18410   if (NumV1Elements == NumV2Elements) {
18411     int LowV1Elements = 0, LowV2Elements = 0;
18412     for (int M : Mask.slice(0, NumElements / 2))
18413       if (M >= NumElements)
18414         ++LowV2Elements;
18415       else if (M >= 0)
18416         ++LowV1Elements;
18417     if (LowV2Elements > LowV1Elements)
18418       return true;
18419     if (LowV2Elements == LowV1Elements) {
18420       int SumV1Indices = 0, SumV2Indices = 0;
18421       for (int i = 0, Size = Mask.size(); i < Size; ++i)
18422         if (Mask[i] >= NumElements)
18423           SumV2Indices += i;
18424         else if (Mask[i] >= 0)
18425           SumV1Indices += i;
18426       if (SumV2Indices < SumV1Indices)
18427         return true;
18428       if (SumV2Indices == SumV1Indices) {
18429         int NumV1OddIndices = 0, NumV2OddIndices = 0;
18430         for (int i = 0, Size = Mask.size(); i < Size; ++i)
18431           if (Mask[i] >= NumElements)
18432             NumV2OddIndices += i % 2;
18433           else if (Mask[i] >= 0)
18434             NumV1OddIndices += i % 2;
18435         if (NumV2OddIndices < NumV1OddIndices)
18436           return true;
18437       }
18438     }
18439   }
18440 
18441   return false;
18442 }
18443 
18444 /// Top-level lowering for x86 vector shuffles.
18445 ///
18446 /// This handles decomposition, canonicalization, and lowering of all x86
18447 /// vector shuffles. Most of the specific lowering strategies are encapsulated
18448 /// above in helper routines. The canonicalization attempts to widen shuffles
18449 /// to involve fewer lanes of wider elements, consolidate symmetric patterns
18450 /// s.t. only one of the two inputs needs to be tested, etc.
18451 static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
18452                                    SelectionDAG &DAG) {
18453   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
18454   ArrayRef<int> OrigMask = SVOp->getMask();
18455   SDValue V1 = Op.getOperand(0);
18456   SDValue V2 = Op.getOperand(1);
18457   MVT VT = Op.getSimpleValueType();
18458   int NumElements = VT.getVectorNumElements();
18459   SDLoc DL(Op);
18460   bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
18461 
18462   assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
18463          "Can't lower MMX shuffles");
18464 
18465   bool V1IsUndef = V1.isUndef();
18466   bool V2IsUndef = V2.isUndef();
18467   if (V1IsUndef && V2IsUndef)
18468     return DAG.getUNDEF(VT);
18469 
18470   // When we create a shuffle node we put the UNDEF node in the second operand,
18471   // but in some cases the first operand may be transformed to UNDEF.
18472   // In that case we should just commute the node.
18473   if (V1IsUndef)
18474     return DAG.getCommutedVectorShuffle(*SVOp);
18475 
18476   // Check for non-undef masks pointing at an undef vector and make the masks
18477   // undef as well. This makes it easier to match the shuffle based solely on
18478   // the mask.
18479   if (V2IsUndef &&
18480       any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
18481     SmallVector<int, 8> NewMask(OrigMask.begin(), OrigMask.end());
18482     for (int &M : NewMask)
18483       if (M >= NumElements)
18484         M = -1;
18485     return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
18486   }
18487 
18488   // Check for illegal shuffle mask element index values.
18489   int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
18490   (void)MaskUpperLimit;
18491   assert(llvm::all_of(OrigMask,
18492                       [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
18493          "Out of bounds shuffle index");
18494 
18495   // We actually see shuffles that are entirely re-arrangements of a set of
18496   // zero inputs. This mostly happens while decomposing complex shuffles into
18497   // simple ones. Directly lower these as a buildvector of zeros.
18498   APInt KnownUndef, KnownZero;
18499   computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);
18500 
18501   APInt Zeroable = KnownUndef | KnownZero;
18502   if (Zeroable.isAllOnesValue())
18503     return getZeroVector(VT, Subtarget, DAG, DL);
18504 
18505   bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
18506 
18507   // Try to collapse shuffles into using a vector type with fewer elements but
18508   // wider element types. We cap this to not form integers or floating point
18509   // elements wider than 64 bits. It does not seem beneficial to form i128
18510   // integers to handle flipping the low and high halves of AVX 256-bit vectors.
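        // For example, a v4i32 shuffle with mask <0, 1, 6, 7> can be widened to
        // a v2i64 shuffle with mask <0, 3>, since each pair of adjacent elements
        // moves together.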
18511   SmallVector<int, 16> WidenedMask;
18512   if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
18513       canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
18514     // Shuffle mask widening should not interfere with a broadcast opportunity
18515     // by obfuscating the operands with bitcasts.
18516     // TODO: Avoid lowering directly from this top-level function: make this
18517     // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
18518     if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
18519                                                     Subtarget, DAG))
18520       return Broadcast;
18521 
18522     MVT NewEltVT = VT.isFloatingPoint()
18523                        ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
18524                        : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
18525     int NewNumElts = NumElements / 2;
18526     MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
18527     // Make sure that the new vector type is legal. For example, v2f64 isn't
18528     // legal on SSE1.
18529     if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
18530       if (V2IsZero) {
18531         // Modify the new Mask to take all zeros from the all-zero vector.
18532         // Choose indices that are blend-friendly.
18533         bool UsedZeroVector = false;
18534         assert(is_contained(WidenedMask, SM_SentinelZero) &&
18535                "V2's non-undef elements are used?!");
18536         for (int i = 0; i != NewNumElts; ++i)
18537           if (WidenedMask[i] == SM_SentinelZero) {
18538             WidenedMask[i] = i + NewNumElts;
18539             UsedZeroVector = true;
18540           }
18541         // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
18542         // some elements to be undef.
18543         if (UsedZeroVector)
18544           V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
18545       }
18546       V1 = DAG.getBitcast(NewVT, V1);
18547       V2 = DAG.getBitcast(NewVT, V2);
18548       return DAG.getBitcast(
18549           VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
18550     }
18551   }
18552 
18553   // Commute the shuffle if it will improve canonicalization.
18554   SmallVector<int, 64> Mask(OrigMask.begin(), OrigMask.end());
18555   if (canonicalizeShuffleMaskWithCommute(Mask)) {
18556     ShuffleVectorSDNode::commuteMask(Mask);
18557     std::swap(V1, V2);
18558   }
18559 
18560   // For each vector width, delegate to a specialized lowering routine.
18561   if (VT.is128BitVector())
18562     return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18563 
18564   if (VT.is256BitVector())
18565     return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18566 
18567   if (VT.is512BitVector())
18568     return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18569 
18570   if (Is1BitVector)
18571     return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18572 
18573   llvm_unreachable("Unimplemented!");
18574 }
18575 
18576 /// Try to lower a VSELECT instruction to a vector shuffle.
18577 static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
18578                                            const X86Subtarget &Subtarget,
18579                                            SelectionDAG &DAG) {
18580   SDValue Cond = Op.getOperand(0);
18581   SDValue LHS = Op.getOperand(1);
18582   SDValue RHS = Op.getOperand(2);
18583   MVT VT = Op.getSimpleValueType();
18584 
18585   // Only non-legal VSELECTs reach this lowering; convert those into generic
18586   // shuffles and re-use the shuffle lowering path for blends.
18587   if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
18588     SmallVector<int, 32> Mask;
18589     if (createShuffleMaskFromVSELECT(Mask, Cond))
18590       return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask);
18591   }
18592 
18593   return SDValue();
18594 }
18595 
18596 SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
18597   SDValue Cond = Op.getOperand(0);
18598   SDValue LHS = Op.getOperand(1);
18599   SDValue RHS = Op.getOperand(2);
18600 
18601   // A vselect where all conditions and data are constants can be optimized into
18602   // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
18603   if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
18604       ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
18605       ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
18606     return SDValue();
18607 
18608   // Try to lower this to a blend-style vector shuffle. This can handle all
18609   // constant condition cases.
18610   if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
18611     return BlendOp;
18612 
18613   // If this VSELECT has a vector of i1 as a mask, it will be directly matched
18614   // with patterns on the mask registers on AVX-512.
18615   MVT CondVT = Cond.getSimpleValueType();
18616   unsigned CondEltSize = Cond.getScalarValueSizeInBits();
18617   if (CondEltSize == 1)
18618     return Op;
18619 
18620   // Variable blends are only legal from SSE4.1 onward.
18621   if (!Subtarget.hasSSE41())
18622     return SDValue();
18623 
18624   SDLoc dl(Op);
18625   MVT VT = Op.getSimpleValueType();
18626   unsigned EltSize = VT.getScalarSizeInBits();
18627   unsigned NumElts = VT.getVectorNumElements();
18628 
18629   // Expand v32i16/v64i8 without BWI.
18630   if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
18631     return SDValue();
18632 
18633   // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
18634   // into an i1 condition so that we can use the mask-based 512-bit blend
18635   // instructions.
18636   if (VT.getSizeInBits() == 512) {
18637     // Build a mask by testing the condition against zero.
18638     MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
18639     SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
18640                                 DAG.getConstant(0, dl, CondVT),
18641                                 ISD::SETNE);
18642     // Now return a new VSELECT using the mask.
18643     return DAG.getSelect(dl, VT, Mask, LHS, RHS);
18644   }
18645 
18646   // SEXT/TRUNC cases where the mask doesn't match the destination size.
18647   if (CondEltSize != EltSize) {
18648     // If we don't have a sign splat, rely on the expansion.
18649     if (CondEltSize != DAG.ComputeNumSignBits(Cond))
18650       return SDValue();
18651 
18652     MVT NewCondSVT = MVT::getIntegerVT(EltSize);
18653     MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts);
18654     Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
18655     return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
18656   }
18657 
18658   // Only some types will be legal on some subtargets. If we can emit a legal
18659   // VSELECT-matching blend, return Op; but if we need to expand, return
18660   // a null value.
18661   switch (VT.SimpleTy) {
18662   default:
18663     // Most of the vector types have blends past SSE4.1.
18664     return Op;
18665 
18666   case MVT::v32i8:
18667     // The byte blends for AVX vectors were introduced only in AVX2.
18668     if (Subtarget.hasAVX2())
18669       return Op;
18670 
18671     return SDValue();
18672 
18673   case MVT::v8i16:
18674   case MVT::v16i16: {
18675     // Bitcast everything to the vXi8 type and use a vXi8 vselect.
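          // The condition is expected to already be an all-zeros/all-ones value
          // per 16-bit element (x86 uses zero/negative-one vector booleans), so
          // both bytes of each element carry the same select bit after the
          // bitcast.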
18676     MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
18677     Cond = DAG.getBitcast(CastVT, Cond);
18678     LHS = DAG.getBitcast(CastVT, LHS);
18679     RHS = DAG.getBitcast(CastVT, RHS);
18680     SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
18681     return DAG.getBitcast(VT, Select);
18682   }
18683   }
18684 }
18685 
18686 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
18687   MVT VT = Op.getSimpleValueType();
18688   SDValue Vec = Op.getOperand(0);
18689   SDValue Idx = Op.getOperand(1);
18690   assert(isa<ConstantSDNode>(Idx) && "Constant index expected");
18691   SDLoc dl(Op);
18692 
18693   if (!Vec.getSimpleValueType().is128BitVector())
18694     return SDValue();
18695 
18696   if (VT.getSizeInBits() == 8) {
18697     // If IdxVal is 0, it's cheaper to do a move instead of a pextrb, unless
18698     // we're going to zero extend the register or fold the store.
18699     if (llvm::isNullConstant(Idx) && !MayFoldIntoZeroExtend(Op) &&
18700         !MayFoldIntoStore(Op))
18701       return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
18702                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18703                                      DAG.getBitcast(MVT::v4i32, Vec), Idx));
18704 
18705     unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
18706     SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec,
18707                                   DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18708     return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
18709   }
18710 
18711   if (VT == MVT::f32) {
18712     // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
18713     // the result back to FR32 register. It's only worth matching if the
18714     // result has a single use which is a store or a bitcast to i32.  And in
18715     // the case of a store, it's not worth it if the index is a constant 0,
18716     // because a MOVSSmr can be used instead, which is smaller and faster.
18717     if (!Op.hasOneUse())
18718       return SDValue();
18719     SDNode *User = *Op.getNode()->use_begin();
18720     if ((User->getOpcode() != ISD::STORE || isNullConstant(Idx)) &&
18721         (User->getOpcode() != ISD::BITCAST ||
18722          User->getValueType(0) != MVT::i32))
18723       return SDValue();
18724     SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18725                                   DAG.getBitcast(MVT::v4i32, Vec), Idx);
18726     return DAG.getBitcast(MVT::f32, Extract);
18727   }
18728 
18729   if (VT == MVT::i32 || VT == MVT::i64)
18730       return Op;
18731 
18732   return SDValue();
18733 }
18734 
18735 /// Extract one bit from a mask vector, like v16i1 or v8i1.
18736 /// This is an AVX-512 feature.
18737 static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
18738                                         const X86Subtarget &Subtarget) {
18739   SDValue Vec = Op.getOperand(0);
18740   SDLoc dl(Vec);
18741   MVT VecVT = Vec.getSimpleValueType();
18742   SDValue Idx = Op.getOperand(1);
18743   auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
18744   MVT EltVT = Op.getSimpleValueType();
18745 
18746   assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
18747          "Unexpected vector type in ExtractBitFromMaskVector");
18748 
18749   // A variable index can't be handled in mask registers,
18750   // so extend the vector to VR512/VR128.
18751   if (!IdxC) {
18752     unsigned NumElts = VecVT.getVectorNumElements();
18753     // Extending v8i1/v16i1 to 512 bits gets better performance on KNL
18754     // than extending to 128/256 bits.
18755     MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
18756     MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
18757     SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
18758     SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
18759     return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
18760   }
18761 
18762   unsigned IdxVal = IdxC->getZExtValue();
18763   if (IdxVal == 0) // the operation is legal
18764     return Op;
18765 
18766   // Extend to natively supported kshift.
18767   unsigned NumElems = VecVT.getVectorNumElements();
18768   MVT WideVecVT = VecVT;
18769   if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
18770     WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
18771     Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
18772                       DAG.getUNDEF(WideVecVT), Vec,
18773                       DAG.getIntPtrConstant(0, dl));
18774   }
18775 
18776   // Use kshiftr instruction to move to the lower element.
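        // For example, extracting element 5 of a v16i1 shifts the mask right by
        // 5 and then extracts element 0.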
18777   Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
18778                     DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18779 
18780   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
18781                      DAG.getIntPtrConstant(0, dl));
18782 }
18783 
18784 SDValue
18785 X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
18786                                            SelectionDAG &DAG) const {
18787   SDLoc dl(Op);
18788   SDValue Vec = Op.getOperand(0);
18789   MVT VecVT = Vec.getSimpleValueType();
18790   SDValue Idx = Op.getOperand(1);
18791   auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
18792 
18793   if (VecVT.getVectorElementType() == MVT::i1)
18794     return ExtractBitFromMaskVector(Op, DAG, Subtarget);
18795 
18796   if (!IdxC) {
18797     // It's more profitable to go through memory (1 cycle throughput)
18798     // than to use a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
18799     // The IACA tool was used to get the performance estimates
18800     // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer).
18801     //
18802     // example : extractelement <16 x i8> %a, i32 %i
18803     //
18804     // Block Throughput: 3.00 Cycles
18805     // Throughput Bottleneck: Port5
18806     //
18807     // | Num Of |   Ports pressure in cycles  |    |
18808     // |  Uops  |  0  - DV  |  5  |  6  |  7  |    |
18809     // ---------------------------------------------
18810     // |   1    |           | 1.0 |     |     | CP | vmovd xmm1, edi
18811     // |   1    |           | 1.0 |     |     | CP | vpshufb xmm0, xmm0, xmm1
18812     // |   2    | 1.0       | 1.0 |     |     | CP | vpextrb eax, xmm0, 0x0
18813     // Total Num Of Uops: 4
18814     //
18815     //
18816     // Block Throughput: 1.00 Cycles
18817     // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
18818     //
18819     // |    |  Ports pressure in cycles   |  |
18820     // |Uops| 1 | 2 - D  |3 -  D  | 4 | 5 |  |
18821     // ---------------------------------------------------------
18822     // |2^  |   | 0.5    | 0.5    |1.0|   |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
18823     // |1   |0.5|        |        |   |0.5|  | lea rax, ptr [rsp-0x18]
18824     // |1   |   |0.5, 0.5|0.5, 0.5|   |   |CP| mov al, byte ptr [rdi+rax*1]
18825     // Total Num Of Uops: 4
18826 
18827     return SDValue();
18828   }
18829 
18830   unsigned IdxVal = IdxC->getZExtValue();
18831 
18832   // If this is a 256-bit or 512-bit vector result, first extract the 128-bit
18833   // subvector and then extract the element from that subvector.
18834   if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
18835     // Get the 128-bit vector.
18836     Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
18837     MVT EltVT = VecVT.getVectorElementType();
18838 
18839     unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
18840     assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
18841 
18842     // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
18843     // this can be done with a mask.
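    // Worked example (illustrative): element 5 of a v8i32 lives in the upper
    // 128-bit half, and 5 & (4 - 1) == 1 selects lane 1 of that half.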
18844     IdxVal &= ElemsPerChunk - 1;
18845     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
18846                        DAG.getIntPtrConstant(IdxVal, dl));
18847   }
18848 
18849   assert(VecVT.is128BitVector() && "Unexpected vector length");
18850 
18851   MVT VT = Op.getSimpleValueType();
18852 
18853   if (VT.getSizeInBits() == 16) {
18854     // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
18855     // we're going to zero extend the register or fold the store (SSE41 only).
18856     if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) &&
18857         !(Subtarget.hasSSE41() && MayFoldIntoStore(Op)))
18858       return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
18859                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18860                                      DAG.getBitcast(MVT::v4i32, Vec), Idx));
18861 
18862     SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec,
18863                                   DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18864     return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
18865   }
18866 
18867   if (Subtarget.hasSSE41())
18868     if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
18869       return Res;
18870 
18871   // TODO: We only extract a single element from v16i8; we can probably afford
18872   // to be more aggressive here before falling back to the default approach of
18873   // spilling to the stack.
18874   if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
18875     // Extract either the lowest i32 or any i16, and extract the sub-byte.
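    // Worked example (illustrative): byte 5 of a v16i8 maps to i16 word 2 with
    // an 8-bit right shift, while byte 2 maps to the low i32 dword with a
    // 16-bit right shift.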
18876     int DWordIdx = IdxVal / 4;
18877     if (DWordIdx == 0) {
18878       SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18879                                 DAG.getBitcast(MVT::v4i32, Vec),
18880                                 DAG.getIntPtrConstant(DWordIdx, dl));
18881       int ShiftVal = (IdxVal % 4) * 8;
18882       if (ShiftVal != 0)
18883         Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
18884                           DAG.getConstant(ShiftVal, dl, MVT::i8));
18885       return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
18886     }
18887 
18888     int WordIdx = IdxVal / 2;
18889     SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
18890                               DAG.getBitcast(MVT::v8i16, Vec),
18891                               DAG.getIntPtrConstant(WordIdx, dl));
18892     int ShiftVal = (IdxVal % 2) * 8;
18893     if (ShiftVal != 0)
18894       Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
18895                         DAG.getConstant(ShiftVal, dl, MVT::i8));
18896     return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
18897   }
18898 
18899   if (VT.getSizeInBits() == 32) {
18900     if (IdxVal == 0)
18901       return Op;
18902 
18903     // SHUFPS the element to the lowest double word, then movss.
18904     int Mask[4] = { static_cast<int>(IdxVal), -1, -1, -1 };
18905     Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
18906     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
18907                        DAG.getIntPtrConstant(0, dl));
18908   }
18909 
18910   if (VT.getSizeInBits() == 64) {
18911     // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
18912     // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
18913     //        to match extract_elt for f64.
18914     if (IdxVal == 0)
18915       return Op;
18916 
18917     // UNPCKHPD the element to the lowest double word, then movsd.
18918     // Note that if the lower 64 bits of the UNPCKHPD result are then stored
18919     // to an f64mem, the whole operation is folded into a single MOVHPDmr.
18920     int Mask[2] = { 1, -1 };
18921     Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
18922     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
18923                        DAG.getIntPtrConstant(0, dl));
18924   }
18925 
18926   return SDValue();
18927 }
18928 
18929 /// Insert one bit into a mask vector, like v16i1 or v8i1.
18930 /// AVX-512 feature.
18931 static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
18932                                      const X86Subtarget &Subtarget) {
18933   SDLoc dl(Op);
18934   SDValue Vec = Op.getOperand(0);
18935   SDValue Elt = Op.getOperand(1);
18936   SDValue Idx = Op.getOperand(2);
18937   MVT VecVT = Vec.getSimpleValueType();
18938 
18939   if (!isa<ConstantSDNode>(Idx)) {
18940     // Non-constant index. Extend the source and destination,
18941     // insert the element, and then truncate the result.
18942     unsigned NumElts = VecVT.getVectorNumElements();
18943     MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
18944     MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
18945     SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
18946       DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
18947       DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
18948     return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
18949   }
18950 
18951   // Copy into a k-register, extract to v1i1 and insert_subvector.
18952   SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
18953   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, Idx);
18954 }
18955 
18956 SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
18957                                                   SelectionDAG &DAG) const {
18958   MVT VT = Op.getSimpleValueType();
18959   MVT EltVT = VT.getVectorElementType();
18960   unsigned NumElts = VT.getVectorNumElements();
18961   unsigned EltSizeInBits = EltVT.getScalarSizeInBits();
18962 
18963   if (EltVT == MVT::i1)
18964     return InsertBitToMaskVector(Op, DAG, Subtarget);
18965 
18966   SDLoc dl(Op);
18967   SDValue N0 = Op.getOperand(0);
18968   SDValue N1 = Op.getOperand(1);
18969   SDValue N2 = Op.getOperand(2);
18970   auto *N2C = dyn_cast<ConstantSDNode>(N2);
18971 
18972   if (!N2C) {
18973     // Variable insertion indices, usually we're better off spilling to stack,
18974     // but AVX512 can use a variable compare+select by comparing against all
18975     // possible vector indices, and FP insertion has less gpr->simd traffic.
18976     if (!(Subtarget.hasBWI() ||
18977           (Subtarget.hasAVX512() && EltSizeInBits >= 32) ||
18978           (Subtarget.hasSSE41() && VT.isFloatingPoint())))
18979       return SDValue();
18980 
18981     MVT IdxSVT = MVT::getIntegerVT(EltSizeInBits);
18982     MVT IdxVT = MVT::getVectorVT(IdxSVT, NumElts);
18983     if (!isTypeLegal(IdxSVT) || !isTypeLegal(IdxVT))
18984       return SDValue();
18985 
18986     SDValue IdxExt = DAG.getZExtOrTrunc(N2, dl, IdxSVT);
18987     SDValue IdxSplat = DAG.getSplatBuildVector(IdxVT, dl, IdxExt);
18988     SDValue EltSplat = DAG.getSplatBuildVector(VT, dl, N1);
18989 
18990     SmallVector<SDValue, 16> RawIndices;
18991     for (unsigned I = 0; I != NumElts; ++I)
18992       RawIndices.push_back(DAG.getConstant(I, dl, IdxSVT));
18993     SDValue Indices = DAG.getBuildVector(IdxVT, dl, RawIndices);
18994 
18995     // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
18996     return DAG.getSelectCC(dl, IdxSplat, Indices, EltSplat, N0,
18997                            ISD::CondCode::SETEQ);
18998   }
18999 
19000   if (N2C->getAPIntValue().uge(NumElts))
19001     return SDValue();
19002   uint64_t IdxVal = N2C->getZExtValue();
19003 
19004   bool IsZeroElt = X86::isZeroNode(N1);
19005   bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
19006 
19007   // If we are inserting an element, see if we can do this more efficiently
19008   // with a blend shuffle against a rematerializable vector than with a costly
19009   // integer insertion.
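  // Worked example (illustrative): inserting zero into element 2 of a v4i32
  // becomes a shuffle of N0 with a zero vector using mask <0, 1, 6, 3>.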
19010   if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() &&
19011       (16 <= EltSizeInBits || (IsZeroElt && !VT.is128BitVector()))) {
19012     SmallVector<int, 8> BlendMask;
19013     for (unsigned i = 0; i != NumElts; ++i)
19014       BlendMask.push_back(i == IdxVal ? i + NumElts : i);
19015     SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
19016                                   : getOnesVector(VT, DAG, dl);
19017     return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
19018   }
19019 
19020   // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
19021   // into that, and then insert the subvector back into the result.
19022   if (VT.is256BitVector() || VT.is512BitVector()) {
19023     // With a 256-bit vector, we can insert into the zero element efficiently
19024     // using a blend if we have AVX or AVX2 and the right data type.
19025     if (VT.is256BitVector() && IdxVal == 0) {
19026       // TODO: It is worthwhile to cast integer to floating point and back
19027       // and incur a domain crossing penalty if that's what we'll end up
19028       // doing anyway after extracting to a 128-bit vector.
19029       if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
19030           (Subtarget.hasAVX2() && EltVT == MVT::i32)) {
19031         SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
19032         return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
19033                            DAG.getTargetConstant(1, dl, MVT::i8));
19034       }
19035     }
19036 
19037     // Get the desired 128-bit vector chunk.
19038     SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
19039 
19040     // Insert the element into the desired chunk.
19041     unsigned NumEltsIn128 = 128 / EltSizeInBits;
19042     assert(isPowerOf2_32(NumEltsIn128));
19043     // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
19044     unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
19045 
19046     V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
19047                     DAG.getIntPtrConstant(IdxIn128, dl));
19048 
19049     // Insert the changed part back into the bigger vector
19050     return insert128BitVector(N0, V, IdxVal, DAG, dl);
19051   }
19052   assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
19053 
19054   // This will be just movd/movq/movss/movsd.
19055   if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) {
19056     if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
19057         EltVT == MVT::i64) {
19058       N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
19059       return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
19060     }
19061 
19062     // We can't directly insert an i8 or i16 into a vector, so zero extend
19063     // it to i32 first.
19064     if (EltVT == MVT::i16 || EltVT == MVT::i8) {
19065       N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, N1);
19066       MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
19067       N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, N1);
19068       N1 = getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
19069       return DAG.getBitcast(VT, N1);
19070     }
19071   }
19072 
19073   // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
19074   // argument. SSE41 is required for pinsrb.
19075   if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
19076     unsigned Opc;
19077     if (VT == MVT::v8i16) {
19078       assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
19079       Opc = X86ISD::PINSRW;
19080     } else {
19081       assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
19082       assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
19083       Opc = X86ISD::PINSRB;
19084     }
19085 
19086     assert(N1.getValueType() != MVT::i32 && "Unexpected VT");
19087     N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
19088     N2 = DAG.getTargetConstant(IdxVal, dl, MVT::i8);
19089     return DAG.getNode(Opc, dl, VT, N0, N1, N2);
19090   }
19091 
19092   if (Subtarget.hasSSE41()) {
19093     if (EltVT == MVT::f32) {
19094       // Bits [7:6] of the constant are the source select. This will always be
19095       //   zero here. The DAG Combiner may combine an extract_elt index into
19096       //   these bits. For example (insert (extract, 3), 2) could be matched by
19097       //   putting the '3' into bits [7:6] of X86ISD::INSERTPS.
19098       // Bits [5:4] of the constant are the destination select. This is the
19099       //   value of the incoming immediate.
19100       // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
19101       //   combine either bitwise AND or insert of float 0.0 to set these bits.
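      // Worked example (illustrative): inserting into destination element 2
      // uses IdxVal << 4 == 0x20, i.e. bits [5:4] select the destination lane.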
19102 
19103       bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
19104       if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
19105         // If this is an insertion of 32-bits into the low 32-bits of
19106         // a vector, we prefer to generate a blend with immediate rather
19107         // than an insertps. Blends are simpler operations in hardware and so
19108         // will always have equal or better performance than insertps.
19109         // But if optimizing for size and there's a load folding opportunity,
19110         // generate insertps because blendps does not have a 32-bit memory
19111         // operand form.
19112         N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
19113         return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1,
19114                            DAG.getTargetConstant(1, dl, MVT::i8));
19115       }
19116       // Create this as a scalar to vector.
19117       N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
19118       return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
19119                          DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
19120     }
19121 
19122     // PINSR* works with constant index.
19123     if (EltVT == MVT::i32 || EltVT == MVT::i64)
19124       return Op;
19125   }
19126 
19127   return SDValue();
19128 }
19129 
19130 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
19131                                      SelectionDAG &DAG) {
19132   SDLoc dl(Op);
19133   MVT OpVT = Op.getSimpleValueType();
19134 
19135   // It's always cheaper to replace an xor+movd with xorps, and it simplifies
19136   // further combines.
19137   if (X86::isZeroNode(Op.getOperand(0)))
19138     return getZeroVector(OpVT, Subtarget, DAG, dl);
19139 
19140   // If this is a wider-than-128-bit vector result, first insert into a 128-bit
19141   // vector and then insert that into the full-width vector.
19142   if (!OpVT.is128BitVector()) {
19143     // Insert into a 128-bit vector.
19144     unsigned SizeFactor = OpVT.getSizeInBits() / 128;
19145     MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
19146                                  OpVT.getVectorNumElements() / SizeFactor);
19147 
19148     Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
19149 
19150     // Insert the 128-bit vector.
19151     return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
19152   }
19153   assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&
19154          "Expected an SSE type!");
19155 
19156   // Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen.
19157   if (OpVT == MVT::v4i32)
19158     return Op;
19159 
19160   SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
19161   return DAG.getBitcast(
19162       OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
19163 }
19164 
19165 // Lower a node with an INSERT_SUBVECTOR opcode.  This may result in a
19166 // simple superregister reference or explicit instructions to insert
19167 // the upper bits of a vector.
19168 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
19169                                      SelectionDAG &DAG) {
19170   assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
19171 
19172   return insert1BitVector(Op, DAG, Subtarget);
19173 }
19174 
19175 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
19176                                       SelectionDAG &DAG) {
19177   assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
19178          "Only vXi1 extract_subvectors need custom lowering");
19179 
19180   SDLoc dl(Op);
19181   SDValue Vec = Op.getOperand(0);
19182   uint64_t IdxVal = Op.getConstantOperandVal(1);
19183 
19184   if (IdxVal == 0) // the operation is legal
19185     return Op;
19186 
19187   MVT VecVT = Vec.getSimpleValueType();
19188   unsigned NumElems = VecVT.getVectorNumElements();
19189 
19190   // Extend to natively supported kshift.
19191   MVT WideVecVT = VecVT;
19192   if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
19193     WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
19194     Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
19195                       DAG.getUNDEF(WideVecVT), Vec,
19196                       DAG.getIntPtrConstant(0, dl));
19197   }
19198 
19199   // Shift to the LSB.
19200   Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
19201                     DAG.getTargetConstant(IdxVal, dl, MVT::i8));
19202 
19203   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
19204                      DAG.getIntPtrConstant(0, dl));
19205 }
19206 
19207 // Returns the appropriate wrapper opcode for a global reference.
19208 unsigned X86TargetLowering::getGlobalWrapperKind(
19209     const GlobalValue *GV, const unsigned char OpFlags) const {
19210   // References to absolute symbols are never PC-relative.
19211   if (GV && GV->isAbsoluteSymbolRef())
19212     return X86ISD::Wrapper;
19213 
19214   CodeModel::Model M = getTargetMachine().getCodeModel();
19215   if (Subtarget.isPICStyleRIPRel() &&
19216       (M == CodeModel::Small || M == CodeModel::Kernel))
19217     return X86ISD::WrapperRIP;
19218 
19219   // GOTPCREL references must always use RIP.
19220   if (OpFlags == X86II::MO_GOTPCREL)
19221     return X86ISD::WrapperRIP;
19222 
19223   return X86ISD::Wrapper;
19224 }
19225 
19226 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
19227 // their target counterparts wrapped in the X86ISD::Wrapper node. Suppose N is
19228 // one of the above-mentioned nodes. It has to be wrapped because otherwise
19229 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
19230 // be used to form an addressing mode. These wrapped nodes will be selected
19231 // into MOV32ri.
19232 SDValue
19233 X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
19234   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
19235 
19236   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19237   // global base reg.
19238   unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
19239 
19240   auto PtrVT = getPointerTy(DAG.getDataLayout());
19241   SDValue Result = DAG.getTargetConstantPool(
19242       CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag);
19243   SDLoc DL(CP);
19244   Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
19245   // With PIC, the address is actually $g + Offset.
19246   if (OpFlag) {
19247     Result =
19248         DAG.getNode(ISD::ADD, DL, PtrVT,
19249                     DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
19250   }
19251 
19252   return Result;
19253 }
19254 
19255 SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
19256   JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
19257 
19258   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19259   // global base reg.
19260   unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
19261 
19262   auto PtrVT = getPointerTy(DAG.getDataLayout());
19263   SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
19264   SDLoc DL(JT);
19265   Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
19266 
19267   // With PIC, the address is actually $g + Offset.
19268   if (OpFlag)
19269     Result =
19270         DAG.getNode(ISD::ADD, DL, PtrVT,
19271                     DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
19272 
19273   return Result;
19274 }
19275 
19276 SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
19277                                                SelectionDAG &DAG) const {
19278   return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
19279 }
19280 
19281 SDValue
19282 X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
19283   // Create the TargetBlockAddressAddress node.
19284   unsigned char OpFlags =
19285     Subtarget.classifyBlockAddressReference();
19286   const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
19287   int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
19288   SDLoc dl(Op);
19289   auto PtrVT = getPointerTy(DAG.getDataLayout());
19290   SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
19291   Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result);
19292 
19293   // With PIC, the address is actually $g + Offset.
19294   if (isGlobalRelativeToPICBase(OpFlags)) {
19295     Result = DAG.getNode(ISD::ADD, dl, PtrVT,
19296                          DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
19297   }
19298 
19299   return Result;
19300 }
19301 
19302 /// Creates target global address or external symbol nodes for calls or
19303 /// other uses.
19304 SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
19305                                                  bool ForCall) const {
19306   // Unpack the global address or external symbol.
19307   const SDLoc &dl = SDLoc(Op);
19308   const GlobalValue *GV = nullptr;
19309   int64_t Offset = 0;
19310   const char *ExternalSym = nullptr;
19311   if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
19312     GV = G->getGlobal();
19313     Offset = G->getOffset();
19314   } else {
19315     const auto *ES = cast<ExternalSymbolSDNode>(Op);
19316     ExternalSym = ES->getSymbol();
19317   }
19318 
19319   // Calculate some flags for address lowering.
19320   const Module &Mod = *DAG.getMachineFunction().getFunction().getParent();
19321   unsigned char OpFlags;
19322   if (ForCall)
19323     OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
19324   else
19325     OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
19326   bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
19327   bool NeedsLoad = isGlobalStubReference(OpFlags);
19328 
19329   CodeModel::Model M = DAG.getTarget().getCodeModel();
19330   auto PtrVT = getPointerTy(DAG.getDataLayout());
19331   SDValue Result;
19332 
19333   if (GV) {
19334     // Create a target global address if this is a global. If possible, fold the
19335     // offset into the global address reference. Otherwise, ADD it on later.
19336     // Suppress the folding if Offset is negative: movl foo-1, %eax is not
19337     // allowed because if the address of foo is 0, the ELF R_X86_64_32
19338     // relocation will compute to a negative value, which is invalid.
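    // Illustrative example (not from the original comment): a positive offset
    // that fits the code model, e.g. foo+8, is folded into the target global
    // address (cf. movl foo+8, %eax), while something like foo-1 is left to
    // the explicit ISD::ADD emitted further below.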
19339     int64_t GlobalOffset = 0;
19340     if (OpFlags == X86II::MO_NO_FLAG && Offset >= 0 &&
19341         X86::isOffsetSuitableForCodeModel(Offset, M, true)) {
19342       std::swap(GlobalOffset, Offset);
19343     }
19344     Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
19345   } else {
19346     // If this is not a global address, this must be an external symbol.
19347     Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags);
19348   }
19349 
19350   // If this is a direct call, avoid the wrapper if we don't need to do any
19351   // loads or adds. This allows SDAG ISel to match direct calls.
19352   if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
19353     return Result;
19354 
19355   Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
19356 
19357   // With PIC, the address is actually $g + Offset.
19358   if (HasPICReg) {
19359     Result = DAG.getNode(ISD::ADD, dl, PtrVT,
19360                          DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
19361   }
19362 
19363   // For globals that require a load from a stub to get the address, emit the
19364   // load.
19365   if (NeedsLoad)
19366     Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
19367                          MachinePointerInfo::getGOT(DAG.getMachineFunction()));
19368 
19369   // If there was a non-zero offset that we didn't fold, create an explicit
19370   // addition for it.
19371   if (Offset != 0)
19372     Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
19373                          DAG.getConstant(Offset, dl, PtrVT));
19374 
19375   return Result;
19376 }
19377 
19378 SDValue
19379 X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
19380   return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
19381 }
19382 
19383 static SDValue
19384 GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
19385            SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
19386            unsigned char OperandFlags, bool LocalDynamic = false) {
19387   MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
19388   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
19389   SDLoc dl(GA);
19390   SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
19391                                            GA->getValueType(0),
19392                                            GA->getOffset(),
19393                                            OperandFlags);
19394 
19395   X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
19396                                            : X86ISD::TLSADDR;
19397 
19398   if (InFlag) {
19399     SDValue Ops[] = { Chain,  TGA, *InFlag };
19400     Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
19401   } else {
19402     SDValue Ops[]  = { Chain, TGA };
19403     Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
19404   }
19405 
19406   // TLSADDR will be codegen'ed as a call. Inform MFI that the function has calls.
19407   MFI.setAdjustsStack(true);
19408   MFI.setHasCalls(true);
19409 
19410   SDValue Flag = Chain.getValue(1);
19411   return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
19412 }
19413 
19414 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
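// Illustrative sequence (an assumption; the exact selection may vary): this
// typically becomes "leal x@TLSGD(,%ebx,1), %eax; call ___tls_get_addr@PLT",
// with the resulting address returned in EAX.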
19415 static SDValue
19416 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19417                                 const EVT PtrVT) {
19418   SDValue InFlag;
19419   SDLoc dl(GA);  // ? function entry point might be better
19420   SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
19421                                    DAG.getNode(X86ISD::GlobalBaseReg,
19422                                                SDLoc(), PtrVT), InFlag);
19423   InFlag = Chain.getValue(1);
19424 
19425   return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
19426 }
19427 
19428 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit LP64
19429 static SDValue
19430 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19431                                 const EVT PtrVT) {
19432   return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
19433                     X86::RAX, X86II::MO_TLSGD);
19434 }
19435 
19436 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit ILP32
19437 static SDValue
19438 LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19439                                  const EVT PtrVT) {
19440   return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
19441                     X86::EAX, X86II::MO_TLSGD);
19442 }
19443 
19444 static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
19445                                            SelectionDAG &DAG, const EVT PtrVT,
19446                                            bool Is64Bit, bool Is64BitLP64) {
19447   SDLoc dl(GA);
19448 
19449   // Get the start address of the TLS block for this module.
19450   X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
19451       .getInfo<X86MachineFunctionInfo>();
19452   MFI->incNumLocalDynamicTLSAccesses();
19453 
19454   SDValue Base;
19455   if (Is64Bit) {
19456     unsigned ReturnReg = Is64BitLP64 ? X86::RAX : X86::EAX;
19457     Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, ReturnReg,
19458                       X86II::MO_TLSLD, /*LocalDynamic=*/true);
19459   } else {
19460     SDValue InFlag;
19461     SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
19462         DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
19463     InFlag = Chain.getValue(1);
19464     Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
19465                       X86II::MO_TLSLDM, /*LocalDynamic=*/true);
19466   }
19467 
19468   // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
19469   // of Base.
19470 
19471   // Build x@dtpoff.
19472   unsigned char OperandFlags = X86II::MO_DTPOFF;
19473   unsigned WrapperKind = X86ISD::Wrapper;
19474   SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
19475                                            GA->getValueType(0),
19476                                            GA->getOffset(), OperandFlags);
19477   SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
19478 
19479   // Add x@dtpoff with the base.
19480   return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
19481 }
19482 
19483 // Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
19484 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19485                                    const EVT PtrVT, TLSModel::Model model,
19486                                    bool is64Bit, bool isPIC) {
19487   SDLoc dl(GA);
19488 
19489   // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
19490   Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
19491                                                          is64Bit ? 257 : 256));
19492 
19493   SDValue ThreadPointer =
19494       DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
19495                   MachinePointerInfo(Ptr));
19496 
19497   unsigned char OperandFlags = 0;
19498   // Most TLS accesses are not RIP-relative, even on x86-64. One exception is
19499   // initial exec.
19500   unsigned WrapperKind = X86ISD::Wrapper;
19501   if (model == TLSModel::LocalExec) {
19502     OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
19503   } else if (model == TLSModel::InitialExec) {
19504     if (is64Bit) {
19505       OperandFlags = X86II::MO_GOTTPOFF;
19506       WrapperKind = X86ISD::WrapperRIP;
19507     } else {
19508       OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
19509     }
19510   } else {
19511     llvm_unreachable("Unexpected model");
19512   }
19513 
19514   // emit "addl x@ntpoff,%eax" (local exec)
19515   // or "addl x@indntpoff,%eax" (initial exec)
19516   // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
19517   SDValue TGA =
19518       DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
19519                                  GA->getOffset(), OperandFlags);
19520   SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
19521 
19522   if (model == TLSModel::InitialExec) {
19523     if (isPIC && !is64Bit) {
19524       Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
19525                            DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
19526                            Offset);
19527     }
19528 
19529     Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
19530                          MachinePointerInfo::getGOT(DAG.getMachineFunction()));
19531   }
19532 
19533   // The address of the thread local variable is the add of the thread
19534   // pointer with the offset of the variable.
19535   return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
19536 }
19537 
19538 SDValue
19539 X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
19540 
19541   GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
19542 
19543   if (DAG.getTarget().useEmulatedTLS())
19544     return LowerToTLSEmulatedModel(GA, DAG);
19545 
19546   const GlobalValue *GV = GA->getGlobal();
19547   auto PtrVT = getPointerTy(DAG.getDataLayout());
19548   bool PositionIndependent = isPositionIndependent();
19549 
19550   if (Subtarget.isTargetELF()) {
19551     TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
19552     switch (model) {
19553       case TLSModel::GeneralDynamic:
19554         if (Subtarget.is64Bit()) {
19555           if (Subtarget.isTarget64BitLP64())
19556             return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
19557           return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT);
19558         }
19559         return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
19560       case TLSModel::LocalDynamic:
19561         return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(),
19562                                            Subtarget.isTarget64BitLP64());
19563       case TLSModel::InitialExec:
19564       case TLSModel::LocalExec:
19565         return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
19566                                    PositionIndependent);
19567     }
19568     llvm_unreachable("Unknown TLS model.");
19569   }
19570 
19571   if (Subtarget.isTargetDarwin()) {
19572     // Darwin only has one model of TLS.  Lower to that.
19573     unsigned char OpFlag = 0;
19574     unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
19575                            X86ISD::WrapperRIP : X86ISD::Wrapper;
19576 
19577     // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19578     // global base reg.
19579     bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
19580     if (PIC32)
19581       OpFlag = X86II::MO_TLVP_PIC_BASE;
19582     else
19583       OpFlag = X86II::MO_TLVP;
19584     SDLoc DL(Op);
19585     SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
19586                                                 GA->getValueType(0),
19587                                                 GA->getOffset(), OpFlag);
19588     SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
19589 
19590     // With PIC32, the address is actually $g + Offset.
19591     if (PIC32)
19592       Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
19593                            DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
19594                            Offset);
19595 
19596     // Lowering the machine ISD node will make sure everything is in the right
19597     // location.
19598     SDValue Chain = DAG.getEntryNode();
19599     SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
19600     Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
19601     SDValue Args[] = { Chain, Offset };
19602     Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
19603     Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
19604                                DAG.getIntPtrConstant(0, DL, true),
19605                                Chain.getValue(1), DL);
19606 
19607     // TLSCALL will be codegen'ed as a call. Inform MFI that the function has calls.
19608     MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
19609     MFI.setAdjustsStack(true);
19610 
19611     // And our return value (tls address) is in the standard call return value
19612     // location.
19613     unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
19614     return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
19615   }
19616 
19617   if (Subtarget.isOSWindows()) {
19618     // Just use the implicit TLS architecture.
19619     // We need to generate something similar to:
19620     //   mov     rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
19621     //                                  ; from TEB
19622     //   mov     ecx, dword [rel _tls_index]; Load index (from C runtime)
19623     //   mov     rcx, qword [rdx+rcx*8]
19624     //   mov     eax, .tls$:tlsvar
19625     //   [rax+rcx] contains the address
19626     // Windows 64bit: gs:0x58
19627     // Windows 32bit: fs:__tls_array
19628 
19629     SDLoc dl(GA);
19630     SDValue Chain = DAG.getEntryNode();
19631 
19632     // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
19633     // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
19634     // use its literal value of 0x2C.
19635     Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
19636                                         ? Type::getInt8PtrTy(*DAG.getContext(),
19637                                                              256)
19638                                         : Type::getInt32PtrTy(*DAG.getContext(),
19639                                                               257));
19640 
19641     SDValue TlsArray = Subtarget.is64Bit()
19642                            ? DAG.getIntPtrConstant(0x58, dl)
19643                            : (Subtarget.isTargetWindowsGNU()
19644                                   ? DAG.getIntPtrConstant(0x2C, dl)
19645                                   : DAG.getExternalSymbol("_tls_array", PtrVT));
19646 
19647     SDValue ThreadPointer =
19648         DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
19649 
19650     SDValue res;
19651     if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
19652       res = ThreadPointer;
19653     } else {
19654       // Load the _tls_index variable
19655       SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
19656       if (Subtarget.is64Bit())
19657         IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
19658                              MachinePointerInfo(), MVT::i32);
19659       else
19660         IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
19661 
19662       const DataLayout &DL = DAG.getDataLayout();
19663       SDValue Scale =
19664           DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
19665       IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
19666 
19667       res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
19668     }
19669 
19670     res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
19671 
19672     // Get the offset of start of .tls section
19673     SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
19674                                              GA->getValueType(0),
19675                                              GA->getOffset(), X86II::MO_SECREL);
19676     SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
19677 
19678     // The address of the thread local variable is the add of the thread
19679     // pointer with the offset of the variable.
19680     return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
19681   }
19682 
19683   llvm_unreachable("TLS not implemented for this target.");
19684 }
19685 
19686 /// Lower SRA_PARTS and friends, which return two i32 values
19687 /// and take a 2 x i32 value to shift plus a shift amount.
19688 /// TODO: Can this be moved to general expansion code?
19689 static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
19690   SDValue Lo, Hi;
19691   DAG.getTargetLoweringInfo().expandShiftParts(Op.getNode(), Lo, Hi, DAG);
19692   return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
19693 }
19694 
19695 static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
19696                                 SelectionDAG &DAG) {
19697   MVT VT = Op.getSimpleValueType();
19698   assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) &&
19699          "Unexpected funnel shift opcode!");
19700 
19701   SDLoc DL(Op);
19702   SDValue Op0 = Op.getOperand(0);
19703   SDValue Op1 = Op.getOperand(1);
19704   SDValue Amt = Op.getOperand(2);
19705 
19706   bool IsFSHR = Op.getOpcode() == ISD::FSHR;
19707 
19708   if (VT.isVector()) {
19709     assert(Subtarget.hasVBMI2() && "Expected VBMI2");
19710 
19711     if (IsFSHR)
19712       std::swap(Op0, Op1);
19713 
19714     // With AVX512 but not VLX, we need to widen to get a 512-bit result type.
19715     if (!Subtarget.hasVLX() && !VT.is512BitVector()) {
19716       Op0 = widenSubVector(Op0, false, Subtarget, DAG, DL, 512);
19717       Op1 = widenSubVector(Op1, false, Subtarget, DAG, DL, 512);
19718     }
19719 
19720     SDValue Funnel;
19721     APInt APIntShiftAmt;
19722     MVT ResultVT = Op0.getSimpleValueType();
19723     if (X86::isConstantSplat(Amt, APIntShiftAmt)) {
19724       uint64_t ShiftAmt = APIntShiftAmt.urem(VT.getScalarSizeInBits());
19725       Funnel =
19726           DAG.getNode(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, ResultVT, Op0,
19727                       Op1, DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
19728     } else {
19729       if (!Subtarget.hasVLX() && !VT.is512BitVector())
19730         Amt = widenSubVector(Amt, false, Subtarget, DAG, DL, 512);
19731       Funnel = DAG.getNode(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL,
19732                            ResultVT, Op0, Op1, Amt);
19733     }
19734     if (!Subtarget.hasVLX() && !VT.is512BitVector())
19735       Funnel = extractSubVector(Funnel, 0, DAG, DL, VT.getSizeInBits());
19736     return Funnel;
19737   }
19738   assert(
19739       (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
19740       "Unexpected funnel shift type!");
19741 
19742   // Expand slow SHLD/SHRD cases if we are not optimizing for size.
19743   bool OptForSize = DAG.shouldOptForSize();
19744   bool ExpandFunnel = !OptForSize && Subtarget.isSHLDSlow();
19745 
19746   // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
19747   // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
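  // Worked example (illustrative, i8): fshl(0x12, 0x34, 3) computes
  // ((0x1234 << 3) >> 8) & 0xFF == 0x91, which equals (0x12 << 3) | (0x34 >> 5).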
19748   if ((VT == MVT::i8 || (ExpandFunnel && VT == MVT::i16)) &&
19749       !isa<ConstantSDNode>(Amt)) {
19750     unsigned EltSizeInBits = VT.getScalarSizeInBits();
19751     SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, Amt.getValueType());
19752     SDValue HiShift = DAG.getConstant(EltSizeInBits, DL, Amt.getValueType());
19753     Op0 = DAG.getAnyExtOrTrunc(Op0, DL, MVT::i32);
19754     Op1 = DAG.getZExtOrTrunc(Op1, DL, MVT::i32);
19755     Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, Mask);
19756     SDValue Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Op0, HiShift);
19757     Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, Op1);
19758     if (IsFSHR) {
19759       Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, Amt);
19760     } else {
19761       Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Res, Amt);
19762       Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, HiShift);
19763     }
19764     return DAG.getZExtOrTrunc(Res, DL, VT);
19765   }
19766 
19767   if (VT == MVT::i8 || ExpandFunnel)
19768     return SDValue();
19769 
19770   // i16 needs to modulo the shift amount, but i32/i64 have implicit modulo.
19771   if (VT == MVT::i16) {
19772     Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
19773                       DAG.getConstant(15, DL, Amt.getValueType()));
19774     unsigned FSHOp = (IsFSHR ? X86ISD::FSHR : X86ISD::FSHL);
19775     return DAG.getNode(FSHOp, DL, VT, Op0, Op1, Amt);
19776   }
19777 
19778   return Op;
19779 }
19780 
19781 // Try to use a packed vector operation to handle i64 on 32-bit targets when
19782 // AVX512DQ is enabled.
19783 static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG,
19784                                         const X86Subtarget &Subtarget) {
19785   assert((Op.getOpcode() == ISD::SINT_TO_FP ||
19786           Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
19787           Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
19788           Op.getOpcode() == ISD::UINT_TO_FP) &&
19789          "Unexpected opcode!");
19790   bool IsStrict = Op->isStrictFPOpcode();
19791   unsigned OpNo = IsStrict ? 1 : 0;
19792   SDValue Src = Op.getOperand(OpNo);
19793   MVT SrcVT = Src.getSimpleValueType();
19794   MVT VT = Op.getSimpleValueType();
19795 
19796   if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
19797       (VT != MVT::f32 && VT != MVT::f64))
19798     return SDValue();
19799 
19800   // Pack the i64 into a vector, do the operation and extract.
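  // Illustrative (assumption): on a 32-bit target with AVX512DQ, this lets an
  // i64 -> f64 conversion use a packed VCVTQQ2PD on a single lane instead of a
  // more expensive scalar sequence.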
19801 
19802   // Use a 256-bit source so the result is 128 bits for the f32 case.
19803   unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
19804   MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
19805   MVT VecVT = MVT::getVectorVT(VT, NumElts);
19806 
19807   SDLoc dl(Op);
19808   SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
19809   if (IsStrict) {
19810     SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other},
19811                                  {Op.getOperand(0), InVec});
19812     SDValue Chain = CvtVec.getValue(1);
19813     SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19814                                 DAG.getIntPtrConstant(0, dl));
19815     return DAG.getMergeValues({Value, Chain}, dl);
19816   }
19817 
19818   SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
19819 
19820   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19821                      DAG.getIntPtrConstant(0, dl));
19822 }
19823 
19824 static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
19825                           const X86Subtarget &Subtarget) {
19826   switch (Opcode) {
19827     case ISD::SINT_TO_FP:
19828       // TODO: Handle wider types with AVX/AVX512.
19829       if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
19830         return false;
19831       // CVTDQ2PS or (V)CVTDQ2PD
19832       return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
19833 
19834     case ISD::UINT_TO_FP:
19835       // TODO: Handle wider types and i64 elements.
19836       if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
19837         return false;
19838       // VCVTUDQ2PS or VCVTUDQ2PD
19839       return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
19840 
19841     default:
19842       return false;
19843   }
19844 }
19845 
19846 /// Given a scalar cast operation that is extracted from a vector, try to
19847 /// vectorize the cast op followed by extraction. This will avoid an expensive
19848 /// round-trip between XMM and GPR.
19849 static SDValue vectorizeExtractedCast(SDValue Cast, SelectionDAG &DAG,
19850                                       const X86Subtarget &Subtarget) {
19851   // TODO: This could be enhanced to handle smaller integer types by peeking
19852   // through an extend.
19853   SDValue Extract = Cast.getOperand(0);
19854   MVT DestVT = Cast.getSimpleValueType();
19855   if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
19856       !isa<ConstantSDNode>(Extract.getOperand(1)))
19857     return SDValue();
19858 
19859   // See if we have a 128-bit vector cast op for this type of cast.
19860   SDValue VecOp = Extract.getOperand(0);
19861   MVT FromVT = VecOp.getSimpleValueType();
19862   unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
19863   MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM);
19864   MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
19865   if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
19866     return SDValue();
19867 
19868   // If we are extracting from a non-zero element, first shuffle the source
19869   // vector to allow extracting from element zero.
19870   SDLoc DL(Cast);
19871   if (!isNullConstant(Extract.getOperand(1))) {
19872     SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
19873     Mask[0] = Extract.getConstantOperandVal(1);
19874     VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
19875   }
19876   // If the source vector is wider than 128 bits, extract the low part. Do not
19877   // create an unnecessarily wide vector cast op.
19878   if (FromVT != Vec128VT)
19879     VecOp = extract128BitVector(VecOp, 0, DAG, DL);
19880 
19881   // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
19882   // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
19883   SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
19884   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
19885                      DAG.getIntPtrConstant(0, DL));
19886 }
19887 
19888 /// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc),
19889 /// try to vectorize the cast ops. This will avoid an expensive round-trip
19890 /// between XMM and GPR.
19891 static SDValue lowerFPToIntToFP(SDValue CastToFP, SelectionDAG &DAG,
19892                                 const X86Subtarget &Subtarget) {
19893   // TODO: Allow FP_TO_UINT.
19894   SDValue CastToInt = CastToFP.getOperand(0);
19895   MVT VT = CastToFP.getSimpleValueType();
19896   if (CastToInt.getOpcode() != ISD::FP_TO_SINT || VT.isVector())
19897     return SDValue();
19898 
19899   MVT IntVT = CastToInt.getSimpleValueType();
19900   SDValue X = CastToInt.getOperand(0);
19901   MVT SrcVT = X.getSimpleValueType();
19902   if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
19903     return SDValue();
19904 
19905   // See if we have 128-bit vector cast instructions for this type of cast.
19906   // We need cvttps2dq/cvttpd2dq and cvtdq2ps/cvtdq2pd.
19907   if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
19908       IntVT != MVT::i32)
19909     return SDValue();
19910 
19911   unsigned SrcSize = SrcVT.getSizeInBits();
19912   unsigned IntSize = IntVT.getSizeInBits();
19913   unsigned VTSize = VT.getSizeInBits();
19914   MVT VecSrcVT = MVT::getVectorVT(SrcVT, 128 / SrcSize);
19915   MVT VecIntVT = MVT::getVectorVT(IntVT, 128 / IntSize);
19916   MVT VecVT = MVT::getVectorVT(VT, 128 / VTSize);
19917 
19918   // We need target-specific opcodes if this is v2f64 -> v4i32 -> v2f64.
19919   unsigned ToIntOpcode =
19920       SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;
19921   unsigned ToFPOpcode =
19922       IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;
19923 
19924   // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
19925   //
19926   // We are not defining the high elements (for example, zero them) because
19927   // that could nullify any performance advantage that we hoped to gain from
19928   // this vector op hack. We do not expect any adverse effects (like denorm
19929   // penalties) with cast ops.
19930   SDLoc DL(CastToFP);
19931   SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
19932   SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X);
19933   SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX);
19934   SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt);
19935   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);
19936 }
19937 
19938 static SDValue lowerINT_TO_FP_vXi64(SDValue Op, SelectionDAG &DAG,
19939                                     const X86Subtarget &Subtarget) {
19940   SDLoc DL(Op);
19941   bool IsStrict = Op->isStrictFPOpcode();
19942   MVT VT = Op->getSimpleValueType(0);
19943   SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
19944 
19945   if (Subtarget.hasDQI()) {
19946     assert(!Subtarget.hasVLX() && "Unexpected features");
19947 
19948     assert((Src.getSimpleValueType() == MVT::v2i64 ||
19949             Src.getSimpleValueType() == MVT::v4i64) &&
19950            "Unsupported custom type");
19951 
19952     // With AVX512DQ, but not VLX we need to widen to get a 512-bit result type.
19953     assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
19954            "Unexpected VT!");
19955     MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
19956 
19957     // Need to concat with zero vector for strict fp to avoid spurious
19958     // exceptions.
19959     SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64)
19960                            : DAG.getUNDEF(MVT::v8i64);
19961     Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src,
19962                       DAG.getIntPtrConstant(0, DL));
19963     SDValue Res, Chain;
19964     if (IsStrict) {
19965       Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
19966                         {Op->getOperand(0), Src});
19967       Chain = Res.getValue(1);
19968     } else {
19969       Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
19970     }
19971 
19972     Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
19973                       DAG.getIntPtrConstant(0, DL));
19974 
19975     if (IsStrict)
19976       return DAG.getMergeValues({Res, Chain}, DL);
19977     return Res;
19978   }
19979 
19980   bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
19981                   Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
19982   if (VT != MVT::v4f32 || IsSigned)
19983     return SDValue();
19984 
19985   SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
19986   SDValue One  = DAG.getConstant(1, DL, MVT::v4i64);
19987   SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
19988                              DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
19989                              DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
19990   SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT);
19991   SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src);
19992   SmallVector<SDValue, 4> SignCvts(4);
19993   SmallVector<SDValue, 4> Chains(4);
19994   for (int i = 0; i != 4; ++i) {
19995     SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
19996                               DAG.getIntPtrConstant(i, DL));
19997     if (IsStrict) {
19998       SignCvts[i] =
19999           DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
20000                       {Op.getOperand(0), Elt});
20001       Chains[i] = SignCvts[i].getValue(1);
20002     } else {
20003       SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Elt);
20004     }
20005   }
20006   SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);
20007 
20008   SDValue Slow, Chain;
20009   if (IsStrict) {
20010     Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
20011     Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
20012                        {Chain, SignCvt, SignCvt});
20013     Chain = Slow.getValue(1);
20014   } else {
20015     Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
20016   }
20017 
20018   IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
20019   SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt);
20020 
20021   if (IsStrict)
20022     return DAG.getMergeValues({Cvt, Chain}, DL);
20023 
20024   return Cvt;
20025 }
20026 
20027 SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
20028                                            SelectionDAG &DAG) const {
20029   bool IsStrict = Op->isStrictFPOpcode();
20030   unsigned OpNo = IsStrict ? 1 : 0;
20031   SDValue Src = Op.getOperand(OpNo);
20032   SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
20033   MVT SrcVT = Src.getSimpleValueType();
20034   MVT VT = Op.getSimpleValueType();
20035   SDLoc dl(Op);
20036 
20037   if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
20038     return Extract;
20039 
20040   if (SDValue R = lowerFPToIntToFP(Op, DAG, Subtarget))
20041     return R;
20042 
20043   if (SrcVT.isVector()) {
20044     if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
20045       // Note: Since v2f64 is a legal type, we don't need to zero extend the
20046       // source for strict FP.
20047       if (IsStrict)
20048         return DAG.getNode(
20049             X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
20050             {Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
20051                                 DAG.getUNDEF(SrcVT))});
20052       return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
20053                          DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
20054                                      DAG.getUNDEF(SrcVT)));
20055     }
20056     if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
20057       return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
20058 
20059     return SDValue();
20060   }
20061 
20062   assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
20063          "Unknown SINT_TO_FP to lower!");
20064 
20065   bool UseSSEReg = isScalarFPTypeInSSEReg(VT);
20066 
20067   // These are really Legal; return the operand so the caller accepts it as
20068   // Legal.
20069   if (SrcVT == MVT::i32 && UseSSEReg)
20070     return Op;
20071   if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
20072     return Op;
20073 
20074   if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
20075     return V;
20076 
20077   // SSE doesn't have an i16 conversion so we need to promote.
20078   if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
20079     SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src);
20080     if (IsStrict)
20081       return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
20082                          {Chain, Ext});
20083 
20084     return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext);
20085   }
20086 
20087   if (VT == MVT::f128)
20088     return SDValue();
20089 
20090   SDValue ValueToStore = Src;
20091   if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
20092     // Bitcasting to f64 here allows us to do a single 64-bit store from
20093     // an SSE register, avoiding the store forwarding penalty that would come
20094     // with two 32-bit stores.
20095     ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
20096 
20097   unsigned Size = SrcVT.getStoreSize();
20098   Align Alignment(Size);
20099   MachineFunction &MF = DAG.getMachineFunction();
20100   auto PtrVT = getPointerTy(MF.getDataLayout());
20101   int SSFI = MF.getFrameInfo().CreateStackObject(Size, Alignment, false);
20102   MachinePointerInfo MPI =
20103       MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
20104   SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20105   Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment);
20106   std::pair<SDValue, SDValue> Tmp =
20107       BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG);
20108 
20109   if (IsStrict)
20110     return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
20111 
20112   return Tmp.first;
20113 }
20114 
20115 std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(
20116     EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer,
20117     MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const {
20118   // Build the FILD
20119   SDVTList Tys;
20120   bool useSSE = isScalarFPTypeInSSEReg(DstVT);
20121   if (useSSE)
20122     Tys = DAG.getVTList(MVT::f80, MVT::Other);
20123   else
20124     Tys = DAG.getVTList(DstVT, MVT::Other);
20125 
20126   SDValue FILDOps[] = {Chain, Pointer};
20127   SDValue Result =
20128       DAG.getMemIntrinsicNode(X86ISD::FILD, DL, Tys, FILDOps, SrcVT, PtrInfo,
20129                               Alignment, MachineMemOperand::MOLoad);
20130   Chain = Result.getValue(1);
20131 
20132   if (useSSE) {
20133     MachineFunction &MF = DAG.getMachineFunction();
20134     unsigned SSFISize = DstVT.getStoreSize();
20135     int SSFI =
20136         MF.getFrameInfo().CreateStackObject(SSFISize, Align(SSFISize), false);
20137     auto PtrVT = getPointerTy(MF.getDataLayout());
20138     SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20139     Tys = DAG.getVTList(MVT::Other);
20140     SDValue FSTOps[] = {Chain, Result, StackSlot};
20141     MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand(
20142         MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
20143         MachineMemOperand::MOStore, SSFISize, Align(SSFISize));
20144 
20145     Chain =
20146         DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, DstVT, StoreMMO);
20147     Result = DAG.getLoad(
20148         DstVT, DL, Chain, StackSlot,
20149         MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
20150     Chain = Result.getValue(1);
20151   }
20152 
20153   return { Result, Chain };
20154 }
20155 
20156 /// Horizontal vector math instructions may be slower than normal math with
20157 /// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
20158 /// implementation, and likely shuffle complexity of the alternate sequence.
20159 static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
20160                                   const X86Subtarget &Subtarget) {
20161   bool IsOptimizingSize = DAG.shouldOptForSize();
20162   bool HasFastHOps = Subtarget.hasFastHorizontalOps();
20163   return !IsSingleSource || IsOptimizingSize || HasFastHOps;
20164 }
20165 
20166 /// 64-bit unsigned integer to double expansion.
20167 static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
20168                                    const X86Subtarget &Subtarget) {
20169   // We can't use this algorithm for strict fp: it produces -0.0 instead of +0.0
20170   // when converting 0 while rounding toward negative infinity. The caller will
20171   // fall back to Expand when i64 is legal, or use FILD in 32-bit mode.
20172   assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!");
20173   // This algorithm is not obvious. Here is what we're trying to output:
20174   /*
20175      movq       %rax,  %xmm0
20176      punpckldq  (c0),  %xmm0  // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
20177      subpd      (c1),  %xmm0  // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
20178      #ifdef __SSE3__
20179        haddpd   %xmm0, %xmm0
20180      #else
20181        pshufd   $0x4e, %xmm0, %xmm1
20182        addpd    %xmm1, %xmm0
20183      #endif
20184   */
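  /*
     Why this works (sketch): after the punpckldq, %xmm0 holds two doubles
       d0 = 2^52 + lo32(x)          // bit pattern 0x43300000:lo32
       d1 = 2^84 + hi32(x) * 2^32   // bit pattern 0x45300000:hi32
     since a 32-bit value dropped into the low mantissa bits of those exponents
     is represented exactly. Subtracting c1 = { 2^52, 2^84 } leaves
     { lo32(x), hi32(x) * 2^32 }, and the horizontal add yields
     lo32(x) + hi32(x) * 2^32, i.e. x as a double (up to the final rounding of
     the add).
  */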
20185 
20186   SDLoc dl(Op);
20187   LLVMContext *Context = DAG.getContext();
20188 
20189   // Build some magic constants.
20190   static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
20191   Constant *C0 = ConstantDataVector::get(*Context, CV0);
20192   auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
20193   SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16));
20194 
20195   SmallVector<Constant*,2> CV1;
20196   CV1.push_back(
20197     ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
20198                                       APInt(64, 0x4330000000000000ULL))));
20199   CV1.push_back(
20200     ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
20201                                       APInt(64, 0x4530000000000000ULL))));
20202   Constant *C1 = ConstantVector::get(CV1);
20203   SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16));
20204 
20205   // Load the 64-bit value into an XMM register.
20206   SDValue XR1 =
20207       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0));
20208   SDValue CLod0 = DAG.getLoad(
20209       MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
20210       MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
20211   SDValue Unpck1 =
20212       getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
20213 
20214   SDValue CLod1 = DAG.getLoad(
20215       MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
20216       MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
20217   SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
20218   // TODO: Are there any fast-math-flags to propagate here?
20219   SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
20220   SDValue Result;
20221 
20222   if (Subtarget.hasSSE3() &&
20223       shouldUseHorizontalOp(true, DAG, Subtarget)) {
20224     Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
20225   } else {
20226     SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
20227     Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
20228   }
20229   Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
20230                        DAG.getIntPtrConstant(0, dl));
20231   return Result;
20232 }
20233 
20234 /// 32-bit unsigned integer to float expansion.
20235 static SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG,
20236                                    const X86Subtarget &Subtarget) {
20237   unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
20238   SDLoc dl(Op);
20239   // FP constant to bias correct the final result.
20240   SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
20241                                    MVT::f64);
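  // 0x4330000000000000 is the bit pattern of 2^52. OR'ing a 32-bit value into
  // the low mantissa bits of that pattern yields the double 2^52 + x exactly
  // (any x < 2^32 fits in the 52-bit mantissa), so subtracting the bias below
  // recovers x as a double with no rounding.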
20242 
20243   // Load the 32-bit value into an XMM register.
20244   SDValue Load =
20245       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo));
20246 
20247   // Zero out the upper parts of the register.
20248   Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
20249 
20250   // Or the load with the bias.
20251   SDValue Or = DAG.getNode(
20252       ISD::OR, dl, MVT::v2i64,
20253       DAG.getBitcast(MVT::v2i64, Load),
20254       DAG.getBitcast(MVT::v2i64,
20255                      DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
20256   Or =
20257       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
20258                   DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
20259 
20260   if (Op.getNode()->isStrictFPOpcode()) {
20261     // Subtract the bias.
20262     // TODO: Are there any fast-math-flags to propagate here?
20263     SDValue Chain = Op.getOperand(0);
20264     SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other},
20265                               {Chain, Or, Bias});
20266 
20267     if (Op.getValueType() == Sub.getValueType())
20268       return Sub;
20269 
20270     // Handle final rounding.
20271     std::pair<SDValue, SDValue> ResultPair = DAG.getStrictFPExtendOrRound(
20272         Sub, Sub.getValue(1), dl, Op.getSimpleValueType());
20273 
20274     return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);
20275   }
20276 
20277   // Subtract the bias.
20278   // TODO: Are there any fast-math-flags to propagate here?
20279   SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
20280 
20281   // Handle final rounding.
20282   return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
20283 }
20284 
20285 static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
20286                                      const X86Subtarget &Subtarget,
20287                                      const SDLoc &DL) {
20288   if (Op.getSimpleValueType() != MVT::v2f64)
20289     return SDValue();
20290 
20291   bool IsStrict = Op->isStrictFPOpcode();
20292 
20293   SDValue N0 = Op.getOperand(IsStrict ? 1 : 0);
20294   assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
20295 
20296   if (Subtarget.hasAVX512()) {
20297     if (!Subtarget.hasVLX()) {
20298       // Let generic type legalization widen this.
20299       if (!IsStrict)
20300         return SDValue();
20301       // Otherwise pad the integer input with 0s and widen the operation.
20302       N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
20303                        DAG.getConstant(0, DL, MVT::v2i32));
20304       SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
20305                                 {Op.getOperand(0), N0});
20306       SDValue Chain = Res.getValue(1);
20307       Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res,
20308                         DAG.getIntPtrConstant(0, DL));
20309       return DAG.getMergeValues({Res, Chain}, DL);
20310     }
20311 
20312     // Legalize to v4i32 type.
20313     N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
20314                      DAG.getUNDEF(MVT::v2i32));
20315     if (IsStrict)
20316       return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other},
20317                          {Op.getOperand(0), N0});
20318     return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
20319   }
20320 
20321   // Zero extend to 2i64, OR with the floating point representation of 2^52.
20322   // This gives us the floating point equivalent of 2^52 + the i32 integer
20323   // since double has 52-bits of mantissa. Then subtract 2^52 in floating
20324   // point leaving just our i32 integers in double format.
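  // For example (illustrative): an input lane of 7 becomes the 64-bit pattern
  // 0x4330000000000007, which as a double is 2^52 + 7; subtracting 2^52 leaves
  // exactly 7.0.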
20325   SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0);
20326   SDValue VBias =
20327       DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), DL, MVT::v2f64);
20328   SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn,
20329                            DAG.getBitcast(MVT::v2i64, VBias));
20330   Or = DAG.getBitcast(MVT::v2f64, Or);
20331 
20332   if (IsStrict)
20333     return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other},
20334                        {Op.getOperand(0), Or, VBias});
20335   return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias);
20336 }
20337 
20338 static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
20339                                      const X86Subtarget &Subtarget) {
20340   SDLoc DL(Op);
20341   bool IsStrict = Op->isStrictFPOpcode();
20342   SDValue V = Op->getOperand(IsStrict ? 1 : 0);
20343   MVT VecIntVT = V.getSimpleValueType();
20344   assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
20345          "Unsupported custom type");
20346 
20347   if (Subtarget.hasAVX512()) {
20348     // With AVX512, but not VLX we need to widen to get a 512-bit result type.
20349     assert(!Subtarget.hasVLX() && "Unexpected features");
20350     MVT VT = Op->getSimpleValueType(0);
20351 
20352     // v8i32->v8f64 is legal with AVX512 so just return it.
20353     if (VT == MVT::v8f64)
20354       return Op;
20355 
20356     assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64) &&
20357            "Unexpected VT!");
20358     MVT WideVT = VT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
20359     MVT WideIntVT = VT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
20360     // Need to concat with zero vector for strict fp to avoid spurious
20361     // exceptions.
20362     SDValue Tmp =
20363         IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT);
20364     V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V,
20365                     DAG.getIntPtrConstant(0, DL));
20366     SDValue Res, Chain;
20367     if (IsStrict) {
20368       Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other},
20369                         {Op->getOperand(0), V});
20370       Chain = Res.getValue(1);
20371     } else {
20372       Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V);
20373     }
20374 
20375     Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
20376                       DAG.getIntPtrConstant(0, DL));
20377 
20378     if (IsStrict)
20379       return DAG.getMergeValues({Res, Chain}, DL);
20380     return Res;
20381   }
20382 
20383   if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
20384       Op->getSimpleValueType(0) == MVT::v4f64) {
20385     SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V);
20386     Constant *Bias = ConstantFP::get(
20387         *DAG.getContext(),
20388         APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
20389     auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
20390     SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, Align(8));
20391     SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other);
20392     SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
20393     SDValue VBias = DAG.getMemIntrinsicNode(
20394         X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
20395         MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(8),
20396         MachineMemOperand::MOLoad);
20397 
20398     SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn,
20399                              DAG.getBitcast(MVT::v4i64, VBias));
20400     Or = DAG.getBitcast(MVT::v4f64, Or);
20401 
20402     if (IsStrict)
20403       return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other},
20404                          {Op.getOperand(0), Or, VBias});
20405     return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias);
20406   }
20407 
20408   // The algorithm is the following:
20409   // #ifdef __SSE4_1__
20410   //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
20411   //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
20412   //                                 (uint4) 0x53000000, 0xaa);
20413   // #else
20414   //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
20415   //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
20416   // #endif
20417   //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
20418   //     return (float4) lo + fhi;
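  // Why a single fsub constant suffices (sketch): bitcast as floats,
  //   lo = 2^23 + (v & 0xffff)        // 0x4b000000 carries the 2^23 exponent
  //   hi = 2^39 + (v >> 16) * 2^16    // 0x53000000 carries the 2^39 exponent
  // so fhi = hi - (2^39 + 2^23) = (v >> 16) * 2^16 - 2^23, and
  //   lo + fhi = (v & 0xffff) + (v >> 16) * 2^16 = v,
  // with both bias terms cancelling in one subtract.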
20419 
20420   bool Is128 = VecIntVT == MVT::v4i32;
20421   MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
20422   // If we convert to something other than the supported type, e.g., to v4f64,
20423   // abort early.
20424   if (VecFloatVT != Op->getSimpleValueType(0))
20425     return SDValue();
20426 
20427   // In the #ifdef/#else code, we have in common:
20428   // - The vector of constants:
20429   // -- 0x4b000000
20430   // -- 0x53000000
20431   // - A shift:
20432   // -- v >> 16
20433 
20434   // Create the splat vector for 0x4b000000.
20435   SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
20436   // Create the splat vector for 0x53000000.
20437   SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
20438 
20439   // Create the right shift.
20440   SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
20441   SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
20442 
20443   SDValue Low, High;
20444   if (Subtarget.hasSSE41()) {
20445     MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
20446     //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
20447     SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
20448     SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
20449     // Low will be bitcasted right away, so do not bother bitcasting back to its
20450     // original type.
20451     Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
20452                       VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
20453     //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
20454     //                                 (uint4) 0x53000000, 0xaa);
20455     SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
20456     SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
20457     // High will be bitcasted right away, so do not bother bitcasting back to
20458     // its original type.
20459     High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
20460                        VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
20461   } else {
20462     SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
20463     //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
20464     SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
20465     Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
20466 
20467     //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
20468     High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
20469   }
20470 
20471   // Create the vector constant for (0x1.0p39f + 0x1.0p23f).
20472   SDValue VecCstFSub = DAG.getConstantFP(
20473       APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT);
20474 
20475   //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
20476   // NOTE: By using fsub of a positive constant instead of fadd of a negative
20477   // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
20478   // enabled. See PR24512.
20479   SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
20480   // TODO: Are there any fast-math-flags to propagate here?
20481   //     (float4) lo;
20482   SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
20483   //     return (float4) lo + fhi;
20484   if (IsStrict) {
20485     SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other},
20486                                 {Op.getOperand(0), HighBitcast, VecCstFSub});
20487     return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other},
20488                        {FHigh.getValue(1), LowBitcast, FHigh});
20489   }
20490 
20491   SDValue FHigh =
20492       DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub);
20493   return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
20494 }
20495 
20496 static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG,
20497                                    const X86Subtarget &Subtarget) {
20498   unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
20499   SDValue N0 = Op.getOperand(OpNo);
20500   MVT SrcVT = N0.getSimpleValueType();
20501   SDLoc dl(Op);
20502 
20503   switch (SrcVT.SimpleTy) {
20504   default:
20505     llvm_unreachable("Custom UINT_TO_FP is not supported!");
20506   case MVT::v2i32:
20507     return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
20508   case MVT::v4i32:
20509   case MVT::v8i32:
20510     return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
20511   case MVT::v2i64:
20512   case MVT::v4i64:
20513     return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
20514   }
20515 }
20516 
20517 SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
20518                                            SelectionDAG &DAG) const {
20519   bool IsStrict = Op->isStrictFPOpcode();
20520   unsigned OpNo = IsStrict ? 1 : 0;
20521   SDValue Src = Op.getOperand(OpNo);
20522   SDLoc dl(Op);
20523   auto PtrVT = getPointerTy(DAG.getDataLayout());
20524   MVT SrcVT = Src.getSimpleValueType();
20525   MVT DstVT = Op->getSimpleValueType(0);
20526   SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
20527 
20528   if (DstVT == MVT::f128)
20529     return SDValue();
20530 
20531   if (DstVT.isVector())
20532     return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);
20533 
20534   if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
20535     return Extract;
20536 
20537   if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
20538       (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
20539     // Conversions from unsigned i32 to f32/f64 are legal,
20540     // using VCVTUSI2SS/SD.  Same for i64 in 64-bit mode.
20541     return Op;
20542   }
20543 
20544   // Promote i32 to i64 and use a signed conversion on 64-bit targets.
20545   if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
20546     Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src);
20547     if (IsStrict)
20548       return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other},
20549                          {Chain, Src});
20550     return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
20551   }
20552 
20553   if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
20554     return V;
20555 
20556   // The transform for i64->f64 isn't correct for 0 when rounding to negative
20557   // infinity. It produces -0.0, so disable under strictfp.
20558   if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64 && !IsStrict)
20559     return LowerUINT_TO_FP_i64(Op, DAG, Subtarget);
20560   if (SrcVT == MVT::i32 && X86ScalarSSEf64 && DstVT != MVT::f80)
20561     return LowerUINT_TO_FP_i32(Op, DAG, Subtarget);
20562   if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
20563       (DstVT == MVT::f32 || DstVT == MVT::f64))
20564     return SDValue();
20565 
20566   // Make a 64-bit buffer, and use it to build an FILD.
20567   SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8);
20568   int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
20569   Align SlotAlign(8);
20570   MachinePointerInfo MPI =
20571     MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
20572   if (SrcVT == MVT::i32) {
20573     SDValue OffsetSlot =
20574         DAG.getMemBasePlusOffset(StackSlot, TypeSize::Fixed(4), dl);
20575     SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign);
20576     SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
20577                                   OffsetSlot, MPI.getWithOffset(4), SlotAlign);
20578     std::pair<SDValue, SDValue> Tmp =
20579         BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG);
20580     if (IsStrict)
20581       return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
20582 
20583     return Tmp.first;
20584   }
20585 
20586   assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
20587   SDValue ValueToStore = Src;
20588   if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
20589     // Bitcasting to f64 here allows us to do a single 64-bit store from
20590     // an SSE register, avoiding the store forwarding penalty that would come
20591     // with two 32-bit stores.
20592     ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
20593   }
20594   SDValue Store =
20595       DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign);
20596   // For i64 source, we need to add the appropriate power of 2 if the input
20597   // was negative. We must be careful to do the computation in x87 extended
20598   // precision, not in SSE.
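  // FILD interprets the stored 64 bits as signed, so a source with the top bit
  // set comes back as (x - 2^64). The constant-pool word below packs 0.0f in
  // its low half and 2^64 (0x5F800000 as an f32) in its high half; selecting
  // offset 4 when the sign bit was set loads 2^64, and the extending add folds
  // it back in.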
20599   SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
20600   SDValue Ops[] = { Store, StackSlot };
20601   SDValue Fild =
20602       DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI,
20603                               SlotAlign, MachineMemOperand::MOLoad);
20604   Chain = Fild.getValue(1);
20605 
20606 
20607   // Check whether the sign bit is set.
20608   SDValue SignSet = DAG.getSetCC(
20609       dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
20610       Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
20611 
20612   // Build a 64 bit pair (FF, 0) in the constant pool, with FF in the hi bits.
20613   APInt FF(64, 0x5F80000000000000ULL);
20614   SDValue FudgePtr = DAG.getConstantPool(
20615       ConstantInt::get(*DAG.getContext(), FF), PtrVT);
20616   Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign();
20617 
20618   // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
20619   SDValue Zero = DAG.getIntPtrConstant(0, dl);
20620   SDValue Four = DAG.getIntPtrConstant(4, dl);
20621   SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero);
20622   FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
20623 
20624   // Load the value out, extending it from f32 to f80.
20625   SDValue Fudge = DAG.getExtLoad(
20626       ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
20627       MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
20628       CPAlignment);
20629   Chain = Fudge.getValue(1);
20630   // Extend everything to 80 bits to force it to be done on x87.
20631   // TODO: Are there any fast-math-flags to propagate here?
20632   if (IsStrict) {
20633     SDValue Add = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::f80, MVT::Other},
20634                               {Chain, Fild, Fudge});
20635     // STRICT_FP_ROUND can't handle equal types.
20636     if (DstVT == MVT::f80)
20637       return Add;
20638     return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
20639                        {Add.getValue(1), Add, DAG.getIntPtrConstant(0, dl)});
20640   }
20641   SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
20642   return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
20643                      DAG.getIntPtrConstant(0, dl));
20644 }
20645 
20646 // If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
20647 // is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
20648 // just return an SDValue().
20649 // Otherwise it is assumed to be a conversion from one of f32, f64 or f80
20650 // to i16, i32 or i64, and we lower it to a legal sequence and return the
20651 // result.
20652 SDValue
20653 X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
20654                                    bool IsSigned, SDValue &Chain) const {
20655   bool IsStrict = Op->isStrictFPOpcode();
20656   SDLoc DL(Op);
20657 
20658   EVT DstTy = Op.getValueType();
20659   SDValue Value = Op.getOperand(IsStrict ? 1 : 0);
20660   EVT TheVT = Value.getValueType();
20661   auto PtrVT = getPointerTy(DAG.getDataLayout());
20662 
20663   if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
20664     // f16 must be promoted before using the lowering in this routine.
20665     // fp128 does not use this lowering.
20666     return SDValue();
20667   }
20668 
20669   // If using FIST to compute an unsigned i64, we'll need some fixup
20670   // to handle values above the maximum signed i64.  A FIST is always
20671   // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
20672   bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
20673 
20674   // FIXME: This does not generate an invalid exception if the input does not
20675   // fit in i32. PR44019
20676   if (!IsSigned && DstTy != MVT::i64) {
20677     // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
20678     // The low 32 bits of the fist result will have the correct uint32 result.
20679     assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
20680     DstTy = MVT::i64;
20681   }
20682 
20683   assert(DstTy.getSimpleVT() <= MVT::i64 &&
20684          DstTy.getSimpleVT() >= MVT::i16 &&
20685          "Unknown FP_TO_INT to lower!");
20686 
20687   // We lower FP->int64 into FISTP64 followed by a load from a temporary
20688   // stack slot.
20689   MachineFunction &MF = DAG.getMachineFunction();
20690   unsigned MemSize = DstTy.getStoreSize();
20691   int SSFI =
20692       MF.getFrameInfo().CreateStackObject(MemSize, Align(MemSize), false);
20693   SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20694 
20695   Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
20696 
20697   SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
20698 
20699   if (UnsignedFixup) {
20700     //
20701     // Conversion to unsigned i64 is implemented with a select,
20702     // depending on whether the source value fits in the range
20703     // of a signed i64.  Let Thresh be the FP equivalent of
20704     // 0x8000000000000000ULL.
20705     //
20706     //  Adjust = (Value >= Thresh) ? 0x8000000000000000 : 0;
20707     //  FltOfs = (Value >= Thresh) ? Thresh : 0.0;
20708     //  FistSrc = (Value - FltOfs);
20709     //  Fist-to-mem64 FistSrc
20710     //  Add 0 or 0x8000000000000000ULL to the 64-bit result, which is
20711     //  equivalent to XOR'ing the 64-bit result with Adjust.
20712     //
20713     // Being a power of 2, Thresh is exactly representable in all FP formats.
20714     // For X87 we'd like to use the smallest FP type for this constant, but
20715     // for DAG type consistency we have to match the FP operand type.
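    // For example (illustrative): converting 0xC000000000000000 (2^63 + 2^62
    // as an unsigned value, so >= Thresh): FltOfs = Thresh, the FIST sees 2^62
    // and stores 0x4000000000000000, and XOR'ing with Adjust
    // (0x8000000000000000) restores 0xC000000000000000.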
20716 
20717     APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
20718     LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
20719     bool LosesInfo = false;
20720     if (TheVT == MVT::f64)
20721       // The rounding mode is irrelevant as the conversion should be exact.
20722       Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
20723                               &LosesInfo);
20724     else if (TheVT == MVT::f80)
20725       Status = Thresh.convert(APFloat::x87DoubleExtended(),
20726                               APFloat::rmNearestTiesToEven, &LosesInfo);
20727 
20728     assert(Status == APFloat::opOK && !LosesInfo &&
20729            "FP conversion should have been exact");
20730 
20731     SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
20732 
20733     EVT ResVT = getSetCCResultType(DAG.getDataLayout(),
20734                                    *DAG.getContext(), TheVT);
20735     SDValue Cmp;
20736     if (IsStrict) {
20737       Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain,
20738                          /*IsSignaling*/ true);
20739       Chain = Cmp.getValue(1);
20740     } else {
20741       Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE);
20742     }
20743 
20744     // Our preferred lowering of
20745     //
20746     // (Value >= Thresh) ? 0x8000000000000000ULL : 0
20747     //
20748     // is
20749     //
20750     // (Value >= Thresh) << 63
20751     //
20752     // but since we can get here after LegalOperations, DAGCombine might do the
20753     // wrong thing if we create a select. So, directly create the preferred
20754     // version.
20755     SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Cmp);
20756     SDValue Const63 = DAG.getConstant(63, DL, MVT::i8);
20757     Adjust = DAG.getNode(ISD::SHL, DL, MVT::i64, Zext, Const63);
20758 
20759     SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp, ThreshVal,
20760                                    DAG.getConstantFP(0.0, DL, TheVT));
20761 
20762     if (IsStrict) {
20763       Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
20764                           { Chain, Value, FltOfs });
20765       Chain = Value.getValue(1);
20766     } else
20767       Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);
20768   }
20769 
20770   MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
20771 
20772   // FIXME This causes a redundant load/store if the SSE-class value is already
20773   // in memory, such as if it is on the callstack.
20774   if (isScalarFPTypeInSSEReg(TheVT)) {
20775     assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
20776     Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
20777     SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
20778     SDValue Ops[] = { Chain, StackSlot };
20779 
20780     unsigned FLDSize = TheVT.getStoreSize();
20781     assert(FLDSize <= MemSize && "Stack slot not big enough");
20782     MachineMemOperand *MMO = MF.getMachineMemOperand(
20783         MPI, MachineMemOperand::MOLoad, FLDSize, Align(FLDSize));
20784     Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
20785     Chain = Value.getValue(1);
20786   }
20787 
20788   // Build the FP_TO_INT*_IN_MEM
20789   MachineMemOperand *MMO = MF.getMachineMemOperand(
20790       MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize));
20791   SDValue Ops[] = { Chain, Value, StackSlot };
20792   SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
20793                                          DAG.getVTList(MVT::Other),
20794                                          Ops, DstTy, MMO);
20795 
20796   SDValue Res = DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot, MPI);
20797   Chain = Res.getValue(1);
20798 
20799   // If we need an unsigned fixup, XOR the result with adjust.
20800   if (UnsignedFixup)
20801     Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);
20802 
20803   return Res;
20804 }
20805 
20806 static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
20807                               const X86Subtarget &Subtarget) {
20808   MVT VT = Op.getSimpleValueType();
20809   SDValue In = Op.getOperand(0);
20810   MVT InVT = In.getSimpleValueType();
20811   SDLoc dl(Op);
20812   unsigned Opc = Op.getOpcode();
20813 
20814   assert(VT.isVector() && InVT.isVector() && "Expected vector type");
20815   assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) &&
20816          "Unexpected extension opcode");
20817   assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
20818          "Expected same number of elements");
20819   assert((VT.getVectorElementType() == MVT::i16 ||
20820           VT.getVectorElementType() == MVT::i32 ||
20821           VT.getVectorElementType() == MVT::i64) &&
20822          "Unexpected element type");
20823   assert((InVT.getVectorElementType() == MVT::i8 ||
20824           InVT.getVectorElementType() == MVT::i16 ||
20825           InVT.getVectorElementType() == MVT::i32) &&
20826          "Unexpected element type");
20827 
20828   unsigned ExtendInVecOpc = getOpcode_EXTEND_VECTOR_INREG(Opc);
20829 
20830   if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
20831     assert(InVT == MVT::v32i8 && "Unexpected VT!");
20832     return splitVectorIntUnary(Op, DAG);
20833   }
20834 
20835   if (Subtarget.hasInt256())
20836     return Op;
20837 
20838   // Optimize vectors in AVX mode:
20839   //
20840   //   v8i16 -> v8i32
20841   //   Use vpmovzwd for 4 lower elements  v8i16 -> v4i32.
20842   //   Use vpunpckhwd for 4 upper elements  v8i16 -> v4i32.
20843   //   Concat upper and lower parts.
20844   //
20845   //   v4i32 -> v4i64
20846   //   Use vpmovzdq for 4 lower elements  v4i32 -> v2i64.
20847   //   Use vpunpckhdq for 4 upper elements  v4i32 -> v2i64.
20848   //   Concat upper and lower parts.
20849   //
20850   MVT HalfVT = VT.getHalfNumVectorElementsVT();
20851   SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);
20852 
20853   // Short-circuit if we can determine that each 128-bit half is the same value.
20854   // Otherwise, this is difficult to match and optimize.
20855   if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
20856     if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
20857       return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo);
20858 
20859   SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
20860   SDValue Undef = DAG.getUNDEF(InVT);
20861   bool NeedZero = Opc == ISD::ZERO_EXTEND;
20862   SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
20863   OpHi = DAG.getBitcast(HalfVT, OpHi);
20864 
20865   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
20866 }
20867 
20868 // Helper to split and extend a v16i1 mask to v16i8 or v16i16.
20869 static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
20870                                    const SDLoc &dl, SelectionDAG &DAG) {
20871   assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
20872   SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
20873                            DAG.getIntPtrConstant(0, dl));
20874   SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
20875                            DAG.getIntPtrConstant(8, dl));
20876   Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
20877   Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
20878   SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
20879   return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
20880 }
20881 
20882 static  SDValue LowerZERO_EXTEND_Mask(SDValue Op,
20883                                       const X86Subtarget &Subtarget,
20884                                       SelectionDAG &DAG) {
20885   MVT VT = Op->getSimpleValueType(0);
20886   SDValue In = Op->getOperand(0);
20887   MVT InVT = In.getSimpleValueType();
20888   assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
20889   SDLoc DL(Op);
20890   unsigned NumElts = VT.getVectorNumElements();
20891 
20892   // For all vectors but vXi8, we can just emit a sign_extend and a shift. This
20893   // avoids a constant pool load.
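  // For example (illustrative): a zext of v8i1 to v8i32 becomes
  //   (srl (sign_extend v8i1 X to v8i32), 31)
  // turning the all-ones lanes produced by the sign_extend into ones.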
20894   if (VT.getVectorElementType() != MVT::i8) {
20895     SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
20896     return DAG.getNode(ISD::SRL, DL, VT, Extend,
20897                        DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
20898   }
20899 
20900   // Extend VT if BWI is not supported.
20901   MVT ExtVT = VT;
20902   if (!Subtarget.hasBWI()) {
20903     // If v16i32 is to be avoided, we'll need to split and concatenate.
20904     if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
20905       return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);
20906 
20907     ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
20908   }
20909 
20910   // Widen to 512-bits if VLX is not supported.
20911   MVT WideVT = ExtVT;
20912   if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
20913     NumElts *= 512 / ExtVT.getSizeInBits();
20914     InVT = MVT::getVectorVT(MVT::i1, NumElts);
20915     In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT),
20916                      In, DAG.getIntPtrConstant(0, DL));
20917     WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(),
20918                               NumElts);
20919   }
20920 
20921   SDValue One = DAG.getConstant(1, DL, WideVT);
20922   SDValue Zero = DAG.getConstant(0, DL, WideVT);
20923 
20924   SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
20925 
20926   // Truncate if we had to extend above.
20927   if (VT != ExtVT) {
20928     WideVT = MVT::getVectorVT(MVT::i8, NumElts);
20929     SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
20930   }
20931 
20932   // Extract back to 128/256-bit if we widened.
20933   if (WideVT != VT)
20934     SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
20935                               DAG.getIntPtrConstant(0, DL));
20936 
20937   return SelectedVal;
20938 }
20939 
20940 static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
20941                                 SelectionDAG &DAG) {
20942   SDValue In = Op.getOperand(0);
20943   MVT SVT = In.getSimpleValueType();
20944 
20945   if (SVT.getVectorElementType() == MVT::i1)
20946     return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG);
20947 
20948   assert(Subtarget.hasAVX() && "Expected AVX support");
20949   return LowerAVXExtend(Op, DAG, Subtarget);
20950 }
20951 
20952 /// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
20953 /// It makes use of the fact that vectors with enough leading sign/zero bits
20954 /// prevent the PACKSS/PACKUS from saturating the results.
20955 /// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
20956 /// within each 128-bit lane.
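/// For example (illustrative): a v8i32 -> v8i16 truncate whose source is known
/// to have at least 17 sign bits per element can be emitted as a single
/// PACKSSDW of the two 128-bit halves with no risk of saturation.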
20957 static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
20958                                       const SDLoc &DL, SelectionDAG &DAG,
20959                                       const X86Subtarget &Subtarget) {
20960   assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
20961          "Unexpected PACK opcode");
20962   assert(DstVT.isVector() && "VT not a vector?");
20963 
20964   // Requires SSE2 for PACKSS (SSE41 PACKUSDW is handled below).
20965   if (!Subtarget.hasSSE2())
20966     return SDValue();
20967 
20968   EVT SrcVT = In.getValueType();
20969 
20970   // No truncation required, we might get here due to recursive calls.
20971   if (SrcVT == DstVT)
20972     return In;
20973 
20974   // We only support vector truncation to 64 bits or greater from a
20975   // 128-bit or greater source.
20976   unsigned DstSizeInBits = DstVT.getSizeInBits();
20977   unsigned SrcSizeInBits = SrcVT.getSizeInBits();
20978   if ((DstSizeInBits % 64) != 0 || (SrcSizeInBits % 128) != 0)
20979     return SDValue();
20980 
20981   unsigned NumElems = SrcVT.getVectorNumElements();
20982   if (!isPowerOf2_32(NumElems))
20983     return SDValue();
20984 
20985   LLVMContext &Ctx = *DAG.getContext();
20986   assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
20987   assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
20988 
20989   EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
20990 
20991   // Pack to the largest type possible:
20992   // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
20993   EVT InVT = MVT::i16, OutVT = MVT::i8;
20994   if (SrcVT.getScalarSizeInBits() > 16 &&
20995       (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
20996     InVT = MVT::i32;
20997     OutVT = MVT::i16;
20998   }
20999 
21000   // 128bit -> 64bit truncate - PACK 128-bit src in the lower subvector.
21001   if (SrcVT.is128BitVector()) {
21002     InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
21003     OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
21004     In = DAG.getBitcast(InVT, In);
21005     SDValue Res = DAG.getNode(Opcode, DL, OutVT, In, DAG.getUNDEF(InVT));
21006     Res = extractSubVector(Res, 0, DAG, DL, 64);
21007     return DAG.getBitcast(DstVT, Res);
21008   }
21009 
21010   // Split lower/upper subvectors.
21011   SDValue Lo, Hi;
21012   std::tie(Lo, Hi) = splitVector(In, DAG, DL);
21013 
21014   unsigned SubSizeInBits = SrcSizeInBits / 2;
21015   InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
21016   OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
21017 
21018   // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
21019   if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
21020     Lo = DAG.getBitcast(InVT, Lo);
21021     Hi = DAG.getBitcast(InVT, Hi);
21022     SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
21023     return DAG.getBitcast(DstVT, Res);
21024   }
21025 
21026   // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
21027   // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
21028   if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
21029     Lo = DAG.getBitcast(InVT, Lo);
21030     Hi = DAG.getBitcast(InVT, Hi);
21031     SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
21032 
21033     // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
21034     // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
21035     // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
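    // For example (illustrative): with v16i16 PACK results, Scale == 4 and the
    // scaled mask is {0,1,2,3, 8,9,10,11, 4,5,6,7, 12,13,14,15}, i.e. the
    // 64-bit-element permute {0,2,1,3}.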
21036     SmallVector<int, 64> Mask;
21037     int Scale = 64 / OutVT.getScalarSizeInBits();
21038     narrowShuffleMaskElts(Scale, { 0, 2, 1, 3 }, Mask);
21039     Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);
21040 
21041     if (DstVT.is256BitVector())
21042       return DAG.getBitcast(DstVT, Res);
21043 
21044     // If this is a 512-bit -> 128-bit truncate, run another PACK stage.
21045     EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
21046     Res = DAG.getBitcast(PackedVT, Res);
21047     return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21048   }
21049 
21050   // Recursively pack lower/upper subvectors, concat result and pack again.
21051   assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
21052   EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems / 2);
21053   Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget);
21054   Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget);
21055 
21056   PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
21057   SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
21058   return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21059 }
21060 
21061 static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
21062                                   const X86Subtarget &Subtarget) {
21063 
21064   SDLoc DL(Op);
21065   MVT VT = Op.getSimpleValueType();
21066   SDValue In = Op.getOperand(0);
21067   MVT InVT = In.getSimpleValueType();
21068 
21069   assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
21070 
21071   // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
21072   unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
21073   if (InVT.getScalarSizeInBits() <= 16) {
21074     if (Subtarget.hasBWI()) {
21075       // legal, will go to VPMOVB2M, VPMOVW2M
21076       if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
21077         // We need to shift to get the lsb into sign position.
21078         // Shifting packed bytes is not supported natively, so bitcast to words.
21079         MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
21080         In = DAG.getNode(ISD::SHL, DL, ExtVT,
21081                          DAG.getBitcast(ExtVT, In),
21082                          DAG.getConstant(ShiftInx, DL, ExtVT));
21083         In = DAG.getBitcast(InVT, In);
21084       }
21085       return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
21086                           In, ISD::SETGT);
21087     }
21088     // Use TESTD/Q, extended vector to packed dword/qword.
21089     assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
21090            "Unexpected vector type.");
21091     unsigned NumElts = InVT.getVectorNumElements();
21092     assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
21093     // We need to change to a wider element type that we have support for.
21094     // For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
21095     // For 16 element vectors we extend to v16i32 unless we are explicitly
21096     // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
21097     // we need to split into two 8 element vectors which we can extend to v8i32,
21098     // truncate and concat the results. There's an additional complication if
21099     // the original type is v16i8. In that case we can't split the v16i8
21100     // directly, so we need to shuffle high elements to low and use
21101     // sign_extend_vector_inreg.
21102     if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
21103       SDValue Lo, Hi;
21104       if (InVT == MVT::v16i8) {
21105         Lo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, In);
21106         Hi = DAG.getVectorShuffle(
21107             InVT, DL, In, In,
21108             {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
21109         Hi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, Hi);
21110       } else {
21111         assert(InVT == MVT::v16i16 && "Unexpected VT!");
21112         Lo = extract128BitVector(In, 0, DAG, DL);
21113         Hi = extract128BitVector(In, 8, DAG, DL);
21114       }
21115       // We're split now, just emit two truncates and a concat. The two
21116       // truncates will trigger legalization to come back to this function.
21117       Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
21118       Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
21119       return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
21120     }
21121     // We either have 8 elements or we're allowed to use 512-bit vectors.
21122     // If we have VLX, we want to use the narrowest vector that can get the
21123     // job done so we use vXi32.
21124     MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
21125     MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
21126     In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
21127     InVT = ExtVT;
21128     ShiftInx = InVT.getScalarSizeInBits() - 1;
21129   }
21130 
21131   if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
21132     // We need to shift to get the lsb into sign position.
21133     In = DAG.getNode(ISD::SHL, DL, InVT, In,
21134                      DAG.getConstant(ShiftInx, DL, InVT));
21135   }
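  // At this point every element of In is either all-zeros or has its sign bit
  // set (all-ones once sign-extended), so the i1 result can be produced either
  // by a signed compare against zero or by a not-equal compare against zero.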
21136   // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
21137   if (Subtarget.hasDQI())
21138     return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT);
21139   return DAG.getSetCC(DL, VT, In, DAG.getConstant(0, DL, InVT), ISD::SETNE);
21140 }
21141 
21142 SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
21143   SDLoc DL(Op);
21144   MVT VT = Op.getSimpleValueType();
21145   SDValue In = Op.getOperand(0);
21146   MVT InVT = In.getSimpleValueType();
21147   unsigned InNumEltBits = InVT.getScalarSizeInBits();
21148 
21149   assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
21150          "Invalid TRUNCATE operation");
21151 
21152   // If we're called by the type legalizer, handle a few cases.
21153   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21154   if (!TLI.isTypeLegal(InVT)) {
21155     if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
21156         VT.is128BitVector()) {
21157       assert((InVT == MVT::v16i64 || Subtarget.hasVLX()) &&
21158              "Unexpected subtarget!");
21159       // The default behavior is to truncate one step, concatenate, and then
21160       // truncate the remainder. We'd rather produce two 64-bit results and
21161       // concatenate those.
21162       SDValue Lo, Hi;
21163       std::tie(Lo, Hi) = DAG.SplitVector(In, DL);
21164 
21165       EVT LoVT, HiVT;
21166       std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
21167 
21168       Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo);
21169       Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi);
21170       return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
21171     }
21172 
21173     // Otherwise let default legalization handle it.
21174     return SDValue();
21175   }
21176 
21177   if (VT.getVectorElementType() == MVT::i1)
21178     return LowerTruncateVecI1(Op, DAG, Subtarget);
21179 
21180   // vpmovqb/w/d, vpmovdb/w, vpmovwb
21181   if (Subtarget.hasAVX512()) {
21182     if (InVT == MVT::v32i16 && !Subtarget.hasBWI()) {
21183       assert(VT == MVT::v32i8 && "Unexpected VT!");
21184       return splitVectorIntUnary(Op, DAG);
21185     }
21186 
21187     // Word to byte truncation is only legal with BWI. Otherwise we have to promote to v16i32
21188     // and then truncate that. But we should only do that if we haven't been
21189     // asked to avoid 512-bit vectors. The actual promotion to v16i32 will be
21190     // handled by isel patterns.
21191     if (InVT != MVT::v16i16 || Subtarget.hasBWI() ||
21192         Subtarget.canExtendTo512DQ())
21193       return Op;
21194   }
21195 
21196   unsigned NumPackedSignBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
21197   unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
21198 
21199   // Truncate with PACKUS if we are truncating a vector with leading zero bits
21200   // that extend all the way to the packed/truncated value.
21201   // Pre-SSE41 we can only use PACKUSWB.
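  // For example, a v8i32 -> v8i16 truncation whose inputs are known to have at
  // least 16 leading zero bits can be lowered with PACKUSDW on SSE4.1+ rather
  // than the shuffle sequences below.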
21202   KnownBits Known = DAG.computeKnownBits(In);
21203   if ((InNumEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros())
21204     if (SDValue V =
21205             truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget))
21206       return V;
21207 
21208   // Truncate with PACKSS if we are truncating a vector with sign-bits that
21209   // extend all the way to the packed/truncated value.
21210   if ((InNumEltBits - NumPackedSignBits) < DAG.ComputeNumSignBits(In))
21211     if (SDValue V =
21212             truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget))
21213       return V;
21214 
21215   // Handle truncation of V256 to V128 using shuffles.
21216   assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
21217 
21218   if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
21219     In = DAG.getBitcast(MVT::v8i32, In);
21220 
21221     // On AVX2, v4i64 -> v4i32 becomes VPERMD.
21222     if (Subtarget.hasInt256()) {
21223       static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
21224       In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
21225       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
21226                          DAG.getIntPtrConstant(0, DL));
21227     }
21228 
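    // Without AVX2, extract the two 128-bit halves as v4i32 and pick the even
    // dwords (the low 32 bits of each i64) from each half.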
21229     SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
21230                                DAG.getIntPtrConstant(0, DL));
21231     SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
21232                                DAG.getIntPtrConstant(4, DL));
21233     static const int ShufMask[] = {0, 2, 4, 6};
21234     return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
21235   }
21236 
21237   if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
21238     In = DAG.getBitcast(MVT::v32i8, In);
21239 
21240     // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
21241     if (Subtarget.hasInt256()) {
21242       // The PSHUFB mask:
21243       static const int ShufMask1[] = { 0,  1,  4,  5,  8,  9, 12, 13,
21244                                       -1, -1, -1, -1, -1, -1, -1, -1,
21245                                       16, 17, 20, 21, 24, 25, 28, 29,
21246                                       -1, -1, -1, -1, -1, -1, -1, -1 };
21247       In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
21248       In = DAG.getBitcast(MVT::v4i64, In);
21249 
21250       static const int ShufMask2[] = {0, 2, -1, -1};
21251       In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
21252       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16,
21253                          DAG.getBitcast(MVT::v16i16, In),
21254                          DAG.getIntPtrConstant(0, DL));
21255     }
21256 
21257     SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, In,
21258                                DAG.getIntPtrConstant(0, DL));
21259     SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, In,
21260                                DAG.getIntPtrConstant(16, DL));
21261 
21262     // The PSHUFB mask:
21263     static const int ShufMask1[] = {0,  1,  4,  5,  8,  9, 12, 13,
21264                                    -1, -1, -1, -1, -1, -1, -1, -1};
21265 
21266     OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1);
21267     OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1);
21268 
21269     OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
21270     OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
21271 
21272     // The MOVLHPS Mask:
21273     static const int ShufMask2[] = {0, 1, 4, 5};
21274     SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
21275     return DAG.getBitcast(MVT::v8i16, res);
21276   }
21277 
21278   if (VT == MVT::v16i8 && InVT == MVT::v16i16) {
21279     // Use an AND to zero the upper bits for PACKUS.
21280     In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(255, DL, InVT));
21281 
21282     SDValue InLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
21283                                DAG.getIntPtrConstant(0, DL));
21284     SDValue InHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
21285                                DAG.getIntPtrConstant(8, DL));
21286     return DAG.getNode(X86ISD::PACKUS, DL, VT, InLo, InHi);
21287   }
21288 
21289   llvm_unreachable("All 256->128 cases should have been handled above!");
21290 }
21291 
21292 // We can leverage the specific way the "cvttps2dq/cvttpd2dq" instruction
21293 // behaves on out of range inputs to generate optimized conversions.
21294 static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl,
21295                                     SelectionDAG &DAG,
21296                                     const X86Subtarget &Subtarget) {
21297   MVT SrcVT = Src.getSimpleValueType();
21298   unsigned DstBits = VT.getScalarSizeInBits();
21299   assert(DstBits == 32 && "expandFP_TO_UINT_SSE - only vXi32 supported");
21300 
21301   // Calculate the converted result for values in the range 0 to
21302   // 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
21303   SDValue Small = DAG.getNode(X86ISD::CVTTP2SI, dl, VT, Src);
21304   SDValue Big =
21305       DAG.getNode(X86ISD::CVTTP2SI, dl, VT,
21306                   DAG.getNode(ISD::FSUB, dl, SrcVT, Src,
21307                               DAG.getConstantFP(2147483648.0f, dl, SrcVT)));
21308 
21309   // The "CVTTP2SI" instruction conveniently sets the sign bit if
21310   // and only if the value was out of range. So we can use that
21311   // as our indicator that we should use "Big" instead of "Small".
21312   //
21313   // Use "Small" if "IsOverflown" has all bits cleared
21314   // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
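  // For example, converting 3e9 (exactly representable as a float): Small is
  // the out-of-range indefinite value 0x80000000, Big = cvttps2dq(3e9 - 2^31)
  // = 0x32D05E00, and 0x80000000 | 0x32D05E00 = 0xB2D05E00 = 3000000000.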
21315 
21316   // AVX1 can't use the signsplat masking for 256-bit vectors - we have to
21317   // use the slightly slower blendv select instead.
21318   if (VT == MVT::v8i32 && !Subtarget.hasAVX2()) {
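    // BLENDV selects Overflow for elements where the sign bit of Small is set
    // (i.e. the signed conversion overflowed) and keeps Small otherwise.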
21319     SDValue Overflow = DAG.getNode(ISD::OR, dl, VT, Small, Big);
21320     return DAG.getNode(X86ISD::BLENDV, dl, VT, Small, Overflow, Small);
21321   }
21322 
21323   SDValue IsOverflown =
21324       DAG.getNode(X86ISD::VSRAI, dl, VT, Small,
21325                   DAG.getTargetConstant(DstBits - 1, dl, MVT::i8));
21326   return DAG.getNode(ISD::OR, dl, VT, Small,
21327                      DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
21328 }
21329 
21330 SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
21331   bool IsStrict = Op->isStrictFPOpcode();
21332   bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
21333                   Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
21334   MVT VT = Op->getSimpleValueType(0);
21335   SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21336   MVT SrcVT = Src.getSimpleValueType();
21337   SDLoc dl(Op);
21338 
21339   if (VT.isVector()) {
21340     if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
21341       MVT ResVT = MVT::v4i32;
21342       MVT TruncVT = MVT::v4i1;
21343       unsigned Opc;
21344       if (IsStrict)
21345         Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
21346       else
21347         Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
21348 
21349       if (!IsSigned && !Subtarget.hasVLX()) {
21350         assert(Subtarget.useAVX512Regs() && "Unexpected features!");
21351         // Widen to 512-bits.
21352         ResVT = MVT::v8i32;
21353         TruncVT = MVT::v8i1;
21354         Opc = Op.getOpcode();
21355         // Need to concat with zero vector for strict fp to avoid spurious
21356         // exceptions.
21357         // TODO: Should we just do this for non-strict as well?
21358         SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v8f64)
21359                                : DAG.getUNDEF(MVT::v8f64);
21360         Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, Tmp, Src,
21361                           DAG.getIntPtrConstant(0, dl));
21362       }
21363       SDValue Res, Chain;
21364       if (IsStrict) {
21365         Res =
21366             DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Op->getOperand(0), Src});
21367         Chain = Res.getValue(1);
21368       } else {
21369         Res = DAG.getNode(Opc, dl, ResVT, Src);
21370       }
21371 
21372       Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
21373       Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
21374                         DAG.getIntPtrConstant(0, dl));
21375       if (IsStrict)
21376         return DAG.getMergeValues({Res, Chain}, dl);
21377       return Res;
21378     }
21379 
21380     // v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32.
21381     if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) {
21382       assert(!IsSigned && "Expected unsigned conversion!");
21383       assert(Subtarget.useAVX512Regs() && "Requires avx512f");
21384       return Op;
21385     }
21386 
21387     // Widen vXi32 fp_to_uint with avx512f to 512-bit source.
21388     if ((VT == MVT::v4i32 || VT == MVT::v8i32) &&
21389         (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32) &&
21390         Subtarget.useAVX512Regs()) {
21391       assert(!IsSigned && "Expected unsigned conversion!");
21392       assert(!Subtarget.hasVLX() && "Unexpected features!");
21393       MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
21394       MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
21395       // Need to concat with zero vector for strict fp to avoid spurious
21396       // exceptions.
21397       // TODO: Should we just do this for non-strict as well?
21398       SDValue Tmp =
21399           IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
21400       Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
21401                         DAG.getIntPtrConstant(0, dl));
21402 
21403       SDValue Res, Chain;
21404       if (IsStrict) {
21405         Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, dl, {ResVT, MVT::Other},
21406                           {Op->getOperand(0), Src});
21407         Chain = Res.getValue(1);
21408       } else {
21409         Res = DAG.getNode(ISD::FP_TO_UINT, dl, ResVT, Src);
21410       }
21411 
21412       Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21413                         DAG.getIntPtrConstant(0, dl));
21414 
21415       if (IsStrict)
21416         return DAG.getMergeValues({Res, Chain}, dl);
21417       return Res;
21418     }
21419 
21420     // Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source.
21421     if ((VT == MVT::v2i64 || VT == MVT::v4i64) &&
21422         (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32) &&
21423         Subtarget.useAVX512Regs() && Subtarget.hasDQI()) {
21424       assert(!Subtarget.hasVLX() && "Unexpected features!");
21425       MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
21426       // Need to concat with zero vector for strict fp to avoid spurious
21427       // exceptions.
21428       // TODO: Should we just do this for non-strict as well?
21429       SDValue Tmp =
21430           IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
21431       Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
21432                         DAG.getIntPtrConstant(0, dl));
21433 
21434       SDValue Res, Chain;
21435       if (IsStrict) {
21436         Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
21437                           {Op->getOperand(0), Src});
21438         Chain = Res.getValue(1);
21439       } else {
21440         Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src);
21441       }
21442 
21443       Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21444                         DAG.getIntPtrConstant(0, dl));
21445 
21446       if (IsStrict)
21447         return DAG.getMergeValues({Res, Chain}, dl);
21448       return Res;
21449     }
21450 
21451     if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
21452       if (!Subtarget.hasVLX()) {
21453         // Non-strict nodes without VLX can be widened to v4f32->v4i64 by the type
21454         // legalizer and then widened again by vector op legalization.
21455         if (!IsStrict)
21456           return SDValue();
21457 
21458         SDValue Zero = DAG.getConstantFP(0.0, dl, MVT::v2f32);
21459         SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f32,
21460                                   {Src, Zero, Zero, Zero});
21461         Tmp = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
21462                           {Op->getOperand(0), Tmp});
21463         SDValue Chain = Tmp.getValue(1);
21464         Tmp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Tmp,
21465                           DAG.getIntPtrConstant(0, dl));
21466         return DAG.getMergeValues({Tmp, Chain}, dl);
21467       }
21468 
21469       assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL");
21470       SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
21471                                 DAG.getUNDEF(MVT::v2f32));
21472       if (IsStrict) {
21473         unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI
21474                                 : X86ISD::STRICT_CVTTP2UI;
21475         return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp});
21476       }
21477       unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
21478       return DAG.getNode(Opc, dl, VT, Tmp);
21479     }
21480 
21481     // Generate optimized instructions for pre AVX512 unsigned conversions from
21482     // vXf32 to vXi32.
21483     if ((VT == MVT::v4i32 && SrcVT == MVT::v4f32) ||
21484         (VT == MVT::v4i32 && SrcVT == MVT::v4f64) ||
21485         (VT == MVT::v8i32 && SrcVT == MVT::v8f32)) {
21486       assert(!IsSigned && "Expected unsigned conversion!");
21487       return expandFP_TO_UINT_SSE(VT, Src, dl, DAG, Subtarget);
21488     }
21489 
21490     return SDValue();
21491   }
21492 
21493   assert(!VT.isVector());
21494 
21495   bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT);
21496 
21497   if (!IsSigned && UseSSEReg) {
21498     // Conversions from f32/f64 with AVX512 should be legal.
21499     if (Subtarget.hasAVX512())
21500       return Op;
21501 
21502     // We can leverage the specific way the "cvttss2si/cvttsd2si" instruction
21503     // behaves on out of range inputs to generate optimized conversions.
21504     if (!IsStrict && ((VT == MVT::i32 && !Subtarget.is64Bit()) ||
21505                       (VT == MVT::i64 && Subtarget.is64Bit()))) {
21506       unsigned DstBits = VT.getScalarSizeInBits();
21507       APInt UIntLimit = APInt::getSignMask(DstBits);
21508       SDValue FloatOffset = DAG.getNode(ISD::UINT_TO_FP, dl, SrcVT,
21509                                         DAG.getConstant(UIntLimit, dl, VT));
21510       MVT SrcVecVT = MVT::getVectorVT(SrcVT, 128 / SrcVT.getScalarSizeInBits());
21511 
21512       // Calculate the converted result for values in the range:
21513       // (i32) 0 to 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
21514       // (i64) 0 to 2^63-1 ("Small") and from 2^63 to 2^64-1 ("Big").
21515       SDValue Small =
21516           DAG.getNode(X86ISD::CVTTS2SI, dl, VT,
21517                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT, Src));
21518       SDValue Big = DAG.getNode(
21519           X86ISD::CVTTS2SI, dl, VT,
21520           DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT,
21521                       DAG.getNode(ISD::FSUB, dl, SrcVT, Src, FloatOffset)));
21522 
21523       // The "CVTTS2SI" instruction conveniently sets the sign bit if
21524       // and only if the value was out of range. So we can use that
21525       // as our indicator that we should use "Big" instead of "Small".
21526       //
21527       // Use "Small" if "IsOverflown" has all bits cleared
21528       // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
21529       SDValue IsOverflown = DAG.getNode(
21530           ISD::SRA, dl, VT, Small, DAG.getConstant(DstBits - 1, dl, MVT::i8));
21531       return DAG.getNode(ISD::OR, dl, VT, Small,
21532                          DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
21533     }
21534 
21535     // Use default expansion for i64.
21536     if (VT == MVT::i64)
21537       return SDValue();
21538 
21539     assert(VT == MVT::i32 && "Unexpected VT!");
21540 
21541     // Promote i32 to i64 and use a signed operation on 64-bit targets.
21542     // FIXME: This does not generate an invalid exception if the input does not
21543     // fit in i32. PR44019
21544     if (Subtarget.is64Bit()) {
21545       SDValue Res, Chain;
21546       if (IsStrict) {
21547         Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, { MVT::i64, MVT::Other},
21548                           { Op.getOperand(0), Src });
21549         Chain = Res.getValue(1);
21550       } else
21551         Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src);
21552 
21553       Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21554       if (IsStrict)
21555         return DAG.getMergeValues({ Res, Chain }, dl);
21556       return Res;
21557     }
21558 
21559     // Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can
21560     // use fisttp which will be handled later.
21561     if (!Subtarget.hasSSE3())
21562       return SDValue();
21563   }
21564 
21565   // Promote i16 to i32 if we can use a SSE operation or the type is f128.
21566   // FIXME: This does not generate an invalid exception if the input does not
21567   // fit in i16. PR44019
21568   if (VT == MVT::i16 && (UseSSEReg || SrcVT == MVT::f128)) {
21569     assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
21570     SDValue Res, Chain;
21571     if (IsStrict) {
21572       Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, { MVT::i32, MVT::Other},
21573                         { Op.getOperand(0), Src });
21574       Chain = Res.getValue(1);
21575     } else
21576       Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
21577 
21578     Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21579     if (IsStrict)
21580       return DAG.getMergeValues({ Res, Chain }, dl);
21581     return Res;
21582   }
21583 
21584   // If this is a FP_TO_SINT using SSEReg we're done.
21585   if (UseSSEReg && IsSigned)
21586     return Op;
21587 
21588   // fp128 needs to use a libcall.
21589   if (SrcVT == MVT::f128) {
21590     RTLIB::Libcall LC;
21591     if (IsSigned)
21592       LC = RTLIB::getFPTOSINT(SrcVT, VT);
21593     else
21594       LC = RTLIB::getFPTOUINT(SrcVT, VT);
21595 
21596     SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
21597     MakeLibCallOptions CallOptions;
21598     std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, LC, VT, Src, CallOptions,
21599                                                   SDLoc(Op), Chain);
21600 
21601     if (IsStrict)
21602       return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
21603 
21604     return Tmp.first;
21605   }
21606 
21607   // Fall back to X87.
21608   SDValue Chain;
21609   if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) {
21610     if (IsStrict)
21611       return DAG.getMergeValues({V, Chain}, dl);
21612     return V;
21613   }
21614 
21615   llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.");
21616 }
21617 
21618 SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
21619                                              SelectionDAG &DAG) const {
21620   SDValue Src = Op.getOperand(0);
21621   MVT SrcVT = Src.getSimpleValueType();
21622 
21623   // If the source is in an SSE register, the node is Legal.
21624   if (isScalarFPTypeInSSEReg(SrcVT))
21625     return Op;
21626 
21627   return LRINT_LLRINTHelper(Op.getNode(), DAG);
21628 }
21629 
21630 SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
21631                                               SelectionDAG &DAG) const {
21632   EVT DstVT = N->getValueType(0);
21633   SDValue Src = N->getOperand(0);
21634   EVT SrcVT = Src.getValueType();
21635 
21636   if (SrcVT != MVT::f32 && SrcVT != MVT::f64 && SrcVT != MVT::f80) {
21637     // f16 must be promoted before using the lowering in this routine.
21638     // fp128 does not use this lowering.
21639     return SDValue();
21640   }
21641 
21642   SDLoc DL(N);
21643   SDValue Chain = DAG.getEntryNode();
21644 
21645   bool UseSSE = isScalarFPTypeInSSEReg(SrcVT);
21646 
21647   // If we're converting from SSE, the stack slot needs to hold both types.
21648   // Otherwise it only needs to hold the DstVT.
21649   EVT OtherVT = UseSSE ? SrcVT : DstVT;
21650   SDValue StackPtr = DAG.CreateStackTemporary(DstVT, OtherVT);
21651   int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
21652   MachinePointerInfo MPI =
21653       MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
21654 
21655   if (UseSSE) {
21656     assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!");
21657     Chain = DAG.getStore(Chain, DL, Src, StackPtr, MPI);
21658     SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
21659     SDValue Ops[] = { Chain, StackPtr };
21660 
21661     Src = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, SrcVT, MPI,
21662                                   /*Align*/ None, MachineMemOperand::MOLoad);
21663     Chain = Src.getValue(1);
21664   }
21665 
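  // X86ISD::FIST stores the value as an integer using the current FPU rounding
  // mode, which matches the rounding behavior lrint/llrint require.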
21666   SDValue StoreOps[] = { Chain, Src, StackPtr };
21667   Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, DL, DAG.getVTList(MVT::Other),
21668                                   StoreOps, DstVT, MPI, /*Align*/ None,
21669                                   MachineMemOperand::MOStore);
21670 
21671   return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI);
21672 }
21673 
21674 SDValue
21675 X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
21676   // This is based on the TargetLowering::expandFP_TO_INT_SAT implementation,
21677   // but making use of X86 specifics to produce better instruction sequences.
21678   SDNode *Node = Op.getNode();
21679   bool IsSigned = Node->getOpcode() == ISD::FP_TO_SINT_SAT;
21680   unsigned FpToIntOpcode = IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
21681   SDLoc dl(SDValue(Node, 0));
21682   SDValue Src = Node->getOperand(0);
21683 
21684   // There are three types involved here: SrcVT is the source floating point
21685   // type, DstVT is the type of the result, and TmpVT is the result of the
21686   // intermediate FP_TO_*INT operation we'll use (which may be a promotion of
21687   // DstVT).
21688   EVT SrcVT = Src.getValueType();
21689   EVT DstVT = Node->getValueType(0);
21690   EVT TmpVT = DstVT;
21691 
21692   // This code is only for floats and doubles. Fall back to generic code for
21693   // anything else.
21694   if (!isScalarFPTypeInSSEReg(SrcVT))
21695     return SDValue();
21696 
21697   EVT SatVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
21698   unsigned SatWidth = SatVT.getScalarSizeInBits();
21699   unsigned DstWidth = DstVT.getScalarSizeInBits();
21700   unsigned TmpWidth = TmpVT.getScalarSizeInBits();
21701   assert(SatWidth <= DstWidth && SatWidth <= TmpWidth &&
21702          "Expected saturation width smaller than result width");
21703 
21704   // Promote result of FP_TO_*INT to at least 32 bits.
21705   if (TmpWidth < 32) {
21706     TmpVT = MVT::i32;
21707     TmpWidth = 32;
21708   }
21709 
21710   // Promote conversions to unsigned 32-bit to 64-bit, because it will allow
21711   // us to use a native signed conversion instead.
21712   if (SatWidth == 32 && !IsSigned && Subtarget.is64Bit()) {
21713     TmpVT = MVT::i64;
21714     TmpWidth = 64;
21715   }
21716 
21717   // If the saturation width is smaller than the size of the temporary result,
21718   // we can always use signed conversion, which is native.
21719   if (SatWidth < TmpWidth)
21720     FpToIntOpcode = ISD::FP_TO_SINT;
21721 
21722   // Determine minimum and maximum integer values and their corresponding
21723   // floating-point values.
21724   APInt MinInt, MaxInt;
21725   if (IsSigned) {
21726     MinInt = APInt::getSignedMinValue(SatWidth).sextOrSelf(DstWidth);
21727     MaxInt = APInt::getSignedMaxValue(SatWidth).sextOrSelf(DstWidth);
21728   } else {
21729     MinInt = APInt::getMinValue(SatWidth).zextOrSelf(DstWidth);
21730     MaxInt = APInt::getMaxValue(SatWidth).zextOrSelf(DstWidth);
21731   }
21732 
21733   APFloat MinFloat(DAG.EVTToAPFloatSemantics(SrcVT));
21734   APFloat MaxFloat(DAG.EVTToAPFloatSemantics(SrcVT));
21735 
21736   APFloat::opStatus MinStatus = MinFloat.convertFromAPInt(
21737     MinInt, IsSigned, APFloat::rmTowardZero);
21738   APFloat::opStatus MaxStatus = MaxFloat.convertFromAPInt(
21739     MaxInt, IsSigned, APFloat::rmTowardZero);
21740   bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact)
21741                           && !(MaxStatus & APFloat::opStatus::opInexact);
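  // For example, for an f32 source and signed i32 saturation, MaxInt = 2^31-1
  // is not exactly representable as a float, so AreExactFloatBounds is false;
  // with an f64 source both bounds are exact.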
21742 
21743   SDValue MinFloatNode = DAG.getConstantFP(MinFloat, dl, SrcVT);
21744   SDValue MaxFloatNode = DAG.getConstantFP(MaxFloat, dl, SrcVT);
21745 
21746   // If the integer bounds are exactly representable as floats, emit a
21747   // min+max+fptoi sequence. Otherwise use comparisons and selects.
21748   if (AreExactFloatBounds) {
21749     if (DstVT != TmpVT) {
21750       // Clamp by MinFloat from below. If Src is NaN, propagate NaN.
21751       SDValue MinClamped = DAG.getNode(
21752         X86ISD::FMAX, dl, SrcVT, MinFloatNode, Src);
21753       // Clamp by MaxFloat from above. If Src is NaN, propagate NaN.
21754       SDValue BothClamped = DAG.getNode(
21755         X86ISD::FMIN, dl, SrcVT, MaxFloatNode, MinClamped);
21756       // Convert clamped value to integer.
21757       SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, BothClamped);
21758 
21759       // NaN will become INDVAL, with the top bit set and the rest zero.
21760       // Truncation will discard the top bit, resulting in zero.
21761       return DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
21762     }
21763 
21764     // Clamp by MinFloat from below. If Src is NaN, the result is MinFloat.
21765     SDValue MinClamped = DAG.getNode(
21766       X86ISD::FMAX, dl, SrcVT, Src, MinFloatNode);
21767     // Clamp by MaxFloat from above. NaN cannot occur.
21768     SDValue BothClamped = DAG.getNode(
21769       X86ISD::FMINC, dl, SrcVT, MinClamped, MaxFloatNode);
21770     // Convert clamped value to integer.
21771     SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, DstVT, BothClamped);
21772 
21773     if (!IsSigned) {
21774       // In the unsigned case we're done, because we mapped NaN to MinFloat,
21775       // which is zero.
21776       return FpToInt;
21777     }
21778 
21779     // Otherwise, select zero if Src is NaN.
21780     SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
21781     return DAG.getSelectCC(
21782       dl, Src, Src, ZeroInt, FpToInt, ISD::CondCode::SETUO);
21783   }
21784 
21785   SDValue MinIntNode = DAG.getConstant(MinInt, dl, DstVT);
21786   SDValue MaxIntNode = DAG.getConstant(MaxInt, dl, DstVT);
21787 
21788   // Result of direct conversion, which may be selected away.
21789   SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, Src);
21790 
21791   if (DstVT != TmpVT) {
21792     // NaN will become INDVAL, with the top bit set and the rest zero.
21793     // Truncation will discard the top bit, resulting in zero.
21794     FpToInt = DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
21795   }
21796 
21797   SDValue Select = FpToInt;
21798   // For signed conversions where we saturate to the same size as the
21799   // result type of the fptoi instructions, INDVAL coincides with integer
21800   // minimum, so we don't need to explicitly check it.
21801   if (!IsSigned || SatWidth != TmpVT.getScalarSizeInBits()) {
21802     // If Src ULT MinFloat, select MinInt. In particular, this also selects
21803     // MinInt if Src is NaN.
21804     Select = DAG.getSelectCC(
21805       dl, Src, MinFloatNode, MinIntNode, Select, ISD::CondCode::SETULT);
21806   }
21807 
21808   // If Src OGT MaxFloat, select MaxInt.
21809   Select = DAG.getSelectCC(
21810     dl, Src, MaxFloatNode, MaxIntNode, Select, ISD::CondCode::SETOGT);
21811 
21812   // In the unsigned case we are done, because we mapped NaN to MinInt, which
21813   // is already zero. The promoted case was already handled above.
21814   if (!IsSigned || DstVT != TmpVT) {
21815     return Select;
21816   }
21817 
21818   // Otherwise, select 0 if Src is NaN.
21819   SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
21820   return DAG.getSelectCC(
21821     dl, Src, Src, ZeroInt, Select, ISD::CondCode::SETUO);
21822 }
21823 
21824 SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
21825   bool IsStrict = Op->isStrictFPOpcode();
21826 
21827   SDLoc DL(Op);
21828   MVT VT = Op.getSimpleValueType();
21829   SDValue In = Op.getOperand(IsStrict ? 1 : 0);
21830   MVT SVT = In.getSimpleValueType();
21831 
21832   if (VT == MVT::f128)
21833     return SDValue();
21834 
21835   assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
21836 
21837   SDValue Res =
21838       DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, In, DAG.getUNDEF(SVT));
21839   if (IsStrict)
21840     return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
21841                        {Op->getOperand(0), Res});
21842   return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
21843 }
21844 
21845 SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
21846   bool IsStrict = Op->isStrictFPOpcode();
21847   SDValue In = Op.getOperand(IsStrict ? 1 : 0);
21848   // It's legal except when f128 is involved
21849   if (In.getSimpleValueType() != MVT::f128)
21850     return Op;
21851 
21852   return SDValue();
21853 }
21854 
21855 static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) {
21856   bool IsStrict = Op->isStrictFPOpcode();
21857   SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21858   assert(Src.getValueType() == MVT::i16 && Op.getValueType() == MVT::f32 &&
21859          "Unexpected VT!");
21860 
21861   SDLoc dl(Op);
21862   SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16,
21863                             DAG.getConstant(0, dl, MVT::v8i16), Src,
21864                             DAG.getIntPtrConstant(0, dl));
21865 
21866   SDValue Chain;
21867   if (IsStrict) {
21868     Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {MVT::v4f32, MVT::Other},
21869                       {Op.getOperand(0), Res});
21870     Chain = Res.getValue(1);
21871   } else {
21872     Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
21873   }
21874 
21875   Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
21876                     DAG.getIntPtrConstant(0, dl));
21877 
21878   if (IsStrict)
21879     return DAG.getMergeValues({Res, Chain}, dl);
21880 
21881   return Res;
21882 }
21883 
21884 static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) {
21885   bool IsStrict = Op->isStrictFPOpcode();
21886   SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21887   assert(Src.getValueType() == MVT::f32 && Op.getValueType() == MVT::i16 &&
21888          "Unexpected VT!");
21889 
21890   SDLoc dl(Op);
21891   SDValue Res, Chain;
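  // The cvtps2ph immediate of 4 below requests rounding from MXCSR rather than
  // a static rounding mode encoded in the immediate.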
21892   if (IsStrict) {
21893     Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4f32,
21894                       DAG.getConstantFP(0, dl, MVT::v4f32), Src,
21895                       DAG.getIntPtrConstant(0, dl));
21896     Res = DAG.getNode(
21897         X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
21898         {Op.getOperand(0), Res, DAG.getTargetConstant(4, dl, MVT::i32)});
21899     Chain = Res.getValue(1);
21900   } else {
21901     // FIXME: Should we use zeros for upper elements for non-strict?
21902     Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, Src);
21903     Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
21904                       DAG.getTargetConstant(4, dl, MVT::i32));
21905   }
21906 
21907   Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Res,
21908                     DAG.getIntPtrConstant(0, dl));
21909 
21910   if (IsStrict)
21911     return DAG.getMergeValues({Res, Chain}, dl);
21912 
21913   return Res;
21914 }
21915 
21916 /// Depending on uarch and/or optimizing for size, we might prefer to use a
21917 /// vector operation in place of the typical scalar operation.
21918 static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG,
21919                                          const X86Subtarget &Subtarget) {
21920   // If both operands have other uses, this is probably not profitable.
21921   SDValue LHS = Op.getOperand(0);
21922   SDValue RHS = Op.getOperand(1);
21923   if (!LHS.hasOneUse() && !RHS.hasOneUse())
21924     return Op;
21925 
21926   // FP horizontal add/sub were added with SSE3. Integer with SSSE3.
21927   bool IsFP = Op.getSimpleValueType().isFloatingPoint();
21928   if (IsFP && !Subtarget.hasSSE3())
21929     return Op;
21930   if (!IsFP && !Subtarget.hasSSSE3())
21931     return Op;
21932 
21933   // Extract from a common vector.
21934   if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
21935       RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
21936       LHS.getOperand(0) != RHS.getOperand(0) ||
21937       !isa<ConstantSDNode>(LHS.getOperand(1)) ||
21938       !isa<ConstantSDNode>(RHS.getOperand(1)) ||
21939       !shouldUseHorizontalOp(true, DAG, Subtarget))
21940     return Op;
21941 
21942   // Allow commuted 'hadd' ops.
21943   // TODO: Allow commuted (f)sub by negating the result of (F)HSUB?
21944   unsigned HOpcode;
21945   switch (Op.getOpcode()) {
21946     case ISD::ADD: HOpcode = X86ISD::HADD; break;
21947     case ISD::SUB: HOpcode = X86ISD::HSUB; break;
21948     case ISD::FADD: HOpcode = X86ISD::FHADD; break;
21949     case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
21950     default:
21951       llvm_unreachable("Trying to lower unsupported opcode to horizontal op");
21952   }
21953   unsigned LExtIndex = LHS.getConstantOperandVal(1);
21954   unsigned RExtIndex = RHS.getConstantOperandVal(1);
21955   if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 &&
21956       (HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD))
21957     std::swap(LExtIndex, RExtIndex);
21958 
21959   if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1))
21960     return Op;
21961 
21962   SDValue X = LHS.getOperand(0);
21963   EVT VecVT = X.getValueType();
21964   unsigned BitWidth = VecVT.getSizeInBits();
21965   unsigned NumLanes = BitWidth / 128;
21966   unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes;
21967   assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) &&
21968          "Not expecting illegal vector widths here");
21969 
21970   // Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
21971   // equivalent, so extract the 256/512-bit source op to 128-bit if we can.
21972   SDLoc DL(Op);
21973   if (BitWidth == 256 || BitWidth == 512) {
21974     unsigned LaneIdx = LExtIndex / NumEltsPerLane;
21975     X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL);
21976     LExtIndex %= NumEltsPerLane;
21977   }
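  // For example, extracting elements 4 and 5 of a v8f32 source selects the
  // upper 128-bit half, where they become elements 0 and 1, so the final
  // extract below uses index LExtIndex / 2 within that half.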
21978 
21979   // add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
21980   // add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
21981   // add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1
21982   // sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0
21983   SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X);
21984   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp,
21985                      DAG.getIntPtrConstant(LExtIndex / 2, DL));
21986 }
21987 
21988 /// Depending on uarch and/or optimizing for size, we might prefer to use a
21989 /// vector operation in place of the typical scalar operation.
21990 SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {
21991   assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) &&
21992          "Only expecting float/double");
21993   return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
21994 }
21995 
21996 /// ISD::FROUND is defined to round to nearest with ties rounding away from 0.
21997 /// This mode isn't supported in hardware on X86. But as long as we aren't
21998 /// compiling with trapping math, we can emulate this with
21999 /// trunc(X + copysign(nextafter(0.5, 0.0), X)).
22000 static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) {
22001   SDValue N0 = Op.getOperand(0);
22002   SDLoc dl(Op);
22003   MVT VT = Op.getSimpleValueType();
22004 
22005   // N0 += copysign(nextafter(0.5, 0.0), N0)
22006   const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
22007   bool Ignored;
22008   APFloat Point5Pred = APFloat(0.5f);
22009   Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored);
22010   Point5Pred.next(/*nextDown*/true);
22011 
22012   SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT,
22013                               DAG.getConstantFP(Point5Pred, dl, VT), N0);
22014   N0 = DAG.getNode(ISD::FADD, dl, VT, N0, Adder);
22015 
22016   // Truncate the result to remove fraction.
22017   return DAG.getNode(ISD::FTRUNC, dl, VT, N0);
22018 }
22019 
22020 /// The only differences between FABS and FNEG are the mask and the logic op.
22021 /// FNEG also has a folding opportunity for FNEG(FABS(x)).
22022 static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
22023   assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
22024          "Wrong opcode for lowering FABS or FNEG.");
22025 
22026   bool IsFABS = (Op.getOpcode() == ISD::FABS);
22027 
22028   // If this is a FABS and it has an FNEG user, bail out to fold the combination
22029   // into an FNABS. We'll lower the FABS after that if it is still in use.
22030   if (IsFABS)
22031     for (SDNode *User : Op->uses())
22032       if (User->getOpcode() == ISD::FNEG)
22033         return Op;
22034 
22035   SDLoc dl(Op);
22036   MVT VT = Op.getSimpleValueType();
22037 
22038   bool IsF128 = (VT == MVT::f128);
22039   assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
22040           VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
22041           VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
22042          "Unexpected type in LowerFABSorFNEG");
22043 
22044   // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
22045   // decide if we should generate a 16-byte constant mask when we only need 4 or
22046   // 8 bytes for the scalar case.
22047 
22048   // There are no scalar bitwise logical SSE/AVX instructions, so we
22049   // generate a 16-byte vector constant and logic op even for the scalar case.
22050   // Using a 16-byte mask allows folding the load of the mask with
22051   // the logic op, so it can save (~4 bytes) on code size.
22052   bool IsFakeVector = !VT.isVector() && !IsF128;
22053   MVT LogicVT = VT;
22054   if (IsFakeVector)
22055     LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
22056 
22057   unsigned EltBits = VT.getScalarSizeInBits();
22058   // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
22059   APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) :
22060                            APInt::getSignMask(EltBits);
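  // For f32 this gives 0x7FFFFFFF for FABS and 0x80000000 for FNEG; for f64 the
  // analogous 64-bit masks are used.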
22061   const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
22062   SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
22063 
22064   SDValue Op0 = Op.getOperand(0);
22065   bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
22066   unsigned LogicOp = IsFABS  ? X86ISD::FAND :
22067                      IsFNABS ? X86ISD::FOR  :
22068                                X86ISD::FXOR;
22069   SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
22070 
22071   if (VT.isVector() || IsF128)
22072     return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
22073 
22074   // For the scalar case extend to a 128-bit vector, perform the logic op,
22075   // and extract the scalar result back out.
22076   Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
22077   SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
22078   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
22079                      DAG.getIntPtrConstant(0, dl));
22080 }
22081 
22082 static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
22083   SDValue Mag = Op.getOperand(0);
22084   SDValue Sign = Op.getOperand(1);
22085   SDLoc dl(Op);
22086 
22087   // If the sign operand is smaller, extend it first.
22088   MVT VT = Op.getSimpleValueType();
22089   if (Sign.getSimpleValueType().bitsLT(VT))
22090     Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
22091 
22092   // And if it is bigger, shrink it first.
22093   if (Sign.getSimpleValueType().bitsGT(VT))
22094     Sign =
22095         DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(0, dl));
22096 
22097   // At this point the operands and the result should have the same
22098   // type, and that won't be f80 since that is not custom lowered.
22099   bool IsF128 = (VT == MVT::f128);
22100   assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
22101           VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
22102           VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
22103          "Unexpected type in LowerFCOPYSIGN");
22104 
22105   const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
22106 
22107   // Perform all scalar logic operations as 16-byte vectors because there are no
22108   // scalar FP logic instructions in SSE.
22109   // TODO: This isn't necessary. If we used scalar types, we might avoid some
22110   // unnecessary splats, but we might miss load folding opportunities. Should
22111   // this decision be based on OptimizeForSize?
22112   bool IsFakeVector = !VT.isVector() && !IsF128;
22113   MVT LogicVT = VT;
22114   if (IsFakeVector)
22115     LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
22116 
22117   // The mask constants are automatically splatted for vector types.
22118   unsigned EltSizeInBits = VT.getScalarSizeInBits();
22119   SDValue SignMask = DAG.getConstantFP(
22120       APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
22121   SDValue MagMask = DAG.getConstantFP(
22122       APFloat(Sem, APInt::getSignedMaxValue(EltSizeInBits)), dl, LogicVT);
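  // For f64 these are 0x8000000000000000 (sign) and 0x7FFFFFFFFFFFFFFF
  // (magnitude), splatted across all elements for vector types.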
22123 
22124   // First, clear all bits but the sign bit from the second operand (sign).
22125   if (IsFakeVector)
22126     Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
22127   SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
22128 
22129   // Next, clear the sign bit from the first operand (magnitude).
22130   // TODO: If we had general constant folding for FP logic ops, this check
22131   // wouldn't be necessary.
22132   SDValue MagBits;
22133   if (ConstantFPSDNode *Op0CN = isConstOrConstSplatFP(Mag)) {
22134     APFloat APF = Op0CN->getValueAPF();
22135     APF.clearSign();
22136     MagBits = DAG.getConstantFP(APF, dl, LogicVT);
22137   } else {
22138     // If the magnitude operand wasn't a constant, we need to AND out the sign.
22139     if (IsFakeVector)
22140       Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
22141     MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
22142   }
22143 
22144   // OR the magnitude value with the sign bit.
22145   SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
22146   return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
22147                                           DAG.getIntPtrConstant(0, dl));
22148 }
22149 
22150 static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
22151   SDValue N0 = Op.getOperand(0);
22152   SDLoc dl(Op);
22153   MVT VT = Op.getSimpleValueType();
22154 
22155   MVT OpVT = N0.getSimpleValueType();
22156   assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
22157          "Unexpected type for FGETSIGN");
22158 
22159   // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
22160   MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
22161   SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
22162   Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
22163   Res = DAG.getZExtOrTrunc(Res, dl, VT);
22164   Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
22165   return Res;
22166 }
22167 
22168 /// Helper for creating a X86ISD::SETCC node.
22169 static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
22170                         SelectionDAG &DAG) {
22171   return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
22172                      DAG.getTargetConstant(Cond, dl, MVT::i8), EFLAGS);
22173 }
22174 
22175 /// Helper for matching OR(EXTRACTELT(X,0),OR(EXTRACTELT(X,1),...))
22176 /// style scalarized (associative) reduction patterns. Partial reductions
22177 /// are supported when the pointer SrcMask is non-null.
22178 /// TODO - move this to SelectionDAG?
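/// For example, (or (extractelt X, 0), (or (extractelt X, 1),
/// (or (extractelt X, 2), (extractelt X, 3)))) matches with SrcOps = {X},
/// provided every element of X is covered (or SrcMask is supplied).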
22179 static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
22180                                  SmallVectorImpl<SDValue> &SrcOps,
22181                                  SmallVectorImpl<APInt> *SrcMask = nullptr) {
22182   SmallVector<SDValue, 8> Opnds;
22183   DenseMap<SDValue, APInt> SrcOpMap;
22184   EVT VT = MVT::Other;
22185 
22186   // Recognize a special case where a vector is cast into a wide integer to
22187   // test all 0s.
22188   assert(Op.getOpcode() == unsigned(BinOp) &&
22189          "Unexpected bit reduction opcode");
22190   Opnds.push_back(Op.getOperand(0));
22191   Opnds.push_back(Op.getOperand(1));
22192 
22193   for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
22194     SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
22195     // BFS traverse all BinOp operands.
22196     if (I->getOpcode() == unsigned(BinOp)) {
22197       Opnds.push_back(I->getOperand(0));
22198       Opnds.push_back(I->getOperand(1));
22199       // Re-evaluate the number of nodes to be traversed.
22200       e += 2; // 2 more nodes (LHS and RHS) are pushed.
22201       continue;
22202     }
22203 
22204     // Quit if this is not an EXTRACT_VECTOR_ELT.
22205     if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
22206       return false;
22207 
22208     // Quit if the index is not a constant.
22209     auto *Idx = dyn_cast<ConstantSDNode>(I->getOperand(1));
22210     if (!Idx)
22211       return false;
22212 
22213     SDValue Src = I->getOperand(0);
22214     DenseMap<SDValue, APInt>::iterator M = SrcOpMap.find(Src);
22215     if (M == SrcOpMap.end()) {
22216       VT = Src.getValueType();
22217       // Quit if not the same type.
22218       if (!SrcOpMap.empty() && VT != SrcOpMap.begin()->first.getValueType())
22219         return false;
22220       unsigned NumElts = VT.getVectorNumElements();
22221       APInt EltCount = APInt::getNullValue(NumElts);
22222       M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first;
22223       SrcOps.push_back(Src);
22224     }
22225 
22226     // Quit if element already used.
22227     unsigned CIdx = Idx->getZExtValue();
22228     if (M->second[CIdx])
22229       return false;
22230     M->second.setBit(CIdx);
22231   }
22232 
22233   if (SrcMask) {
22234     // Collect the source partial masks.
22235     for (SDValue &SrcOp : SrcOps)
22236       SrcMask->push_back(SrcOpMap[SrcOp]);
22237   } else {
22238     // Quit if not all elements are used.
22239     for (const auto &I : SrcOpMap)
22240       if (!I.second.isAllOnesValue())
22241         return false;
22242   }
22243 
22244   return true;
22245 }
22246 
22247 // Helper function for comparing all bits of a vector against zero.
22248 static SDValue LowerVectorAllZero(const SDLoc &DL, SDValue V, ISD::CondCode CC,
22249                                   const APInt &Mask,
22250                                   const X86Subtarget &Subtarget,
22251                                   SelectionDAG &DAG, X86::CondCode &X86CC) {
22252   EVT VT = V.getValueType();
22253   unsigned ScalarSize = VT.getScalarSizeInBits();
22254   if (Mask.getBitWidth() != ScalarSize) {
22255     assert(ScalarSize == 1 && "Element Mask vs Vector bitwidth mismatch");
22256     return SDValue();
22257   }
22258 
22259   assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
22260   X86CC = (CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE);
22261 
22262   auto MaskBits = [&](SDValue Src) {
22263     if (Mask.isAllOnesValue())
22264       return Src;
22265     EVT SrcVT = Src.getValueType();
22266     SDValue MaskValue = DAG.getConstant(Mask, DL, SrcVT);
22267     return DAG.getNode(ISD::AND, DL, SrcVT, Src, MaskValue);
22268   };
22269 
22270   // For sub-128-bit vector, cast to (legal) integer and compare with zero.
22271   if (VT.getSizeInBits() < 128) {
22272     EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
22273     if (!DAG.getTargetLoweringInfo().isTypeLegal(IntVT))
22274       return SDValue();
22275     return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
22276                        DAG.getBitcast(IntVT, MaskBits(V)),
22277                        DAG.getConstant(0, DL, IntVT));
22278   }
22279 
22280   // Quit if not splittable to 128/256-bit vector.
22281   if (!isPowerOf2_32(VT.getSizeInBits()))
22282     return SDValue();
22283 
22284   // Split down to 128/256-bit vector.
22285   unsigned TestSize = Subtarget.hasAVX() ? 256 : 128;
22286   while (VT.getSizeInBits() > TestSize) {
22287     auto Split = DAG.SplitVector(V, DL);
22288     VT = Split.first.getValueType();
22289     V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second);
22290   }
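  // For example, a v8i64 source on AVX is OR-reduced once into a single v4i64
  // before the PTEST/MOVMSK check below.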
22291 
22292   bool UsePTEST = Subtarget.hasSSE41();
22293   if (UsePTEST) {
22294     MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
22295     V = DAG.getBitcast(TestVT, MaskBits(V));
22296     return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, V, V);
22297   }
22298 
22299   // Without PTEST, a masked v2i64 or-reduction is not faster than
22300   // scalarization.
22301   if (!Mask.isAllOnesValue() && VT.getScalarSizeInBits() > 32)
22302     return SDValue();
22303 
22304   V = DAG.getBitcast(MVT::v16i8, MaskBits(V));
22305   V = DAG.getNode(X86ISD::PCMPEQ, DL, MVT::v16i8, V,
22306                   getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
22307   V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
22308   return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
22309                      DAG.getConstant(0xFFFF, DL, MVT::i32));
22310 }
22311 
22312 // Check whether an OR'd reduction tree is PTEST-able, or if we can fallback to
22313 // CMP(MOVMSK(PCMPEQB(X,0))).
22314 static SDValue MatchVectorAllZeroTest(SDValue Op, ISD::CondCode CC,
22315                                       const SDLoc &DL,
22316                                       const X86Subtarget &Subtarget,
22317                                       SelectionDAG &DAG, SDValue &X86CC) {
22318   assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
22319 
22320   if (!Subtarget.hasSSE2() || !Op->hasOneUse())
22321     return SDValue();
22322 
22323   // Check whether we're masking/truncating an OR-reduction result, in which
22324   // case track the masked bits.
22325   APInt Mask = APInt::getAllOnesValue(Op.getScalarValueSizeInBits());
22326   switch (Op.getOpcode()) {
22327   case ISD::TRUNCATE: {
22328     SDValue Src = Op.getOperand(0);
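    // Only the low bits that survive the truncate are tested, e.g. truncating
    // an i64 reduction to i32 keeps a 32-bit low mask.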
22329     Mask = APInt::getLowBitsSet(Src.getScalarValueSizeInBits(),
22330                                 Op.getScalarValueSizeInBits());
22331     Op = Src;
22332     break;
22333   }
22334   case ISD::AND: {
22335     if (auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
22336       Mask = Cst->getAPIntValue();
22337       Op = Op.getOperand(0);
22338     }
22339     break;
22340   }
22341   }
22342 
22343   SmallVector<SDValue, 8> VecIns;
22344   if (Op.getOpcode() == ISD::OR && matchScalarReduction(Op, ISD::OR, VecIns)) {
22345     EVT VT = VecIns[0].getValueType();
22346     assert(llvm::all_of(VecIns,
22347                         [VT](SDValue V) { return VT == V.getValueType(); }) &&
22348            "Reduction source vector mismatch");
22349 
22350     // Quit if less than 128-bits or not splittable to 128/256-bit vector.
22351     if (VT.getSizeInBits() < 128 || !isPowerOf2_32(VT.getSizeInBits()))
22352       return SDValue();
22353 
22354     // If more than one full vector is evaluated, OR them first before PTEST.
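    // E.g. with sources {a,b,c,d} the loop appends a|b, c|d and finally
    // (a|b)|(c|d), so VecIns.back() ends up as the OR of every source vector.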
22355     for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1;
22356          Slot += 2, e += 1) {
22357       // Each iteration will OR 2 nodes and append the result until there is
22358       // only 1 node left, i.e. the final OR'd value of all vectors.
22359       SDValue LHS = VecIns[Slot];
22360       SDValue RHS = VecIns[Slot + 1];
22361       VecIns.push_back(DAG.getNode(ISD::OR, DL, VT, LHS, RHS));
22362     }
22363 
22364     X86::CondCode CCode;
22365     if (SDValue V = LowerVectorAllZero(DL, VecIns.back(), CC, Mask, Subtarget,
22366                                        DAG, CCode)) {
22367       X86CC = DAG.getTargetConstant(CCode, DL, MVT::i8);
22368       return V;
22369     }
22370   }
22371 
22372   if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
22373     ISD::NodeType BinOp;
22374     if (SDValue Match =
22375             DAG.matchBinOpReduction(Op.getNode(), BinOp, {ISD::OR})) {
22376       X86::CondCode CCode;
22377       if (SDValue V =
22378               LowerVectorAllZero(DL, Match, CC, Mask, Subtarget, DAG, CCode)) {
22379         X86CC = DAG.getTargetConstant(CCode, DL, MVT::i8);
22380         return V;
22381       }
22382     }
22383   }
22384 
22385   return SDValue();
22386 }
22387 
22388 /// Return true if \c Op has a use that doesn't just read flags.
22389 static bool hasNonFlagsUse(SDValue Op) {
22390   for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
22391        ++UI) {
22392     SDNode *User = *UI;
22393     unsigned UOpNo = UI.getOperandNo();
22394     if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
22395       // Look past the truncate.
22396       UOpNo = User->use_begin().getOperandNo();
22397       User = *User->use_begin();
22398     }
22399 
22400     if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
22401         !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
22402       return true;
22403   }
22404   return false;
22405 }
22406 
22407 // Transform to an x86-specific ALU node with flags if there is a chance of
22408 // using an RMW op or only the flags are used. Otherwise, leave
22409 // the node alone and emit a 'cmp' or 'test' instruction.
22410 static bool isProfitableToUseFlagOp(SDValue Op) {
22411   for (SDNode *U : Op->uses())
22412     if (U->getOpcode() != ISD::CopyToReg &&
22413         U->getOpcode() != ISD::SETCC &&
22414         U->getOpcode() != ISD::STORE)
22415       return false;
22416 
22417   return true;
22418 }
22419 
22420 /// Emit nodes that will be selected as "test Op0,Op0", or something
22421 /// equivalent.
22422 static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
22423                         SelectionDAG &DAG, const X86Subtarget &Subtarget) {
22424   // CF and OF aren't always set the way we want. Determine which
22425   // of these we need.
22426   bool NeedCF = false;
22427   bool NeedOF = false;
22428   switch (X86CC) {
22429   default: break;
22430   case X86::COND_A: case X86::COND_AE:
22431   case X86::COND_B: case X86::COND_BE:
22432     NeedCF = true;
22433     break;
22434   case X86::COND_G: case X86::COND_GE:
22435   case X86::COND_L: case X86::COND_LE:
22436   case X86::COND_O: case X86::COND_NO: {
22437     // Check whether we really need to set the Overflow flag. If the node
22438     // has the NoSignedWrap flag, signed overflow cannot occur and setting
22439     // OF is not actually needed.
22440     switch (Op->getOpcode()) {
22441     case ISD::ADD:
22442     case ISD::SUB:
22443     case ISD::MUL:
22444     case ISD::SHL:
22445       if (Op.getNode()->getFlags().hasNoSignedWrap())
22446         break;
22447       LLVM_FALLTHROUGH;
22448     default:
22449       NeedOF = true;
22450       break;
22451     }
22452     break;
22453   }
22454   }
22455   // See if we can use the EFLAGS value from the operand instead of
22456   // doing a separate TEST. TEST always sets OF and CF to 0, so unless
22457   // we prove that the arithmetic won't overflow, we can't use OF or CF.
22458   if (Op.getResNo() != 0 || NeedOF || NeedCF) {
22459     // Emit a CMP with 0, which is the TEST pattern.
22460     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
22461                        DAG.getConstant(0, dl, Op.getValueType()));
22462   }
22463   unsigned Opcode = 0;
22464   unsigned NumOperands = 0;
22465 
22466   SDValue ArithOp = Op;
22467 
22468   // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
22469   // which may be the result of a CAST.  We use the variable 'Op', which is the
22470   // non-casted variable when we check for possible users.
22471   switch (ArithOp.getOpcode()) {
22472   case ISD::AND:
22473     // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
22474     // because a TEST instruction will be better.
22475     if (!hasNonFlagsUse(Op))
22476       break;
22477 
22478     LLVM_FALLTHROUGH;
22479   case ISD::ADD:
22480   case ISD::SUB:
22481   case ISD::OR:
22482   case ISD::XOR:
22483     if (!isProfitableToUseFlagOp(Op))
22484       break;
22485 
22486     // Otherwise use a regular EFLAGS-setting instruction.
22487     switch (ArithOp.getOpcode()) {
22488     default: llvm_unreachable("unexpected operator!");
22489     case ISD::ADD: Opcode = X86ISD::ADD; break;
22490     case ISD::SUB: Opcode = X86ISD::SUB; break;
22491     case ISD::XOR: Opcode = X86ISD::XOR; break;
22492     case ISD::AND: Opcode = X86ISD::AND; break;
22493     case ISD::OR:  Opcode = X86ISD::OR;  break;
22494     }
22495 
22496     NumOperands = 2;
22497     break;
22498   case X86ISD::ADD:
22499   case X86ISD::SUB:
22500   case X86ISD::OR:
22501   case X86ISD::XOR:
22502   case X86ISD::AND:
22503     return SDValue(Op.getNode(), 1);
22504   case ISD::SSUBO:
22505   case ISD::USUBO: {
22506     // USUBO/SSUBO will become an X86ISD::SUB and we can use its Z flag.
22507     SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
22508     return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0),
22509                        Op->getOperand(1)).getValue(1);
22510   }
22511   default:
22512     break;
22513   }
22514 
22515   if (Opcode == 0) {
22516     // Emit a CMP with 0, which is the TEST pattern.
22517     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
22518                        DAG.getConstant(0, dl, Op.getValueType()));
22519   }
22520   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
22521   SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
22522 
22523   SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
22524   DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
22525   return SDValue(New.getNode(), 1);
22526 }
22527 
22528 /// Emit nodes that will be selected as "cmp Op0,Op1", or something
22529 /// equivalent.
22530 static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
22531                        const SDLoc &dl, SelectionDAG &DAG,
22532                        const X86Subtarget &Subtarget) {
22533   if (isNullConstant(Op1))
22534     return EmitTest(Op0, X86CC, dl, DAG, Subtarget);
22535 
22536   EVT CmpVT = Op0.getValueType();
22537 
22538   assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
22539           CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");
22540 
22541   // Only promote the compare up to i32 if it is a 16-bit operation
22542   // with an immediate. 16-bit immediates are to be avoided.
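  // A 16-bit immediate requires an operand-size prefix, which on many Intel
  // cores incurs a length-changing-prefix decode stall, so widening the
  // compare to i32 is usually cheaper.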
22543   if (CmpVT == MVT::i16 && !Subtarget.isAtom() &&
22544       !DAG.getMachineFunction().getFunction().hasMinSize()) {
22545     ConstantSDNode *COp0 = dyn_cast<ConstantSDNode>(Op0);
22546     ConstantSDNode *COp1 = dyn_cast<ConstantSDNode>(Op1);
22547     // Don't do this if the immediate can fit in 8-bits.
22548     if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) ||
22549         (COp1 && !COp1->getAPIntValue().isSignedIntN(8))) {
22550       unsigned ExtendOp =
22551           isX86CCSigned(X86CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
22552       if (X86CC == X86::COND_E || X86CC == X86::COND_NE) {
22553         // For equality comparisons try to use SIGN_EXTEND if the input was
22554         // truncate from something with enough sign bits.
22555         if (Op0.getOpcode() == ISD::TRUNCATE) {
22556           SDValue In = Op0.getOperand(0);
22557           unsigned EffBits =
22558               In.getScalarValueSizeInBits() - DAG.ComputeNumSignBits(In) + 1;
22559           if (EffBits <= 16)
22560             ExtendOp = ISD::SIGN_EXTEND;
22561         } else if (Op1.getOpcode() == ISD::TRUNCATE) {
22562           SDValue In = Op1.getOperand(0);
22563           unsigned EffBits =
22564               In.getScalarValueSizeInBits() - DAG.ComputeNumSignBits(In) + 1;
22565           if (EffBits <= 16)
22566             ExtendOp = ISD::SIGN_EXTEND;
22567         }
22568       }
22569 
22570       CmpVT = MVT::i32;
22571       Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0);
22572       Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1);
22573     }
22574   }
22575 
22576   // Try to shrink i64 compares if the input has enough zero bits.
22577   // FIXME: Do this for non-constant compares for constant on LHS?
22578   if (CmpVT == MVT::i64 && isa<ConstantSDNode>(Op1) && !isX86CCSigned(X86CC) &&
22579       Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
22580       cast<ConstantSDNode>(Op1)->getAPIntValue().getActiveBits() <= 32 &&
22581       DAG.MaskedValueIsZero(Op0, APInt::getHighBitsSet(64, 32))) {
22582     CmpVT = MVT::i32;
22583     Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
22584     Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
22585   }
22586 
22587   // 0-x == y --> x+y == 0
22588   // 0-x != y --> x+y != 0
22589   if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op0.getOperand(0)) &&
22590       Op0.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
22591     SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
22592     SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(1), Op1);
22593     return Add.getValue(1);
22594   }
22595 
22596   // x == 0-y --> x+y == 0
22597   // x != 0-y --> x+y != 0
22598   if (Op1.getOpcode() == ISD::SUB && isNullConstant(Op1.getOperand(0)) &&
22599       Op1.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
22600     SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
22601     SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0, Op1.getOperand(1));
22602     return Add.getValue(1);
22603   }
22604 
22605   // Use SUB instead of CMP to enable CSE between SUB and CMP.
22606   SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
22607   SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
22608   return Sub.getValue(1);
22609 }
22610 
22611 /// Check if replacement of SQRT with RSQRT should be disabled.
22612 bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
22613   EVT VT = Op.getValueType();
22614 
22615   // We never want to use both SQRT and RSQRT instructions for the same input.
22616   if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
22617     return false;
22618 
22619   if (VT.isVector())
22620     return Subtarget.hasFastVectorFSQRT();
22621   return Subtarget.hasFastScalarFSQRT();
22622 }
22623 
22624 /// The minimum architected relative accuracy is 2^-12. We need one
22625 /// Newton-Raphson step to have a good float result (24 bits of precision).
22626 SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
22627                                            SelectionDAG &DAG, int Enabled,
22628                                            int &RefinementSteps,
22629                                            bool &UseOneConstNR,
22630                                            bool Reciprocal) const {
22631   EVT VT = Op.getValueType();
22632 
22633   // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
22634   // It is likely not profitable to do this for f64 because a double-precision
22635   // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
22636   // instructions: convert to single, rsqrtss, convert back to double, refine
22637   // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
22638   // along with FMA, this could be a throughput win.
22639   // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
22640   // after legalize types.
22641   if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
22642       (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
22643       (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
22644       (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
22645       (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
22646     if (RefinementSteps == ReciprocalEstimate::Unspecified)
22647       RefinementSteps = 1;
22648 
22649     UseOneConstNR = false;
22650     // There is no FSQRT for 512-bits, but there is RSQRT14.
22651     unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
22652     return DAG.getNode(Opcode, SDLoc(Op), VT, Op);
22653   }
22654   return SDValue();
22655 }
22656 
22657 /// The minimum architected relative accuracy is 2^-12. We need one
22658 /// Newton-Raphson step to have a good float result (24 bits of precision).
22659 SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
22660                                             int Enabled,
22661                                             int &RefinementSteps) const {
22662   EVT VT = Op.getValueType();
22663 
22664   // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
22665   // It is likely not profitable to do this for f64 because a double-precision
22666   // reciprocal estimate with refinement on x86 prior to FMA requires
22667   // 15 instructions: convert to single, rcpss, convert back to double, refine
22668   // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
22669   // along with FMA, this could be a throughput win.
22670 
22671   if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
22672       (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
22673       (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
22674       (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
22675     // Enable estimate codegen with 1 refinement step for vector division.
22676     // Scalar division estimates are disabled because they break too much
22677     // real-world code. These defaults are intended to match GCC behavior.
22678     if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
22679       return SDValue();
22680 
22681     if (RefinementSteps == ReciprocalEstimate::Unspecified)
22682       RefinementSteps = 1;
22683 
22684     // There is no FSQRT for 512-bits, but there is RCP14.
22685     unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;
22686     return DAG.getNode(Opcode, SDLoc(Op), VT, Op);
22687   }
22688   return SDValue();
22689 }
22690 
22691 /// If we have at least two divisions that use the same divisor, convert to
22692 /// multiplication by a reciprocal. This may need to be adjusted for a given
22693 /// CPU if a division's cost is not at least twice the cost of a multiplication.
22694 /// This is because we still need one division to calculate the reciprocal and
22695 /// then we need two multiplies by that reciprocal as replacements for the
22696 /// original divisions.
22697 unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
22698   return 2;
22699 }
22700 
22701 SDValue
22702 X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
22703                                  SelectionDAG &DAG,
22704                                  SmallVectorImpl<SDNode *> &Created) const {
22705   AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
22706   if (isIntDivCheap(N->getValueType(0), Attr))
22707     return SDValue(N,0); // Lower SDIV as SDIV
22708 
22709   assert((Divisor.isPowerOf2() || (-Divisor).isPowerOf2()) &&
22710          "Unexpected divisor!");
22711 
22712   // Only perform this transform if CMOV is supported otherwise the select
22713   // below will become a branch.
22714   if (!Subtarget.hasCMov())
22715     return SDValue();
22716 
22717   // fold (sdiv X, pow2)
22718   EVT VT = N->getValueType(0);
22719   // FIXME: Support i8.
22720   if (VT != MVT::i16 && VT != MVT::i32 &&
22721       !(Subtarget.is64Bit() && VT == MVT::i64))
22722     return SDValue();
22723 
22724   unsigned Lg2 = Divisor.countTrailingZeros();
22725 
22726   // If the divisor is 2 or -2, the default expansion is better.
22727   if (Lg2 == 1)
22728     return SDValue();
22729 
22730   SDLoc DL(N);
22731   SDValue N0 = N->getOperand(0);
22732   SDValue Zero = DAG.getConstant(0, DL, VT);
22733   APInt Lg2Mask = APInt::getLowBitsSet(VT.getSizeInBits(), Lg2);
22734   SDValue Pow2MinusOne = DAG.getConstant(Lg2Mask, DL, VT);
22735 
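  // Worked example for sdiv i32 %x, 8 (Lg2 == 3): add 7 to %x only when %x is
  // negative (via the CMOV below), then arithmetic-shift right by 3.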
22736   // If N0 is negative, we need to add (Pow2 - 1) to it before shifting right.
22737   SDValue Cmp = DAG.getSetCC(DL, MVT::i8, N0, Zero, ISD::SETLT);
22738   SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
22739   SDValue CMov = DAG.getNode(ISD::SELECT, DL, VT, Cmp, Add, N0);
22740 
22741   Created.push_back(Cmp.getNode());
22742   Created.push_back(Add.getNode());
22743   Created.push_back(CMov.getNode());
22744 
22745   // Divide by pow2.
22746   SDValue SRA =
22747       DAG.getNode(ISD::SRA, DL, VT, CMov, DAG.getConstant(Lg2, DL, MVT::i8));
22748 
22749   // If we're dividing by a positive value, we're done.  Otherwise, we must
22750   // negate the result.
22751   if (Divisor.isNonNegative())
22752     return SRA;
22753 
22754   Created.push_back(SRA.getNode());
22755   return DAG.getNode(ISD::SUB, DL, VT, Zero, SRA);
22756 }
22757 
22758 /// Result of 'and' is compared against zero. Change to a BT node if possible.
22759 /// Returns the BT node and the condition code needed to use it.
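/// E.g. both (and X, (shl 1, N)) == 0 and ((srl X, N) & 1) != 0 become
/// (X86ISD::BT X, N); SETEQ then maps to COND_AE (CF == 0) and SETNE to
/// COND_B (CF == 1).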
22760 static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
22761                             const SDLoc &dl, SelectionDAG &DAG,
22762                             SDValue &X86CC) {
22763   assert(And.getOpcode() == ISD::AND && "Expected AND node!");
22764   SDValue Op0 = And.getOperand(0);
22765   SDValue Op1 = And.getOperand(1);
22766   if (Op0.getOpcode() == ISD::TRUNCATE)
22767     Op0 = Op0.getOperand(0);
22768   if (Op1.getOpcode() == ISD::TRUNCATE)
22769     Op1 = Op1.getOperand(0);
22770 
22771   SDValue Src, BitNo;
22772   if (Op1.getOpcode() == ISD::SHL)
22773     std::swap(Op0, Op1);
22774   if (Op0.getOpcode() == ISD::SHL) {
22775     if (isOneConstant(Op0.getOperand(0))) {
22776       // If we looked past a truncate, check that it's only truncating away
22777       // known zeros.
22778       unsigned BitWidth = Op0.getValueSizeInBits();
22779       unsigned AndBitWidth = And.getValueSizeInBits();
22780       if (BitWidth > AndBitWidth) {
22781         KnownBits Known = DAG.computeKnownBits(Op0);
22782         if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
22783           return SDValue();
22784       }
22785       Src = Op1;
22786       BitNo = Op0.getOperand(1);
22787     }
22788   } else if (Op1.getOpcode() == ISD::Constant) {
22789     ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
22790     uint64_t AndRHSVal = AndRHS->getZExtValue();
22791     SDValue AndLHS = Op0;
22792 
22793     if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
22794       Src = AndLHS.getOperand(0);
22795       BitNo = AndLHS.getOperand(1);
22796     } else {
22797       // Use BT if the immediate can't be encoded in a TEST instruction or we
22798       // are optimizing for size and the immediate won't fit in a byte.
22799       bool OptForSize = DAG.shouldOptForSize();
22800       if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
22801           isPowerOf2_64(AndRHSVal)) {
22802         Src = AndLHS;
22803         BitNo = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl,
22804                                 Src.getValueType());
22805       }
22806     }
22807   }
22808 
22809   // No patterns found, give up.
22810   if (!Src.getNode())
22811     return SDValue();
22812 
22813   // If Src is i8, promote it to i32 with any_extend.  There is no i8 BT
22814   // instruction.  Since the shift amount is in-range-or-undefined, we know
22815   // that doing a bittest on the i32 value is ok.  We extend to i32 because
22816   // the encoding for the i16 version is larger than the i32 version.
22817   // Also promote i16 to i32 for performance / code size reasons.
22818   if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16)
22819     Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src);
22820 
22821   // See if we can use the 32-bit instruction instead of the 64-bit one for a
22822   // shorter encoding. Since the former takes the modulo 32 of BitNo and the
22823   // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
22824   // known to be zero.
22825   if (Src.getValueType() == MVT::i64 &&
22826       DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
22827     Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
22828 
22829   // If the operand types disagree, extend the shift amount to match.  Since
22830   // BT ignores high bits (like shifts) we can use anyextend.
22831   if (Src.getValueType() != BitNo.getValueType())
22832     BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo);
22833 
22834   X86CC = DAG.getTargetConstant(CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B,
22835                                 dl, MVT::i8);
22836   return DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo);
22837 }
22838 
22839 /// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
22840 /// CMPs.
22841 static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
22842                                    SDValue &Op1, bool &IsAlwaysSignaling) {
22843   unsigned SSECC;
22844   bool Swap = false;
22845 
22846   // SSE Condition code mapping:
22847   //  0 - EQ
22848   //  1 - LT
22849   //  2 - LE
22850   //  3 - UNORD
22851   //  4 - NEQ
22852   //  5 - NLT
22853   //  6 - NLE
22854   //  7 - ORD
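  //  8 - EQ_UQ (AVX encoding; SSE callers split this into UNORD | EQ)
  // 12 - NEQ_OQ (AVX encoding; SSE callers split this into ORD & NEQ)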
22855   switch (SetCCOpcode) {
22856   default: llvm_unreachable("Unexpected SETCC condition");
22857   case ISD::SETOEQ:
22858   case ISD::SETEQ:  SSECC = 0; break;
22859   case ISD::SETOGT:
22860   case ISD::SETGT:  Swap = true; LLVM_FALLTHROUGH;
22861   case ISD::SETLT:
22862   case ISD::SETOLT: SSECC = 1; break;
22863   case ISD::SETOGE:
22864   case ISD::SETGE:  Swap = true; LLVM_FALLTHROUGH;
22865   case ISD::SETLE:
22866   case ISD::SETOLE: SSECC = 2; break;
22867   case ISD::SETUO:  SSECC = 3; break;
22868   case ISD::SETUNE:
22869   case ISD::SETNE:  SSECC = 4; break;
22870   case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
22871   case ISD::SETUGE: SSECC = 5; break;
22872   case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
22873   case ISD::SETUGT: SSECC = 6; break;
22874   case ISD::SETO:   SSECC = 7; break;
22875   case ISD::SETUEQ: SSECC = 8; break;
22876   case ISD::SETONE: SSECC = 12; break;
22877   }
22878   if (Swap)
22879     std::swap(Op0, Op1);
22880 
22881   switch (SetCCOpcode) {
22882   default:
22883     IsAlwaysSignaling = true;
22884     break;
22885   case ISD::SETEQ:
22886   case ISD::SETOEQ:
22887   case ISD::SETUEQ:
22888   case ISD::SETNE:
22889   case ISD::SETONE:
22890   case ISD::SETUNE:
22891   case ISD::SETO:
22892   case ISD::SETUO:
22893     IsAlwaysSignaling = false;
22894     break;
22895   }
22896 
22897   return SSECC;
22898 }
22899 
22900 /// Break a 256-bit integer VSETCC into two new 128-bit ones and then
22901 /// concatenate the result back.
22902 static SDValue splitIntVSETCC(EVT VT, SDValue LHS, SDValue RHS,
22903                               ISD::CondCode Cond, SelectionDAG &DAG,
22904                               const SDLoc &dl) {
22905   assert(VT.isInteger() && VT == LHS.getValueType() &&
22906          VT == RHS.getValueType() && "Unsupported VTs!");
22907 
22908   SDValue CC = DAG.getCondCode(Cond);
22909 
22910   // Extract the LHS Lo/Hi vectors
22911   SDValue LHS1, LHS2;
22912   std::tie(LHS1, LHS2) = splitVector(LHS, DAG, dl);
22913 
22914   // Extract the RHS Lo/Hi vectors
22915   SDValue RHS1, RHS2;
22916   std::tie(RHS1, RHS2) = splitVector(RHS, DAG, dl);
22917 
22918   // Issue the operation on the smaller types and concatenate the result back
22919   EVT LoVT, HiVT;
22920   std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
22921   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
22922                      DAG.getNode(ISD::SETCC, dl, LoVT, LHS1, RHS1, CC),
22923                      DAG.getNode(ISD::SETCC, dl, HiVT, LHS2, RHS2, CC));
22924 }
22925 
22926 static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
22927 
22928   SDValue Op0 = Op.getOperand(0);
22929   SDValue Op1 = Op.getOperand(1);
22930   SDValue CC = Op.getOperand(2);
22931   MVT VT = Op.getSimpleValueType();
22932   SDLoc dl(Op);
22933 
22934   assert(VT.getVectorElementType() == MVT::i1 &&
22935          "Cannot set masked compare for this operation");
22936 
22937   ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
22938 
22939   // Prefer SETGT over SETLT.
22940   if (SetCCOpcode == ISD::SETLT) {
22941     SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
22942     std::swap(Op0, Op1);
22943   }
22944 
22945   return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
22946 }
22947 
22948 /// Given a buildvector constant, return a new vector constant with each element
22949 /// incremented or decremented. If incrementing or decrementing would result in
22950 /// unsigned overflow or underflow or this is not a simple vector constant,
22951 /// return an empty value.
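/// E.g. with IsInc == false, <4 x i32> <1, 2, 3, 4> becomes <0, 1, 2, 3>,
/// while any vector containing a zero element yields an empty SDValue.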
22952 static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc) {
22953   auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
22954   if (!BV)
22955     return SDValue();
22956 
22957   MVT VT = V.getSimpleValueType();
22958   MVT EltVT = VT.getVectorElementType();
22959   unsigned NumElts = VT.getVectorNumElements();
22960   SmallVector<SDValue, 8> NewVecC;
22961   SDLoc DL(V);
22962   for (unsigned i = 0; i < NumElts; ++i) {
22963     auto *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
22964     if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
22965       return SDValue();
22966 
22967     // Avoid overflow/underflow.
22968     const APInt &EltC = Elt->getAPIntValue();
22969     if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isNullValue()))
22970       return SDValue();
22971 
22972     NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT));
22973   }
22974 
22975   return DAG.getBuildVector(VT, DL, NewVecC);
22976 }
22977 
22978 /// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
22979 /// Op0 u<= Op1:
22980 ///   t = psubus Op0, Op1
22981 ///   pcmpeq t, <0..0>
22982 static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
22983                                     ISD::CondCode Cond, const SDLoc &dl,
22984                                     const X86Subtarget &Subtarget,
22985                                     SelectionDAG &DAG) {
22986   if (!Subtarget.hasSSE2())
22987     return SDValue();
22988 
22989   MVT VET = VT.getVectorElementType();
22990   if (VET != MVT::i8 && VET != MVT::i16)
22991     return SDValue();
22992 
22993   switch (Cond) {
22994   default:
22995     return SDValue();
22996   case ISD::SETULT: {
22997     // If the comparison is against a constant we can turn this into a
22998     // setule.  With psubus, setule does not require a swap.  This is
22999     // beneficial because the constant in the register is no longer
23000     // clobbered as the destination, so it can be hoisted out of a loop.
23001     // Only do this pre-AVX since vpcmp* is no longer destructive.
23002     if (Subtarget.hasAVX())
23003       return SDValue();
23004     SDValue ULEOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/false);
23005     if (!ULEOp1)
23006       return SDValue();
23007     Op1 = ULEOp1;
23008     break;
23009   }
23010   case ISD::SETUGT: {
23011     // If the comparison is against a constant, we can turn this into a setuge.
23012     // This is beneficial because materializing a constant 0 for the PCMPEQ is
23013     // probably cheaper than XOR+PCMPGT using 2 different vector constants:
23014     // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
23015     SDValue UGEOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/true);
23016     if (!UGEOp1)
23017       return SDValue();
23018     Op1 = Op0;
23019     Op0 = UGEOp1;
23020     break;
23021   }
23022   // Psubus is better than flip-sign because it requires no inversion.
23023   case ISD::SETUGE:
23024     std::swap(Op0, Op1);
23025     break;
23026   case ISD::SETULE:
23027     break;
23028   }
23029 
23030   SDValue Result = DAG.getNode(ISD::USUBSAT, dl, VT, Op0, Op1);
23031   return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
23032                      DAG.getConstant(0, dl, VT));
23033 }
23034 
23035 static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
23036                            SelectionDAG &DAG) {
23037   bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
23038                   Op.getOpcode() == ISD::STRICT_FSETCCS;
23039   SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
23040   SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
23041   SDValue CC = Op.getOperand(IsStrict ? 3 : 2);
23042   MVT VT = Op->getSimpleValueType(0);
23043   ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
23044   bool isFP = Op1.getSimpleValueType().isFloatingPoint();
23045   SDLoc dl(Op);
23046 
23047   if (isFP) {
23048 #ifndef NDEBUG
23049     MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
23050     assert(EltVT == MVT::f32 || EltVT == MVT::f64);
23051 #endif
23052 
23053     bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
23054     SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
23055 
23056     // If we have a strict compare with a vXi1 result and the input is 128/256
23057     // bits we can't use a masked compare unless we have VLX. If we use a wider
23058     // compare like we do for non-strict, we might trigger spurious exceptions
23059     // from the upper elements. Instead emit an AVX compare and convert to mask.
23060     unsigned Opc;
23061     if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1 &&
23062         (!IsStrict || Subtarget.hasVLX() ||
23063          Op0.getSimpleValueType().is512BitVector())) {
23064       assert(VT.getVectorNumElements() <= 16);
23065       Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM;
23066     } else {
23067       Opc = IsStrict ? X86ISD::STRICT_CMPP : X86ISD::CMPP;
23068       // The SSE/AVX packed FP comparison nodes are defined with a
23069       // floating-point vector result that matches the operand type. This allows
23070       // them to work with an SSE1 target (integer vector types are not legal).
23071       VT = Op0.getSimpleValueType();
23072     }
23073 
23074     SDValue Cmp;
23075     bool IsAlwaysSignaling;
23076     unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1, IsAlwaysSignaling);
23077     if (!Subtarget.hasAVX()) {
23078       // TODO: We could use following steps to handle a quiet compare with
23079       // signaling encodings.
23080       // 1. Get ordered masks from a quiet ISD::SETO
23081       // 2. Use the masks to mask potential unordered elements in operand A, B
23082       // 3. Get the compare results of masked A, B
23083       // 4. Calculate the final result using the mask and the result from 3
23084       // But currently, we just fall back to scalar operations.
23085       if (IsStrict && IsAlwaysSignaling && !IsSignaling)
23086         return SDValue();
23087 
23088       // Insert an extra signaling instruction to raise exception.
23089       if (IsStrict && !IsAlwaysSignaling && IsSignaling) {
23090         SDValue SignalCmp = DAG.getNode(
23091             Opc, dl, {VT, MVT::Other},
23092             {Chain, Op0, Op1, DAG.getTargetConstant(1, dl, MVT::i8)}); // LT_OS
23093         // FIXME: It seems we need to update the flags of all new strict nodes.
23094         // Otherwise, mayRaiseFPException in MI will return false due to
23095         // NoFPExcept = false by default. However, I didn't find it in other
23096         // patches.
23097         SignalCmp->setFlags(Op->getFlags());
23098         Chain = SignalCmp.getValue(1);
23099       }
23100 
23101       // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
23102       // emit two comparisons and a logic op to tie them together.
23103       if (SSECC >= 8) {
23104         // LLVM predicate is SETUEQ or SETONE.
23105         unsigned CC0, CC1;
23106         unsigned CombineOpc;
23107         if (Cond == ISD::SETUEQ) {
23108           CC0 = 3; // UNORD
23109           CC1 = 0; // EQ
23110           CombineOpc = X86ISD::FOR;
23111         } else {
23112           assert(Cond == ISD::SETONE);
23113           CC0 = 7; // ORD
23114           CC1 = 4; // NEQ
23115           CombineOpc = X86ISD::FAND;
23116         }
23117 
23118         SDValue Cmp0, Cmp1;
23119         if (IsStrict) {
23120           Cmp0 = DAG.getNode(
23121               Opc, dl, {VT, MVT::Other},
23122               {Chain, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8)});
23123           Cmp1 = DAG.getNode(
23124               Opc, dl, {VT, MVT::Other},
23125               {Chain, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)});
23126           Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Cmp0.getValue(1),
23127                               Cmp1.getValue(1));
23128         } else {
23129           Cmp0 = DAG.getNode(
23130               Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8));
23131           Cmp1 = DAG.getNode(
23132               Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8));
23133         }
23134         Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
23135       } else {
23136         if (IsStrict) {
23137           Cmp = DAG.getNode(
23138               Opc, dl, {VT, MVT::Other},
23139               {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
23140           Chain = Cmp.getValue(1);
23141         } else
23142           Cmp = DAG.getNode(
23143               Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
23144       }
23145     } else {
23146       // Handle all other FP comparisons here.
23147       if (IsStrict) {
23148         // Set bit 4 of the AVX CC when the requested signaling differs from the predicate's default.
23149         SSECC |= (IsAlwaysSignaling ^ IsSignaling) << 4;
23150         Cmp = DAG.getNode(
23151             Opc, dl, {VT, MVT::Other},
23152             {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
23153         Chain = Cmp.getValue(1);
23154       } else
23155         Cmp = DAG.getNode(
23156             Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
23157     }
23158 
23159     if (VT.getFixedSizeInBits() >
23160         Op.getSimpleValueType().getFixedSizeInBits()) {
23161       // We emitted a compare with an XMM/YMM result. Finish converting to a
23162       // mask register using a vptestm.
23163       EVT CastVT = EVT(VT).changeVectorElementTypeToInteger();
23164       Cmp = DAG.getBitcast(CastVT, Cmp);
23165       Cmp = DAG.getSetCC(dl, Op.getSimpleValueType(), Cmp,
23166                          DAG.getConstant(0, dl, CastVT), ISD::SETNE);
23167     } else {
23168       // If this is SSE/AVX CMPP, bitcast the result back to integer to match
23169       // the result type of SETCC. The bitcast is expected to be optimized
23170       // away during combining/isel.
23171       Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
23172     }
23173 
23174     if (IsStrict)
23175       return DAG.getMergeValues({Cmp, Chain}, dl);
23176 
23177     return Cmp;
23178   }
23179 
23180   assert(!IsStrict && "Strict SETCC only handles FP operands.");
23181 
23182   MVT VTOp0 = Op0.getSimpleValueType();
23183   (void)VTOp0;
23184   assert(VTOp0 == Op1.getSimpleValueType() &&
23185          "Expected operands with same type!");
23186   assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
23187          "Invalid number of packed elements for source and destination!");
23188 
23189   // The non-AVX512 code below works under the assumption that source and
23190   // destination types are the same.
23191   assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
23192          "Value types for source and destination must be the same!");
23193 
23194   // The result is boolean, but operands are int/float
23195   if (VT.getVectorElementType() == MVT::i1) {
23196     // In the AVX-512 architecture, setcc returns a mask with i1 elements,
23197     // but there is no compare instruction for i8 and i16 elements in KNL.
23198     assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
23199            "Unexpected operand type");
23200     return LowerIntVSETCC_AVX512(Op, DAG);
23201   }
23202 
23203   // Lower using XOP integer comparisons.
23204   if (VT.is128BitVector() && Subtarget.hasXOP()) {
23205     // Translate compare code to XOP PCOM compare mode.
23206     unsigned CmpMode = 0;
23207     switch (Cond) {
23208     default: llvm_unreachable("Unexpected SETCC condition");
23209     case ISD::SETULT:
23210     case ISD::SETLT: CmpMode = 0x00; break;
23211     case ISD::SETULE:
23212     case ISD::SETLE: CmpMode = 0x01; break;
23213     case ISD::SETUGT:
23214     case ISD::SETGT: CmpMode = 0x02; break;
23215     case ISD::SETUGE:
23216     case ISD::SETGE: CmpMode = 0x03; break;
23217     case ISD::SETEQ: CmpMode = 0x04; break;
23218     case ISD::SETNE: CmpMode = 0x05; break;
23219     }
23220 
23221     // Are we comparing unsigned or signed integers?
23222     unsigned Opc =
23223         ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
23224 
23225     return DAG.getNode(Opc, dl, VT, Op0, Op1,
23226                        DAG.getTargetConstant(CmpMode, dl, MVT::i8));
23227   }
23228 
23229   // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
23230   // Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
23231   if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
23232     SDValue BC0 = peekThroughBitcasts(Op0);
23233     if (BC0.getOpcode() == ISD::AND) {
23234       APInt UndefElts;
23235       SmallVector<APInt, 64> EltBits;
23236       if (getTargetConstantBitsFromNode(BC0.getOperand(1),
23237                                         VT.getScalarSizeInBits(), UndefElts,
23238                                         EltBits, false, false)) {
23239         if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) {
23240           Cond = ISD::SETEQ;
23241           Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
23242         }
23243       }
23244     }
23245   }
23246 
23247   // ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2.
23248   if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND &&
23249       Op0.getOperand(1) == Op1 && Op0.hasOneUse()) {
23250     ConstantSDNode *C1 = isConstOrConstSplat(Op1);
23251     if (C1 && C1->getAPIntValue().isPowerOf2()) {
23252       unsigned BitWidth = VT.getScalarSizeInBits();
23253       unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1;
23254 
23255       SDValue Result = Op0.getOperand(0);
23256       Result = DAG.getNode(ISD::SHL, dl, VT, Result,
23257                            DAG.getConstant(ShiftAmt, dl, VT));
23258       Result = DAG.getNode(ISD::SRA, dl, VT, Result,
23259                            DAG.getConstant(BitWidth - 1, dl, VT));
23260       return Result;
23261     }
23262   }
23263 
23264   // Break 256-bit integer vector compare into smaller ones.
23265   if (VT.is256BitVector() && !Subtarget.hasInt256())
23266     return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
23267 
23268   if (VT == MVT::v32i16 || VT == MVT::v64i8) {
23269     assert(!Subtarget.hasBWI() && "Unexpected VT with AVX512BW!");
23270     return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
23271   }
23272 
23273   // If we have a limit constant, try to form PCMPGT (signed cmp) to avoid
23274   // not-of-PCMPEQ:
23275   // X != INT_MIN --> X >s INT_MIN
23276   // X != INT_MAX --> X <s INT_MAX --> INT_MAX >s X
23277   // +X != 0 --> +X >s 0
23278   APInt ConstValue;
23279   if (Cond == ISD::SETNE &&
23280       ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
23281     if (ConstValue.isMinSignedValue())
23282       Cond = ISD::SETGT;
23283     else if (ConstValue.isMaxSignedValue())
23284       Cond = ISD::SETLT;
23285     else if (ConstValue.isNullValue() && DAG.SignBitIsZero(Op0))
23286       Cond = ISD::SETGT;
23287   }
23288 
23289   // If both operands are known non-negative, then an unsigned compare is the
23290   // same as a signed compare and there's no need to flip signbits.
23291   // TODO: We could check for more general simplifications here since we're
23292   // computing known bits.
23293   bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
23294                    !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
23295 
23296   // Special case: Use min/max operations for unsigned compares.
23297   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23298   if (ISD::isUnsignedIntSetCC(Cond) &&
23299       (FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
23300       TLI.isOperationLegal(ISD::UMIN, VT)) {
23301     // If we have a constant operand, increment/decrement it and change the
23302     // condition to avoid an invert.
23303     if (Cond == ISD::SETUGT) {
23304       // X > C --> X >= (C+1) --> X == umax(X, C+1)
23305       if (SDValue UGTOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/true)) {
23306         Op1 = UGTOp1;
23307         Cond = ISD::SETUGE;
23308       }
23309     }
23310     if (Cond == ISD::SETULT) {
23311       // X < C --> X <= (C-1) --> X == umin(X, C-1)
23312       if (SDValue ULTOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/false)) {
23313         Op1 = ULTOp1;
23314         Cond = ISD::SETULE;
23315       }
23316     }
23317     bool Invert = false;
23318     unsigned Opc;
23319     switch (Cond) {
23320     default: llvm_unreachable("Unexpected condition code");
23321     case ISD::SETUGT: Invert = true; LLVM_FALLTHROUGH;
23322     case ISD::SETULE: Opc = ISD::UMIN; break;
23323     case ISD::SETULT: Invert = true; LLVM_FALLTHROUGH;
23324     case ISD::SETUGE: Opc = ISD::UMAX; break;
23325     }
23326 
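    // E.g. X <=u Y becomes (X == umin(X, Y)), and X <u Y becomes the
    // logical-not of (X == umax(X, Y)).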
23327     SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
23328     Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
23329 
23330     // If the logical-not of the result is required, perform that now.
23331     if (Invert)
23332       Result = DAG.getNOT(dl, Result, VT);
23333 
23334     return Result;
23335   }
23336 
23337   // Try to use SUBUS and PCMPEQ.
23338   if (FlipSigns)
23339     if (SDValue V =
23340             LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
23341       return V;
23342 
23343   // We are handling one of the integer comparisons here. Since SSE only has
23344   // GT and EQ comparisons for integer, swapping operands and multiple
23345   // operations may be required for some comparisons.
23346   unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
23347                                                             : X86ISD::PCMPGT;
23348   bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
23349               Cond == ISD::SETGE || Cond == ISD::SETUGE;
23350   bool Invert = Cond == ISD::SETNE ||
23351                 (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
23352 
23353   if (Swap)
23354     std::swap(Op0, Op1);
23355 
23356   // Check that the operation in question is available (most are plain SSE2,
23357   // but PCMPGTQ and PCMPEQQ have different requirements).
23358   if (VT == MVT::v2i64) {
23359     if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
23360       assert(Subtarget.hasSSE2() && "Don't know how to lower!");
23361 
23362       // Special case for sign bit test. We can use a v4i32 PCMPGT and shuffle
23363       // the odd elements over the even elements.
23364       if (!FlipSigns && !Invert && ISD::isBuildVectorAllZeros(Op0.getNode())) {
23365         Op0 = DAG.getConstant(0, dl, MVT::v4i32);
23366         Op1 = DAG.getBitcast(MVT::v4i32, Op1);
23367 
23368         SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
23369         static const int MaskHi[] = { 1, 1, 3, 3 };
23370         SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
23371 
23372         return DAG.getBitcast(VT, Result);
23373       }
23374 
23375       if (!FlipSigns && !Invert && ISD::isBuildVectorAllOnes(Op1.getNode())) {
23376         Op0 = DAG.getBitcast(MVT::v4i32, Op0);
23377         Op1 = DAG.getConstant(-1, dl, MVT::v4i32);
23378 
23379         SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
23380         static const int MaskHi[] = { 1, 1, 3, 3 };
23381         SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
23382 
23383         return DAG.getBitcast(VT, Result);
23384       }
23385 
23386       // Since SSE has no unsigned integer comparisons, we need to flip the sign
23387       // bits of the inputs before performing those operations. The lower
23388       // compare is always unsigned.
23389       SDValue SB;
23390       if (FlipSigns) {
23391         SB = DAG.getConstant(0x8000000080000000ULL, dl, MVT::v2i64);
23392       } else {
23393         SB = DAG.getConstant(0x0000000080000000ULL, dl, MVT::v2i64);
23394       }
23395       Op0 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op0, SB);
23396       Op1 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op1, SB);
23397 
23398       // Cast everything to the right type.
23399       Op0 = DAG.getBitcast(MVT::v4i32, Op0);
23400       Op1 = DAG.getBitcast(MVT::v4i32, Op1);
23401 
23402       // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
23403       SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
23404       SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
23405 
23406       // Create masks for only the low parts/high parts of the 64 bit integers.
23407       static const int MaskHi[] = { 1, 1, 3, 3 };
23408       static const int MaskLo[] = { 0, 0, 2, 2 };
23409       SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
23410       SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
23411       SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
23412 
23413       SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
23414       Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
23415 
23416       if (Invert)
23417         Result = DAG.getNOT(dl, Result, MVT::v4i32);
23418 
23419       return DAG.getBitcast(VT, Result);
23420     }
23421 
23422     if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
23423       // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
23424       // pcmpeqd + pshufd + pand.
23425       assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
23426 
23427       // First cast everything to the right type.
23428       Op0 = DAG.getBitcast(MVT::v4i32, Op0);
23429       Op1 = DAG.getBitcast(MVT::v4i32, Op1);
23430 
23431       // Do the compare.
23432       SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
23433 
23434       // Make sure the lower and upper halves are both all-ones.
23435       static const int Mask[] = { 1, 0, 3, 2 };
23436       SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
23437       Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
23438 
23439       if (Invert)
23440         Result = DAG.getNOT(dl, Result, MVT::v4i32);
23441 
23442       return DAG.getBitcast(VT, Result);
23443     }
23444   }
23445 
23446   // Since SSE has no unsigned integer comparisons, we need to flip the sign
23447   // bits of the inputs before performing those operations.
23448   if (FlipSigns) {
23449     MVT EltVT = VT.getVectorElementType();
23450     SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
23451                                  VT);
23452     Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
23453     Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
23454   }
23455 
23456   SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
23457 
23458   // If the logical-not of the result is required, perform that now.
23459   if (Invert)
23460     Result = DAG.getNOT(dl, Result, VT);
23461 
23462   return Result;
23463 }
23464 
23465 // Try to select this as a KORTEST+SETCC or KTEST+SETCC if possible.
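// KORTEST sets ZF when the OR of its operands is all zeros and CF when it is
// all ones; KTEST sets ZF when the AND of its operands is all zeros. That is
// what lets the compares against 0 / all-ones below map onto E/NE and B/AE.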
23466 static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC,
23467                               const SDLoc &dl, SelectionDAG &DAG,
23468                               const X86Subtarget &Subtarget,
23469                               SDValue &X86CC) {
23470   // Only support equality comparisons.
23471   if (CC != ISD::SETEQ && CC != ISD::SETNE)
23472     return SDValue();
23473 
23474   // Must be a bitcast from vXi1.
23475   if (Op0.getOpcode() != ISD::BITCAST)
23476     return SDValue();
23477 
23478   Op0 = Op0.getOperand(0);
23479   MVT VT = Op0.getSimpleValueType();
23480   if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) &&
23481       !(Subtarget.hasDQI() && VT == MVT::v8i1) &&
23482       !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
23483     return SDValue();
23484 
23485   X86::CondCode X86Cond;
23486   if (isNullConstant(Op1)) {
23487     X86Cond = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
23488   } else if (isAllOnesConstant(Op1)) {
23489     // C flag is set for all ones.
23490     X86Cond = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE;
23491   } else
23492     return SDValue();
23493 
23494   // If the input is an AND, we can combine its operands into the KTEST.
23495   bool KTestable = false;
23496   if (Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1))
23497     KTestable = true;
23498   if (Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1))
23499     KTestable = true;
23500   if (!isNullConstant(Op1))
23501     KTestable = false;
23502   if (KTestable && Op0.getOpcode() == ISD::AND && Op0.hasOneUse()) {
23503     SDValue LHS = Op0.getOperand(0);
23504     SDValue RHS = Op0.getOperand(1);
23505     X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
23506     return DAG.getNode(X86ISD::KTEST, dl, MVT::i32, LHS, RHS);
23507   }
23508 
23509   // If the input is an OR, we can combine its operands into the KORTEST.
23510   SDValue LHS = Op0;
23511   SDValue RHS = Op0;
23512   if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) {
23513     LHS = Op0.getOperand(0);
23514     RHS = Op0.getOperand(1);
23515   }
23516 
23517   X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
23518   return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
23519 }
23520 
23521 /// Emit flags for the given setcc condition and operands. Also returns the
23522 /// corresponding X86 condition code constant in X86CC.
23523 SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
23524                                              ISD::CondCode CC, const SDLoc &dl,
23525                                              SelectionDAG &DAG,
23526                                              SDValue &X86CC) const {
23527   // Optimize to BT if possible.
23528   // Lower (X & (1 << N)) == 0 to BT(X, N).
23529   // Lower ((X >>u N) & 1) != 0 to BT(X, N).
23530   // Lower ((X >>s N) & 1) != 0 to BT(X, N).
23531   if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1) &&
23532       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
23533     if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CC))
23534       return BT;
23535   }
23536 
23537   // Try to use PTEST/PMOVMSKB for a tree ORs equality compared with 0.
23538   // TODO: We could do AND tree with all 1s as well by using the C flag.
23539   if (isNullConstant(Op1) && (CC == ISD::SETEQ || CC == ISD::SETNE))
23540     if (SDValue CmpZ =
23541             MatchVectorAllZeroTest(Op0, CC, dl, Subtarget, DAG, X86CC))
23542       return CmpZ;
23543 
23544   // Try to lower using KORTEST or KTEST.
23545   if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))
23546     return Test;
23547 
23548   // Look for X == 0, X == 1, X != 0, or X != 1.  We can simplify some forms of
23549   // these.
23550   if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
23551       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
23552     // If the input is a setcc, then reuse the input setcc or use a new one with
23553     // the inverted condition.
23554     if (Op0.getOpcode() == X86ISD::SETCC) {
23555       bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
23556 
23557       X86CC = Op0.getOperand(0);
23558       if (Invert) {
23559         X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
23560         CCode = X86::GetOppositeBranchCondition(CCode);
23561         X86CC = DAG.getTargetConstant(CCode, dl, MVT::i8);
23562       }
23563 
23564       return Op0.getOperand(1);
23565     }
23566   }
23567 
23568   // Try to use the carry flag from the add in place of a separate CMP for:
23569   // (seteq (add X, -1), -1). Similar for setne.
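  // (add X, -1) produces a carry exactly when X != 0, so CF == 0 (COND_AE)
  // proves X - 1 == -1 and CF == 1 (COND_B) proves the inequality.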
23570   if (isAllOnesConstant(Op1) && Op0.getOpcode() == ISD::ADD &&
23571       Op0.getOperand(1) == Op1 && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
23572     if (isProfitableToUseFlagOp(Op0)) {
23573       SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
23574 
23575       SDValue New = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(0),
23576                                 Op0.getOperand(1));
23577       DAG.ReplaceAllUsesOfValueWith(SDValue(Op0.getNode(), 0), New);
23578       X86::CondCode CCode = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
23579       X86CC = DAG.getTargetConstant(CCode, dl, MVT::i8);
23580       return SDValue(New.getNode(), 1);
23581     }
23582   }
23583 
23584   X86::CondCode CondCode =
23585       TranslateX86CC(CC, dl, /*IsFP*/ false, Op0, Op1, DAG);
23586   assert(CondCode != X86::COND_INVALID && "Unexpected condition code!");
23587 
23588   SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget);
23589   X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
23590   return EFLAGS;
23591 }
23592 
23593 SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
23594 
23595   bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
23596                   Op.getOpcode() == ISD::STRICT_FSETCCS;
23597   MVT VT = Op->getSimpleValueType(0);
23598 
23599   if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
23600 
23601   assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
23602   SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
23603   SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
23604   SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
23605   SDLoc dl(Op);
23606   ISD::CondCode CC =
23607       cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();
23608 
23609   // Handle f128 first, since one possible outcome is a normal integer
23610   // comparison which gets handled by emitFlagsForSetcc.
23611   if (Op0.getValueType() == MVT::f128) {
23612     softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1, Chain,
23613                         Op.getOpcode() == ISD::STRICT_FSETCCS);
23614 
23615     // If softenSetCCOperands returned a scalar, use it.
23616     if (!Op1.getNode()) {
23617       assert(Op0.getValueType() == Op.getValueType() &&
23618              "Unexpected setcc expansion!");
23619       if (IsStrict)
23620         return DAG.getMergeValues({Op0, Chain}, dl);
23621       return Op0;
23622     }
23623   }
23624 
23625   if (Op0.getSimpleValueType().isInteger()) {
    // Attempt to canonicalize SGT/UGT -> SGE/UGE compares with a constant,
    // which reduces the number of EFLAGS bit reads (the GE conditions don't
    // read ZF); this may translate to fewer uops depending on the uarch
    // implementation. The equivalent for SLE/ULE -> SLT/ULT isn't likely to
    // happen as we already canonicalize to that CondCode.
    // NOTE: Only do this if incrementing the constant doesn't increase the bit
    // encoding size - so it must either already be an i8 or i32 immediate, or
    // it shrinks down to that. We don't do this for i64 to avoid additional
    // constant materializations.
    // TODO: Can we move this to TranslateX86CC to handle jumps/branches too?
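    // For example, (setugt X, 15) becomes (setuge X, 16): both constants still
    // encode as an i8 immediate, but JAE/SETAE only read CF whereas JA/SETA
    // read both CF and ZF (illustrative).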
23636     if (auto *Op1C = dyn_cast<ConstantSDNode>(Op1)) {
23637       const APInt &Op1Val = Op1C->getAPIntValue();
23638       if (!Op1Val.isNullValue()) {
23639         // Ensure the constant+1 doesn't overflow.
23640         if ((CC == ISD::CondCode::SETGT && !Op1Val.isMaxSignedValue()) ||
23641             (CC == ISD::CondCode::SETUGT && !Op1Val.isMaxValue())) {
23642           APInt Op1ValPlusOne = Op1Val + 1;
23643           if (Op1ValPlusOne.isSignedIntN(32) &&
23644               (!Op1Val.isSignedIntN(8) || Op1ValPlusOne.isSignedIntN(8))) {
23645             Op1 = DAG.getConstant(Op1ValPlusOne, dl, Op0.getValueType());
23646             CC = CC == ISD::CondCode::SETGT ? ISD::CondCode::SETGE
23647                                             : ISD::CondCode::SETUGE;
23648           }
23649         }
23650       }
23651     }
23652 
23653     SDValue X86CC;
23654     SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC);
23655     SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
23656     return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
23657   }
23658 
23659   // Handle floating point.
23660   X86::CondCode CondCode = TranslateX86CC(CC, dl, /*IsFP*/ true, Op0, Op1, DAG);
23661   if (CondCode == X86::COND_INVALID)
23662     return SDValue();
23663 
23664   SDValue EFLAGS;
23665   if (IsStrict) {
23666     bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
23667     EFLAGS =
23668         DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP,
23669                     dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1});
23670     Chain = EFLAGS.getValue(1);
23671   } else {
23672     EFLAGS = DAG.getNode(X86ISD::FCMP, dl, MVT::i32, Op0, Op1);
23673   }
23674 
23675   SDValue X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
23676   SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
23677   return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
23678 }
23679 
SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
23681   SDValue LHS = Op.getOperand(0);
23682   SDValue RHS = Op.getOperand(1);
23683   SDValue Carry = Op.getOperand(2);
23684   SDValue Cond = Op.getOperand(3);
23685   SDLoc DL(Op);
23686 
23687   assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
23688   X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
23689 
23690   // Recreate the carry if needed.
23691   EVT CarryVT = Carry.getValueType();
23692   Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
23693                       Carry, DAG.getAllOnesConstant(DL, CarryVT));
23694 
23695   SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
23696   SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
23697   return getSETCC(CC, Cmp.getValue(1), DL, DAG);
23698 }
23699 
// This function returns three things: the arithmetic computation itself
// (Value), an EFLAGS result (Overflow), and a condition code (Cond). The
// flag and the condition code define the case in which the arithmetic
// computation overflows.
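// For example, (uaddo X, Y) with a non-constant Y maps to (X86ISD::ADD X, Y)
// with Cond = X86::COND_B, i.e. the overflow result is simply the carry flag
// of the addition (illustrative).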
23704 static std::pair<SDValue, SDValue>
getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) {
23706   assert(Op.getResNo() == 0 && "Unexpected result number!");
23707   SDValue Value, Overflow;
23708   SDValue LHS = Op.getOperand(0);
23709   SDValue RHS = Op.getOperand(1);
23710   unsigned BaseOp = 0;
23711   SDLoc DL(Op);
23712   switch (Op.getOpcode()) {
23713   default: llvm_unreachable("Unknown ovf instruction!");
23714   case ISD::SADDO:
23715     BaseOp = X86ISD::ADD;
23716     Cond = X86::COND_O;
23717     break;
23718   case ISD::UADDO:
23719     BaseOp = X86ISD::ADD;
23720     Cond = isOneConstant(RHS) ? X86::COND_E : X86::COND_B;
23721     break;
23722   case ISD::SSUBO:
23723     BaseOp = X86ISD::SUB;
23724     Cond = X86::COND_O;
23725     break;
23726   case ISD::USUBO:
23727     BaseOp = X86ISD::SUB;
23728     Cond = X86::COND_B;
23729     break;
23730   case ISD::SMULO:
23731     BaseOp = X86ISD::SMUL;
23732     Cond = X86::COND_O;
23733     break;
23734   case ISD::UMULO:
23735     BaseOp = X86ISD::UMUL;
23736     Cond = X86::COND_O;
23737     break;
23738   }
23739 
23740   if (BaseOp) {
23741     // Also sets EFLAGS.
23742     SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
23743     Value = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
23744     Overflow = Value.getValue(1);
23745   }
23746 
23747   return std::make_pair(Value, Overflow);
23748 }
23749 
static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
  // Lower the "add/sub/mul with overflow" instruction into a regular
  // instruction plus a "setcc" instruction that checks the overflow flag. The
  // "brcond" lowering looks for this combo and may remove the "setcc"
  // instruction if the "setcc" has only one use.
23755   SDLoc DL(Op);
23756   X86::CondCode Cond;
23757   SDValue Value, Overflow;
23758   std::tie(Value, Overflow) = getX86XALUOOp(Cond, Op, DAG);
23759 
23760   SDValue SetCC = getSETCC(Cond, Overflow, DL, DAG);
23761   assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!");
23762   return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC);
23763 }
23764 
/// Return true if the opcode is an X86 logical comparison.
static bool isX86LogicalCmp(SDValue Op) {
23767   unsigned Opc = Op.getOpcode();
23768   if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
23769       Opc == X86ISD::FCMP)
23770     return true;
23771   if (Op.getResNo() == 1 &&
23772       (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
23773        Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
23774        Opc == X86ISD::OR || Opc == X86ISD::XOR || Opc == X86ISD::AND))
23775     return true;
23776 
23777   return false;
23778 }
23779 
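/// Return true if \p V is a truncate whose discarded high bits are known to be
/// zero, so callers may look through the truncate when testing against zero.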
static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
23781   if (V.getOpcode() != ISD::TRUNCATE)
23782     return false;
23783 
23784   SDValue VOp0 = V.getOperand(0);
23785   unsigned InBits = VOp0.getValueSizeInBits();
23786   unsigned Bits = V.getValueSizeInBits();
23787   return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
23788 }
23789 
SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
23791   bool AddTest = true;
23792   SDValue Cond  = Op.getOperand(0);
23793   SDValue Op1 = Op.getOperand(1);
23794   SDValue Op2 = Op.getOperand(2);
23795   SDLoc DL(Op);
23796   MVT VT = Op1.getSimpleValueType();
23797   SDValue CC;
23798 
  // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE
  // ops are available, or into VBLENDV when AVX is available.
  // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
23802   if (Cond.getOpcode() == ISD::SETCC && isScalarFPTypeInSSEReg(VT) &&
23803       VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
23804     SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
23805     bool IsAlwaysSignaling;
23806     unsigned SSECC =
23807         translateX86FSETCC(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
23808                            CondOp0, CondOp1, IsAlwaysSignaling);
23809 
23810     if (Subtarget.hasAVX512()) {
23811       SDValue Cmp =
23812           DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1,
23813                       DAG.getTargetConstant(SSECC, DL, MVT::i8));
23814       assert(!VT.isVector() && "Not a scalar type?");
23815       return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
23816     }
23817 
23818     if (SSECC < 8 || Subtarget.hasAVX()) {
23819       SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
23820                                 DAG.getTargetConstant(SSECC, DL, MVT::i8));
23821 
23822       // If we have AVX, we can use a variable vector select (VBLENDV) instead
23823       // of 3 logic instructions for size savings and potentially speed.
23824       // Unfortunately, there is no scalar form of VBLENDV.
23825 
23826       // If either operand is a +0.0 constant, don't try this. We can expect to
23827       // optimize away at least one of the logic instructions later in that
23828       // case, so that sequence would be faster than a variable blend.
23829 
23830       // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
23831       // uses XMM0 as the selection register. That may need just as many
23832       // instructions as the AND/ANDN/OR sequence due to register moves, so
23833       // don't bother.
23834       if (Subtarget.hasAVX() && !isNullFPConstant(Op1) &&
23835           !isNullFPConstant(Op2)) {
23836         // Convert to vectors, do a VSELECT, and convert back to scalar.
23837         // All of the conversions should be optimized away.
23838         MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
23839         SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
23840         SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
23841         SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
23842 
23843         MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
23844         VCmp = DAG.getBitcast(VCmpVT, VCmp);
23845 
23846         SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
23847 
23848         return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
23849                            VSel, DAG.getIntPtrConstant(0, DL));
23850       }
23851       SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
23852       SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
23853       return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
23854     }
23855   }
23856 
23857   // AVX512 fallback is to lower selects of scalar floats to masked moves.
23858   if (isScalarFPTypeInSSEReg(VT) && Subtarget.hasAVX512()) {
23859     SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
23860     return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
23861   }
23862 
23863   if (Cond.getOpcode() == ISD::SETCC) {
23864     if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
23865       Cond = NewCond;
23866       // If the condition was updated, it's possible that the operands of the
23867       // select were also updated (for example, EmitTest has a RAUW). Refresh
23868       // the local references to the select operands in case they got stale.
23869       Op1 = Op.getOperand(1);
23870       Op2 = Op.getOperand(2);
23871     }
23872   }
23873 
  // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
  // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
  // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
  // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
  // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
  // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
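  // For example, (select (seteq X, 0), -1, Y) can be emitted roughly as
  //   sub $1, X        ; CF = (X == 0)
  //   sbb R, R         ; R  = X == 0 ? -1 : 0
  //   or  Y, R
  // (illustrative pseudo-assembly; the code below builds the equivalent DAG).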
23880   if (Cond.getOpcode() == X86ISD::SETCC &&
23881       Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
23882       isNullConstant(Cond.getOperand(1).getOperand(1))) {
23883     SDValue Cmp = Cond.getOperand(1);
23884     SDValue CmpOp0 = Cmp.getOperand(0);
23885     unsigned CondCode = Cond.getConstantOperandVal(0);
23886 
    // Special handling for __builtin_ffs(X) - 1 pattern which looks like
    // (select (seteq X, 0), -1, (cttz_zero_undef X)). Disable the special
    // handling to keep the CMP with 0. This should be removed by
    // optimizeCompareInst by using the flags from the BSR/TZCNT used for the
    // cttz_zero_undef.
23892     auto MatchFFSMinus1 = [&](SDValue Op1, SDValue Op2) {
23893       return (Op1.getOpcode() == ISD::CTTZ_ZERO_UNDEF && Op1.hasOneUse() &&
23894               Op1.getOperand(0) == CmpOp0 && isAllOnesConstant(Op2));
23895     };
23896     if (Subtarget.hasCMov() && (VT == MVT::i32 || VT == MVT::i64) &&
23897         ((CondCode == X86::COND_NE && MatchFFSMinus1(Op1, Op2)) ||
23898          (CondCode == X86::COND_E && MatchFFSMinus1(Op2, Op1)))) {
23899       // Keep Cmp.
23900     } else if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
23901         (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
23902       SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
23903 
23904       SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
23905       SDVTList CmpVTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
23906 
23907       // Apply further optimizations for special cases
23908       // (select (x != 0), -1, 0) -> neg & sbb
23909       // (select (x == 0), 0, -1) -> neg & sbb
23910       if (isNullConstant(Y) &&
23911           (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
23912         SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
23913         SDValue Neg = DAG.getNode(X86ISD::SUB, DL, CmpVTs, Zero, CmpOp0);
23914         Zero = DAG.getConstant(0, DL, Op.getValueType());
23915         return DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Neg.getValue(1));
23916       }
23917 
23918       Cmp = DAG.getNode(X86ISD::SUB, DL, CmpVTs,
23919                         CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
23920 
23921       SDValue Zero = DAG.getConstant(0, DL, Op.getValueType());
23922       SDValue Res =   // Res = 0 or -1.
23923         DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Cmp.getValue(1));
23924 
23925       if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
23926         Res = DAG.getNOT(DL, Res, Res.getValueType());
23927 
23928       return DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
23929     } else if (!Subtarget.hasCMov() && CondCode == X86::COND_E &&
23930                Cmp.getOperand(0).getOpcode() == ISD::AND &&
23931                isOneConstant(Cmp.getOperand(0).getOperand(1))) {
23932       SDValue Src1, Src2;
      // Return true if Op2 is an XOR or OR operator and one of its operands
      // is equal to Op1:
      // (a, a op b) || (b, a op b)
23936       auto isOrXorPattern = [&]() {
23937         if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
23938             (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
23939           Src1 =
23940               Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
23941           Src2 = Op1;
23942           return true;
23943         }
23944         return false;
23945       };
23946 
23947       if (isOrXorPattern()) {
23948         SDValue Neg;
23949         unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
        // We need a mask of all zeros or all ones with the same size as the
        // other operands.
23952         if (CmpSz > VT.getSizeInBits())
23953           Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
23954         else if (CmpSz < VT.getSizeInBits())
23955           Neg = DAG.getNode(ISD::AND, DL, VT,
23956               DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
23957               DAG.getConstant(1, DL, VT));
23958         else
23959           Neg = CmpOp0;
23960         SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
23961                                    Neg); // -(and (x, 0x1))
23962         SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
23963         return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2);  // And Op y
23964       }
23965     }
23966   }
23967 
23968   // Look past (and (setcc_carry (cmp ...)), 1).
23969   if (Cond.getOpcode() == ISD::AND &&
23970       Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
23971       isOneConstant(Cond.getOperand(1)))
23972     Cond = Cond.getOperand(0);
23973 
23974   // If condition flag is set by a X86ISD::CMP, then use it as the condition
23975   // setting operand in place of the X86ISD::SETCC.
23976   unsigned CondOpcode = Cond.getOpcode();
23977   if (CondOpcode == X86ISD::SETCC ||
23978       CondOpcode == X86ISD::SETCC_CARRY) {
23979     CC = Cond.getOperand(0);
23980 
23981     SDValue Cmp = Cond.getOperand(1);
23982     bool IllegalFPCMov = false;
23983     if (VT.isFloatingPoint() && !VT.isVector() &&
23984         !isScalarFPTypeInSSEReg(VT) && Subtarget.hasCMov())  // FPStack?
23985       IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
23986 
23987     if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
23988         Cmp.getOpcode() == X86ISD::BT) { // FIXME
23989       Cond = Cmp;
23990       AddTest = false;
23991     }
23992   } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
23993              CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
23994              CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
23995     SDValue Value;
23996     X86::CondCode X86Cond;
23997     std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
23998 
23999     CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8);
24000     AddTest = false;
24001   }
24002 
24003   if (AddTest) {
24004     // Look past the truncate if the high bits are known zero.
24005     if (isTruncWithZeroHighBitsInput(Cond, DAG))
24006       Cond = Cond.getOperand(0);
24007 
24008     // We know the result of AND is compared against zero. Try to match
24009     // it to BT.
24010     if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
24011       SDValue BTCC;
24012       if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, BTCC)) {
24013         CC = BTCC;
24014         Cond = BT;
24015         AddTest = false;
24016       }
24017     }
24018   }
24019 
24020   if (AddTest) {
24021     CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
24022     Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
24023   }
24024 
  // a <  b ? -1 :  0 -> RES = ~setcc_carry
  // a <  b ?  0 : -1 -> RES = setcc_carry
  // a >= b ? -1 :  0 -> RES = setcc_carry
  // a >= b ?  0 : -1 -> RES = ~setcc_carry
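  // SETCC_CARRY materializes the carry flag as an all-zeros or all-ones mask
  // (conceptually "sbb reg, reg"), so these selects need no cmov or branch
  // (illustrative).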
24029   if (Cond.getOpcode() == X86ISD::SUB) {
24030     unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
24031 
24032     if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
24033         (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
24034         (isNullConstant(Op1) || isNullConstant(Op2))) {
24035       SDValue Res =
24036           DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
24037                       DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cond);
24038       if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
24039         return DAG.getNOT(DL, Res, Res.getValueType());
24040       return Res;
24041     }
24042   }
24043 
  // X86 doesn't have an i8 cmov. If both operands are the result of a
  // truncate, widen the cmov and push the truncate through. This avoids
  // introducing a new branch during isel and doesn't add any extensions.
24047   if (Op.getValueType() == MVT::i8 &&
24048       Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
24049     SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
24050     if (T1.getValueType() == T2.getValueType() &&
24051         // Exclude CopyFromReg to avoid partial register stalls.
24052         T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
24053       SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
24054                                  CC, Cond);
24055       return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
24056     }
24057   }
24058 
  // Or finally, promote i8 cmovs if we have CMOV,
  //                 or i16 cmovs if it won't prevent folding a load.
  // FIXME: we should not limit promotion of the i8 case to only when the CMOV
  //        is legal, but EmitLoweredSelect() cannot deal with these extensions
  //        being inserted between two CMOVs (in i16 case too TBN).
  //        https://bugs.llvm.org/show_bug.cgi?id=40974
24065   if ((Op.getValueType() == MVT::i8 && Subtarget.hasCMov()) ||
24066       (Op.getValueType() == MVT::i16 && !MayFoldLoad(Op1) &&
24067        !MayFoldLoad(Op2))) {
24068     Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
24069     Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
24070     SDValue Ops[] = { Op2, Op1, CC, Cond };
24071     SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
24072     return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
24073   }
24074 
24075   // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
24076   // condition is true.
24077   SDValue Ops[] = { Op2, Op1, CC, Cond };
24078   return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops);
24079 }
24080 
static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
                                     const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG) {
24084   MVT VT = Op->getSimpleValueType(0);
24085   SDValue In = Op->getOperand(0);
24086   MVT InVT = In.getSimpleValueType();
24087   assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
24088   MVT VTElt = VT.getVectorElementType();
24089   SDLoc dl(Op);
24090 
24091   unsigned NumElts = VT.getVectorNumElements();
24092 
24093   // Extend VT if the scalar type is i8/i16 and BWI is not supported.
24094   MVT ExtVT = VT;
24095   if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
24096     // If v16i32 is to be avoided, we'll need to split and concatenate.
24097     if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
24098       return SplitAndExtendv16i1(Op.getOpcode(), VT, In, dl, DAG);
24099 
24100     ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
24101   }
24102 
24103   // Widen to 512-bits if VLX is not supported.
24104   MVT WideVT = ExtVT;
24105   if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
24106     NumElts *= 512 / ExtVT.getSizeInBits();
24107     InVT = MVT::getVectorVT(MVT::i1, NumElts);
24108     In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT),
24109                      In, DAG.getIntPtrConstant(0, dl));
24110     WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
24111   }
24112 
24113   SDValue V;
24114   MVT WideEltVT = WideVT.getVectorElementType();
24115   if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
24116       (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
24117     V = DAG.getNode(Op.getOpcode(), dl, WideVT, In);
24118   } else {
24119     SDValue NegOne = DAG.getConstant(-1, dl, WideVT);
24120     SDValue Zero = DAG.getConstant(0, dl, WideVT);
24121     V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
24122   }
24123 
24124   // Truncate if we had to extend i16/i8 above.
24125   if (VT != ExtVT) {
24126     WideVT = MVT::getVectorVT(VTElt, NumElts);
24127     V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
24128   }
24129 
24130   // Extract back to 128/256-bit if we widened.
24131   if (WideVT != VT)
24132     V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
24133                     DAG.getIntPtrConstant(0, dl));
24134 
24135   return V;
24136 }
24137 
static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
                               SelectionDAG &DAG) {
24140   SDValue In = Op->getOperand(0);
24141   MVT InVT = In.getSimpleValueType();
24142 
24143   if (InVT.getVectorElementType() == MVT::i1)
24144     return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
24145 
24146   assert(Subtarget.hasAVX() && "Expected AVX support");
24147   return LowerAVXExtend(Op, DAG, Subtarget);
24148 }
24149 
24150 // Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
24151 // For sign extend this needs to handle all vector sizes and SSE4.1 and
24152 // non-SSE4.1 targets. For zero extend this should only handle inputs of
24153 // MVT::v64i8 when BWI is not supported, but AVX512 is.
static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
24157   SDValue In = Op->getOperand(0);
24158   MVT VT = Op->getSimpleValueType(0);
24159   MVT InVT = In.getSimpleValueType();
24160 
24161   MVT SVT = VT.getVectorElementType();
24162   MVT InSVT = InVT.getVectorElementType();
24163   assert(SVT.getFixedSizeInBits() > InSVT.getFixedSizeInBits());
24164 
24165   if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
24166     return SDValue();
24167   if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
24168     return SDValue();
24169   if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
24170       !(VT.is256BitVector() && Subtarget.hasAVX()) &&
24171       !(VT.is512BitVector() && Subtarget.hasAVX512()))
24172     return SDValue();
24173 
24174   SDLoc dl(Op);
24175   unsigned Opc = Op.getOpcode();
24176   unsigned NumElts = VT.getVectorNumElements();
24177 
24178   // For 256-bit vectors, we only need the lower (128-bit) half of the input.
24179   // For 512-bit vectors, we need 128-bits or 256-bits.
24180   if (InVT.getSizeInBits() > 128) {
24181     // Input needs to be at least the same number of elements as output, and
24182     // at least 128-bits.
24183     int InSize = InSVT.getSizeInBits() * NumElts;
24184     In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
24185     InVT = In.getSimpleValueType();
24186   }
24187 
  // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit
  // results, so those are legal and shouldn't occur here. AVX2/AVX512 pmovsx*
  // instructions still need to be handled here for 256/512-bit results.
24191   if (Subtarget.hasInt256()) {
24192     assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
24193 
24194     if (InVT.getVectorNumElements() != NumElts)
24195       return DAG.getNode(Op.getOpcode(), dl, VT, In);
24196 
24197     // FIXME: Apparently we create inreg operations that could be regular
24198     // extends.
24199     unsigned ExtOpc =
24200         Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND
24201                                              : ISD::ZERO_EXTEND;
24202     return DAG.getNode(ExtOpc, dl, VT, In);
24203   }
24204 
24205   // pre-AVX2 256-bit extensions need to be split into 128-bit instructions.
24206   if (Subtarget.hasAVX()) {
24207     assert(VT.is256BitVector() && "256-bit vector expected");
24208     MVT HalfVT = VT.getHalfNumVectorElementsVT();
24209     int HalfNumElts = HalfVT.getVectorNumElements();
24210 
24211     unsigned NumSrcElts = InVT.getVectorNumElements();
24212     SmallVector<int, 16> HiMask(NumSrcElts, SM_SentinelUndef);
24213     for (int i = 0; i != HalfNumElts; ++i)
24214       HiMask[i] = HalfNumElts + i;
24215 
24216     SDValue Lo = DAG.getNode(Opc, dl, HalfVT, In);
24217     SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, DAG.getUNDEF(InVT), HiMask);
24218     Hi = DAG.getNode(Opc, dl, HalfVT, Hi);
24219     return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
24220   }
24221 
24222   // We should only get here for sign extend.
24223   assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!");
24224   assert(VT.is128BitVector() && InVT.is128BitVector() && "Unexpected VTs");
24225 
24226   // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
24227   SDValue Curr = In;
24228   SDValue SignExt = Curr;
24229 
24230   // As SRAI is only available on i16/i32 types, we expand only up to i32
24231   // and handle i64 separately.
24232   if (InVT != MVT::v4i32) {
24233     MVT DestVT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
24234 
24235     unsigned DestWidth = DestVT.getScalarSizeInBits();
24236     unsigned Scale = DestWidth / InSVT.getSizeInBits();
24237 
24238     unsigned InNumElts = InVT.getVectorNumElements();
24239     unsigned DestElts = DestVT.getVectorNumElements();
24240 
    // Build a shuffle mask that takes each input element and places it in the
    // MSBs of the new element size.
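    // For example, extending v16i8 with Scale == 4 uses the mask
    // <u,u,u,0, u,u,u,1, u,u,u,2, u,u,u,3>, placing each source byte in the
    // most significant byte of its i32 lane before the arithmetic shift
    // (illustrative; assumes little-endian lane layout).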
24243     SmallVector<int, 16> Mask(InNumElts, SM_SentinelUndef);
24244     for (unsigned i = 0; i != DestElts; ++i)
24245       Mask[i * Scale + (Scale - 1)] = i;
24246 
24247     Curr = DAG.getVectorShuffle(InVT, dl, In, In, Mask);
24248     Curr = DAG.getBitcast(DestVT, Curr);
24249 
24250     unsigned SignExtShift = DestWidth - InSVT.getSizeInBits();
24251     SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr,
24252                           DAG.getTargetConstant(SignExtShift, dl, MVT::i8));
24253   }
24254 
24255   if (VT == MVT::v2i64) {
24256     assert(Curr.getValueType() == MVT::v4i32 && "Unexpected input VT");
24257     SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
24258     SDValue Sign = DAG.getSetCC(dl, MVT::v4i32, Zero, Curr, ISD::SETGT);
24259     SignExt = DAG.getVectorShuffle(MVT::v4i32, dl, SignExt, Sign, {0, 4, 1, 5});
24260     SignExt = DAG.getBitcast(VT, SignExt);
24261   }
24262 
24263   return SignExt;
24264 }
24265 
static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
                                SelectionDAG &DAG) {
24268   MVT VT = Op->getSimpleValueType(0);
24269   SDValue In = Op->getOperand(0);
24270   MVT InVT = In.getSimpleValueType();
24271   SDLoc dl(Op);
24272 
24273   if (InVT.getVectorElementType() == MVT::i1)
24274     return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
24275 
24276   assert(VT.isVector() && InVT.isVector() && "Expected vector type");
24277   assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
24278          "Expected same number of elements");
24279   assert((VT.getVectorElementType() == MVT::i16 ||
24280           VT.getVectorElementType() == MVT::i32 ||
24281           VT.getVectorElementType() == MVT::i64) &&
24282          "Unexpected element type");
24283   assert((InVT.getVectorElementType() == MVT::i8 ||
24284           InVT.getVectorElementType() == MVT::i16 ||
24285           InVT.getVectorElementType() == MVT::i32) &&
24286          "Unexpected element type");
24287 
24288   if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
24289     assert(InVT == MVT::v32i8 && "Unexpected VT!");
24290     return splitVectorIntUnary(Op, DAG);
24291   }
24292 
24293   if (Subtarget.hasInt256())
24294     return Op;
24295 
24296   // Optimize vectors in AVX mode
24297   // Sign extend  v8i16 to v8i32 and
24298   //              v4i32 to v4i64
24299   //
24300   // Divide input vector into two parts
24301   // for v4i32 the high shuffle mask will be {2, 3, -1, -1}
24302   // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
24303   // concat the vectors to original VT
24304   MVT HalfVT = VT.getHalfNumVectorElementsVT();
24305   SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In);
24306 
24307   unsigned NumElems = InVT.getVectorNumElements();
24308   SmallVector<int,8> ShufMask(NumElems, -1);
24309   for (unsigned i = 0; i != NumElems/2; ++i)
24310     ShufMask[i] = i + NumElems/2;
24311 
24312   SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
24313   OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi);
24314 
24315   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
24316 }
24317 
24318 /// Change a vector store into a pair of half-size vector stores.
static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
24320   SDValue StoredVal = Store->getValue();
24321   assert((StoredVal.getValueType().is256BitVector() ||
24322           StoredVal.getValueType().is512BitVector()) &&
24323          "Expecting 256/512-bit op");
24324 
24325   // Splitting volatile memory ops is not allowed unless the operation was not
24326   // legal to begin with. Assume the input store is legal (this transform is
24327   // only used for targets with AVX). Note: It is possible that we have an
24328   // illegal type like v2i128, and so we could allow splitting a volatile store
24329   // in that case if that is important.
24330   if (!Store->isSimple())
24331     return SDValue();
24332 
24333   SDLoc DL(Store);
24334   SDValue Value0, Value1;
24335   std::tie(Value0, Value1) = splitVector(StoredVal, DAG, DL);
24336   unsigned HalfOffset = Value0.getValueType().getStoreSize();
24337   SDValue Ptr0 = Store->getBasePtr();
24338   SDValue Ptr1 =
24339       DAG.getMemBasePlusOffset(Ptr0, TypeSize::Fixed(HalfOffset), DL);
24340   SDValue Ch0 =
24341       DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),
24342                    Store->getOriginalAlign(),
24343                    Store->getMemOperand()->getFlags());
24344   SDValue Ch1 = DAG.getStore(Store->getChain(), DL, Value1, Ptr1,
24345                              Store->getPointerInfo().getWithOffset(HalfOffset),
24346                              Store->getOriginalAlign(),
24347                              Store->getMemOperand()->getFlags());
24348   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1);
24349 }
24350 
24351 /// Scalarize a vector store, bitcasting to TargetVT to determine the scalar
24352 /// type.
static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
                                    SelectionDAG &DAG) {
24355   SDValue StoredVal = Store->getValue();
24356   assert(StoreVT.is128BitVector() &&
24357          StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op");
24358   StoredVal = DAG.getBitcast(StoreVT, StoredVal);
24359 
24360   // Splitting volatile memory ops is not allowed unless the operation was not
24361   // legal to begin with. We are assuming the input op is legal (this transform
24362   // is only used for targets with AVX).
24363   if (!Store->isSimple())
24364     return SDValue();
24365 
24366   MVT StoreSVT = StoreVT.getScalarType();
24367   unsigned NumElems = StoreVT.getVectorNumElements();
24368   unsigned ScalarSize = StoreSVT.getStoreSize();
24369 
24370   SDLoc DL(Store);
24371   SmallVector<SDValue, 4> Stores;
24372   for (unsigned i = 0; i != NumElems; ++i) {
24373     unsigned Offset = i * ScalarSize;
24374     SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(),
24375                                            TypeSize::Fixed(Offset), DL);
24376     SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal,
24377                               DAG.getIntPtrConstant(i, DL));
24378     SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr,
24379                               Store->getPointerInfo().getWithOffset(Offset),
24380                               Store->getOriginalAlign(),
24381                               Store->getMemOperand()->getFlags());
24382     Stores.push_back(Ch);
24383   }
24384   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
24385 }
24386 
static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
                          SelectionDAG &DAG) {
24389   StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
24390   SDLoc dl(St);
24391   SDValue StoredVal = St->getValue();
24392 
24393   // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
24394   if (StoredVal.getValueType().isVector() &&
24395       StoredVal.getValueType().getVectorElementType() == MVT::i1) {
24396     unsigned NumElts = StoredVal.getValueType().getVectorNumElements();
24397     assert(NumElts <= 8 && "Unexpected VT");
24398     assert(!St->isTruncatingStore() && "Expected non-truncating store");
24399     assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
24400            "Expected AVX512F without AVX512DQI");
24401 
24402     // We must pad with zeros to ensure we store zeroes to any unused bits.
24403     StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
24404                             DAG.getUNDEF(MVT::v16i1), StoredVal,
24405                             DAG.getIntPtrConstant(0, dl));
24406     StoredVal = DAG.getBitcast(MVT::i16, StoredVal);
24407     StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal);
24408     // Make sure we store zeros in the extra bits.
24409     if (NumElts < 8)
24410       StoredVal = DAG.getZeroExtendInReg(
24411           StoredVal, dl, EVT::getIntegerVT(*DAG.getContext(), NumElts));
24412 
24413     return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
24414                         St->getPointerInfo(), St->getOriginalAlign(),
24415                         St->getMemOperand()->getFlags());
24416   }
24417 
24418   if (St->isTruncatingStore())
24419     return SDValue();
24420 
  // If this is a 256-bit store of concatenated ops, we are better off splitting
  // that store into two 128-bit stores. This avoids spurious use of 256-bit
  // ops, and each half can execute independently. Some cores would split the op
  // into halves anyway, so the concat (vinsertf128) is purely an extra op.
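  // For example, a store of (concat_vectors A, B) becomes two 128-bit stores
  // at offsets 0 and 16, instead of a vinsertf128 followed by a single 256-bit
  // store (illustrative).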
24425   MVT StoreVT = StoredVal.getSimpleValueType();
24426   if (StoreVT.is256BitVector() ||
24427       ((StoreVT == MVT::v32i16 || StoreVT == MVT::v64i8) &&
24428        !Subtarget.hasBWI())) {
24429     SmallVector<SDValue, 4> CatOps;
24430     if (StoredVal.hasOneUse() && collectConcatOps(StoredVal.getNode(), CatOps))
24431       return splitVectorStore(St, DAG);
24432     return SDValue();
24433   }
24434 
24435   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24436   assert(StoreVT.isVector() && StoreVT.getSizeInBits() == 64 &&
24437          "Unexpected VT");
24438   assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) ==
24439              TargetLowering::TypeWidenVector && "Unexpected type action!");
24440 
24441   EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT);
24442   StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
24443                           DAG.getUNDEF(StoreVT));
24444 
24445   if (Subtarget.hasSSE2()) {
24446     // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
24447     // and store it.
24448     MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
24449     MVT CastVT = MVT::getVectorVT(StVT, 2);
24450     StoredVal = DAG.getBitcast(CastVT, StoredVal);
24451     StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
24452                             DAG.getIntPtrConstant(0, dl));
24453 
24454     return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
24455                         St->getPointerInfo(), St->getOriginalAlign(),
24456                         St->getMemOperand()->getFlags());
24457   }
24458   assert(Subtarget.hasSSE1() && "Expected SSE");
24459   SDVTList Tys = DAG.getVTList(MVT::Other);
24460   SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()};
24461   return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64,
24462                                  St->getMemOperand());
24463 }
24464 
// Lower vector extended loads using a shuffle. If SSSE3 is not available we
// may emit an illegal shuffle but the expansion is still better than scalar
// code. We generate sext/sext_invec for SEXTLOADs if it's available, otherwise
// we'll emit a shuffle and an arithmetic shift.
// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
// TODO: It is possible to support ZExt by zeroing the undef values during
// the shuffle phase or after the shuffle.
static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
                         SelectionDAG &DAG) {
24474   MVT RegVT = Op.getSimpleValueType();
24475   assert(RegVT.isVector() && "We only custom lower vector loads.");
24476   assert(RegVT.isInteger() &&
24477          "We only custom lower integer vector loads.");
24478 
24479   LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
24480   SDLoc dl(Ld);
24481 
24482   // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
24483   if (RegVT.getVectorElementType() == MVT::i1) {
24484     assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load");
24485     assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
24486     assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
24487            "Expected AVX512F without AVX512DQI");
24488 
24489     SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
24490                                 Ld->getPointerInfo(), Ld->getOriginalAlign(),
24491                                 Ld->getMemOperand()->getFlags());
24492 
24493     // Replace chain users with the new chain.
24494     assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
24495 
24496     SDValue Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, NewLd);
24497     Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
24498                       DAG.getBitcast(MVT::v16i1, Val),
24499                       DAG.getIntPtrConstant(0, dl));
24500     return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl);
24501   }
24502 
24503   return SDValue();
24504 }
24505 
24506 /// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
24507 /// each of which has no other use apart from the AND / OR.
static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
24509   Opc = Op.getOpcode();
24510   if (Opc != ISD::OR && Opc != ISD::AND)
24511     return false;
24512   return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
24513           Op.getOperand(0).hasOneUse() &&
24514           Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
24515           Op.getOperand(1).hasOneUse());
24516 }
24517 
SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
24519   SDValue Chain = Op.getOperand(0);
24520   SDValue Cond  = Op.getOperand(1);
24521   SDValue Dest  = Op.getOperand(2);
24522   SDLoc dl(Op);
24523 
24524   if (Cond.getOpcode() == ISD::SETCC &&
24525       Cond.getOperand(0).getValueType() != MVT::f128) {
24526     SDValue LHS = Cond.getOperand(0);
24527     SDValue RHS = Cond.getOperand(1);
24528     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
24529 
24530     // Special case for
24531     // setcc([su]{add,sub,mul}o == 0)
24532     // setcc([su]{add,sub,mul}o != 1)
24533     if (ISD::isOverflowIntrOpRes(LHS) &&
24534         (CC == ISD::SETEQ || CC == ISD::SETNE) &&
24535         (isNullConstant(RHS) || isOneConstant(RHS))) {
24536       SDValue Value, Overflow;
24537       X86::CondCode X86Cond;
24538       std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, LHS.getValue(0), DAG);
24539 
24540       if ((CC == ISD::SETEQ) == isNullConstant(RHS))
24541         X86Cond = X86::GetOppositeBranchCondition(X86Cond);
24542 
24543       SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24544       return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24545                          Overflow);
24546     }
24547 
24548     if (LHS.getSimpleValueType().isInteger()) {
24549       SDValue CCVal;
24550       SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, CC, SDLoc(Cond), DAG, CCVal);
24551       return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24552                          EFLAGS);
24553     }
24554 
24555     if (CC == ISD::SETOEQ) {
24556       // For FCMP_OEQ, we can emit
24557       // two branches instead of an explicit AND instruction with a
24558       // separate test. However, we only do this if this block doesn't
24559       // have a fall-through edge, because this requires an explicit
24560       // jmp when the condition is false.
24561       if (Op.getNode()->hasOneUse()) {
24562         SDNode *User = *Op.getNode()->use_begin();
24563         // Look for an unconditional branch following this conditional branch.
24564         // We need this because we need to reverse the successors in order
24565         // to implement FCMP_OEQ.
24566         if (User->getOpcode() == ISD::BR) {
24567           SDValue FalseBB = User->getOperand(1);
24568           SDNode *NewBR =
24569             DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
24570           assert(NewBR == User);
24571           (void)NewBR;
24572           Dest = FalseBB;
24573 
24574           SDValue Cmp =
24575               DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
24576           SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
24577           Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest,
24578                               CCVal, Cmp);
24579           CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
24580           return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24581                              Cmp);
24582         }
24583       }
24584     } else if (CC == ISD::SETUNE) {
24585       // For FCMP_UNE, we can emit
24586       // two branches instead of an explicit OR instruction with a
24587       // separate test.
24588       SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
24589       SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
24590       Chain =
24591           DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, Cmp);
24592       CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
24593       return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24594                          Cmp);
24595     } else {
24596       X86::CondCode X86Cond =
24597           TranslateX86CC(CC, dl, /*IsFP*/ true, LHS, RHS, DAG);
24598       SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
24599       SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24600       return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24601                          Cmp);
24602     }
24603   }
24604 
24605   if (ISD::isOverflowIntrOpRes(Cond)) {
24606     SDValue Value, Overflow;
24607     X86::CondCode X86Cond;
24608     std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
24609 
24610     SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24611     return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24612                        Overflow);
24613   }
24614 
24615   // Look past the truncate if the high bits are known zero.
24616   if (isTruncWithZeroHighBitsInput(Cond, DAG))
24617     Cond = Cond.getOperand(0);
24618 
24619   EVT CondVT = Cond.getValueType();
24620 
24621   // Add an AND with 1 if we don't already have one.
24622   if (!(Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1))))
24623     Cond =
24624         DAG.getNode(ISD::AND, dl, CondVT, Cond, DAG.getConstant(1, dl, CondVT));
24625 
24626   SDValue LHS = Cond;
24627   SDValue RHS = DAG.getConstant(0, dl, CondVT);
24628 
24629   SDValue CCVal;
24630   SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, ISD::SETNE, dl, DAG, CCVal);
24631   return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24632                      EFLAGS);
24633 }
24634 
// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
// Calls to _alloca are needed to probe the stack when allocating more than 4k
// bytes in one go. Touching the stack at 4K increments is necessary to ensure
// that the guard pages used by the OS virtual memory manager are allocated in
// correct sequence.
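// For example, a single 12KB allocation could otherwise skip over a guard
// page entirely, so the probe helper touches each intervening 4K page to make
// the OS commit the stack pages in order (illustrative).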
SDValue
X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
                                           SelectionDAG &DAG) const {
24643   MachineFunction &MF = DAG.getMachineFunction();
24644   bool SplitStack = MF.shouldSplitStack();
24645   bool EmitStackProbeCall = hasStackProbeSymbol(MF);
24646   bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
24647                SplitStack || EmitStackProbeCall;
24648   SDLoc dl(Op);
24649 
24650   // Get the inputs.
24651   SDNode *Node = Op.getNode();
24652   SDValue Chain = Op.getOperand(0);
24653   SDValue Size  = Op.getOperand(1);
24654   MaybeAlign Alignment(Op.getConstantOperandVal(2));
24655   EVT VT = Node->getValueType(0);
24656 
24657   // Chain the dynamic stack allocation so that it doesn't modify the stack
24658   // pointer when other instructions are using the stack.
24659   Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
24660 
24661   bool Is64Bit = Subtarget.is64Bit();
24662   MVT SPTy = getPointerTy(DAG.getDataLayout());
24663 
24664   SDValue Result;
24665   if (!Lower) {
24666     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24667     Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
24668     assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
24669                     " not tell us which reg is the stack pointer!");
24670 
24671     const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
24672     const Align StackAlign = TFI.getStackAlign();
24673     if (hasInlineStackProbe(MF)) {
24674       MachineRegisterInfo &MRI = MF.getRegInfo();
24675 
24676       const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
24677       Register Vreg = MRI.createVirtualRegister(AddrRegClass);
24678       Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
24679       Result = DAG.getNode(X86ISD::PROBED_ALLOCA, dl, SPTy, Chain,
24680                            DAG.getRegister(Vreg, SPTy));
24681     } else {
24682       SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
24683       Chain = SP.getValue(1);
24684       Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
24685     }
24686     if (Alignment && *Alignment > StackAlign)
24687       Result =
24688           DAG.getNode(ISD::AND, dl, VT, Result,
24689                       DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
24690     Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
24691   } else if (SplitStack) {
24692     MachineRegisterInfo &MRI = MF.getRegInfo();
24693 
24694     if (Is64Bit) {
      // The 64-bit implementation of segmented stacks needs to clobber both
      // r10 and r11. This makes it impossible to use it along with nested
      // parameters.
24697       const Function &F = MF.getFunction();
24698       for (const auto &A : F.args()) {
24699         if (A.hasNestAttr())
24700           report_fatal_error("Cannot use segmented stacks with functions that "
24701                              "have nested arguments.");
24702       }
24703     }
24704 
24705     const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
24706     Register Vreg = MRI.createVirtualRegister(AddrRegClass);
24707     Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
24708     Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
24709                                 DAG.getRegister(Vreg, SPTy));
24710   } else {
24711     SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
24712     Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size);
24713     MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);
24714 
24715     const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
24716     Register SPReg = RegInfo->getStackRegister();
24717     SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
24718     Chain = SP.getValue(1);
24719 
24720     if (Alignment) {
24721       SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
24722                        DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
24723       Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
24724     }
24725 
24726     Result = SP;
24727   }
24728 
24729   Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
24730                              DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
24731 
24732   SDValue Ops[2] = {Result, Chain};
24733   return DAG.getMergeValues(Ops, dl);
24734 }
24735 
SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
24737   MachineFunction &MF = DAG.getMachineFunction();
24738   auto PtrVT = getPointerTy(MF.getDataLayout());
24739   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
24740 
24741   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
24742   SDLoc DL(Op);
24743 
24744   if (!Subtarget.is64Bit() ||
24745       Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
24746     // vastart just stores the address of the VarArgsFrameIndex slot into the
24747     // memory location argument.
24748     SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
24749     return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
24750                         MachinePointerInfo(SV));
24751   }
24752 
  // __va_list_tag:
  //   gp_offset         (0 - 6 * 8)
  //   fp_offset         (48 - 48 + 8 * 16)
  //   overflow_arg_area (point to parameters coming in memory).
  //   reg_save_area
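  // Roughly the SysV AMD64 va_list layout (LP64; x32 uses 4-byte pointers, so
  // reg_save_area lands at offset 12 instead):
  //   struct __va_list_tag {
  //     unsigned gp_offset;          // byte offset 0
  //     unsigned fp_offset;          // byte offset 4
  //     void    *overflow_arg_area;  // byte offset 8
  //     void    *reg_save_area;      // byte offset 16
  //   };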
24758   SmallVector<SDValue, 8> MemOps;
24759   SDValue FIN = Op.getOperand(1);
24760   // Store gp_offset
24761   SDValue Store = DAG.getStore(
24762       Op.getOperand(0), DL,
24763       DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
24764       MachinePointerInfo(SV));
24765   MemOps.push_back(Store);
24766 
24767   // Store fp_offset
24768   FIN = DAG.getMemBasePlusOffset(FIN, TypeSize::Fixed(4), DL);
24769   Store = DAG.getStore(
24770       Op.getOperand(0), DL,
24771       DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
24772       MachinePointerInfo(SV, 4));
24773   MemOps.push_back(Store);
24774 
24775   // Store ptr to overflow_arg_area
24776   FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
24777   SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
24778   Store =
24779       DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
24780   MemOps.push_back(Store);
24781 
24782   // Store ptr to reg_save_area.
24783   FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
24784       Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
24785   SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
24786   Store = DAG.getStore(
24787       Op.getOperand(0), DL, RSFIN, FIN,
24788       MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
24789   MemOps.push_back(Store);
24790   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
24791 }
24792 
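/// Lower ISD::VAARG for 64-bit targets. Win64 va_lists are plain pointers and
/// take the generic expansion; for the SysV ABI we emit an X86ISD::VAARG_64 or
/// VAARG_X32 node that computes the argument address and then load from it.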
24793 SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
24794   assert(Subtarget.is64Bit() &&
24795          "LowerVAARG only handles 64-bit va_arg!");
24796   assert(Op.getNumOperands() == 4);
24797 
24798   MachineFunction &MF = DAG.getMachineFunction();
24799   if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
24800     // The Win64 ABI uses char* instead of a structure.
24801     return DAG.expandVAArg(Op.getNode());
24802 
24803   SDValue Chain = Op.getOperand(0);
24804   SDValue SrcPtr = Op.getOperand(1);
24805   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
24806   unsigned Align = Op.getConstantOperandVal(3);
24807   SDLoc dl(Op);
24808 
24809   EVT ArgVT = Op.getNode()->getValueType(0);
24810   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
24811   uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
24812   uint8_t ArgMode;
24813 
24814   // Decide which area this value should be read from.
24815   // TODO: Implement the AMD64 ABI in its entirety. This simple
24816   // selection mechanism works only for the basic types.
24817   assert(ArgVT != MVT::f80 && "va_arg for f80 not yet implemented");
24818   if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
24819     ArgMode = 2;  // Argument passed in XMM register. Use fp_offset.
24820   } else {
24821     assert(ArgVT.isInteger() && ArgSize <= 32 /*bytes*/ &&
24822            "Unhandled argument type in LowerVAARG");
24823     ArgMode = 1;  // Argument passed in GPR64 register(s). Use gp_offset.
24824   }
24825 
24826   if (ArgMode == 2) {
24827     // Sanity Check: Make sure using fp_offset makes sense.
24828     assert(!Subtarget.useSoftFloat() &&
24829            !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
24830            Subtarget.hasSSE1());
24831   }
24832 
24833   // Insert VAARG node into the DAG
24834   // VAARG returns two values: Variable Argument Address, Chain
24835   SDValue InstOps[] = {Chain, SrcPtr,
24836                        DAG.getTargetConstant(ArgSize, dl, MVT::i32),
24837                        DAG.getTargetConstant(ArgMode, dl, MVT::i8),
24838                        DAG.getTargetConstant(Align, dl, MVT::i32)};
24839   SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
24840   SDValue VAARG = DAG.getMemIntrinsicNode(
24841       Subtarget.isTarget64BitLP64() ? X86ISD::VAARG_64 : X86ISD::VAARG_X32, dl,
24842       VTs, InstOps, MVT::i64, MachinePointerInfo(SV),
24843       /*Alignment=*/None,
24844       MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
24845   Chain = VAARG.getValue(1);
24846 
24847   // Load the next argument and return it
24848   return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
24849 }
24850 
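/// Lower ISD::VACOPY. On the SysV x86-64 ABI a va_list is the 24-byte (LP64)
/// or 16-byte (X32) __va_list_tag object, so va_copy is just a memcpy; Win64
/// va_lists are plain pointers and take the generic expansion.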
24851 static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
24852                            SelectionDAG &DAG) {
24853   // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
24854   // where a va_list is still an i8*.
24855   assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
24856   if (Subtarget.isCallingConvWin64(
24857         DAG.getMachineFunction().getFunction().getCallingConv()))
24858     // A Win64 va_list is just a char*, so use the generic expansion.
24859     return DAG.expandVACopy(Op.getNode());
24860 
24861   SDValue Chain = Op.getOperand(0);
24862   SDValue DstPtr = Op.getOperand(1);
24863   SDValue SrcPtr = Op.getOperand(2);
24864   const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
24865   const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
24866   SDLoc DL(Op);
24867 
24868   return DAG.getMemcpy(
24869       Chain, DL, DstPtr, SrcPtr,
24870       DAG.getIntPtrConstant(Subtarget.isTarget64BitLP64() ? 24 : 16, DL),
24871       Align(Subtarget.isTarget64BitLP64() ? 8 : 4), /*isVolatile*/ false, false,
24872       false, MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
24873 }
24874 
24875 // Helper to get immediate/variable SSE shift opcode from other shift opcodes.
24876 static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable) {
24877   switch (Opc) {
24878   case ISD::SHL:
24879   case X86ISD::VSHL:
24880   case X86ISD::VSHLI:
24881     return IsVariable ? X86ISD::VSHL : X86ISD::VSHLI;
24882   case ISD::SRL:
24883   case X86ISD::VSRL:
24884   case X86ISD::VSRLI:
24885     return IsVariable ? X86ISD::VSRL : X86ISD::VSRLI;
24886   case ISD::SRA:
24887   case X86ISD::VSRA:
24888   case X86ISD::VSRAI:
24889     return IsVariable ? X86ISD::VSRA : X86ISD::VSRAI;
24890   }
24891   llvm_unreachable("Unknown target vector shift node");
24892 }
24893 
24894 /// Handle vector element shifts where the shift amount is a constant.
24895 /// Takes immediate version of shift as input.
24896 static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
24897                                           SDValue SrcOp, uint64_t ShiftAmt,
24898                                           SelectionDAG &DAG) {
24899   MVT ElementType = VT.getVectorElementType();
24900 
24901   // Bitcast the source vector to the output type; this is mainly necessary for
24902   // vXi8/vXi64 shifts.
24903   if (VT != SrcOp.getSimpleValueType())
24904     SrcOp = DAG.getBitcast(VT, SrcOp);
24905 
24906   // Fold this packed shift into its first operand if ShiftAmt is 0.
24907   if (ShiftAmt == 0)
24908     return SrcOp;
24909 
24910   // Check for ShiftAmt >= element width
24911   if (ShiftAmt >= ElementType.getSizeInBits()) {
24912     if (Opc == X86ISD::VSRAI)
24913       ShiftAmt = ElementType.getSizeInBits() - 1;
24914     else
24915       return DAG.getConstant(0, dl, VT);
24916   }
24917 
24918   assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
24919          && "Unknown target vector shift-by-constant node");
24920 
24921   // Fold this packed vector shift into a build vector if SrcOp is a
24922   // vector of Constants or UNDEFs.
24923   if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
24924     SmallVector<SDValue, 8> Elts;
24925     unsigned NumElts = SrcOp->getNumOperands();
24926 
24927     switch (Opc) {
24928     default: llvm_unreachable("Unknown opcode!");
24929     case X86ISD::VSHLI:
24930       for (unsigned i = 0; i != NumElts; ++i) {
24931         SDValue CurrentOp = SrcOp->getOperand(i);
24932         if (CurrentOp->isUndef()) {
24933           // Must produce 0s in the correct bits.
24934           Elts.push_back(DAG.getConstant(0, dl, ElementType));
24935           continue;
24936         }
24937         auto *ND = cast<ConstantSDNode>(CurrentOp);
24938         const APInt &C = ND->getAPIntValue();
24939         Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
24940       }
24941       break;
24942     case X86ISD::VSRLI:
24943       for (unsigned i = 0; i != NumElts; ++i) {
24944         SDValue CurrentOp = SrcOp->getOperand(i);
24945         if (CurrentOp->isUndef()) {
24946           // Must produce 0s in the correct bits.
24947           Elts.push_back(DAG.getConstant(0, dl, ElementType));
24948           continue;
24949         }
24950         auto *ND = cast<ConstantSDNode>(CurrentOp);
24951         const APInt &C = ND->getAPIntValue();
24952         Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
24953       }
24954       break;
24955     case X86ISD::VSRAI:
24956       for (unsigned i = 0; i != NumElts; ++i) {
24957         SDValue CurrentOp = SrcOp->getOperand(i);
24958         if (CurrentOp->isUndef()) {
24959           // All shifted in bits must be the same so use 0.
24960           Elts.push_back(DAG.getConstant(0, dl, ElementType));
24961           continue;
24962         }
24963         auto *ND = cast<ConstantSDNode>(CurrentOp);
24964         const APInt &C = ND->getAPIntValue();
24965         Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
24966       }
24967       break;
24968     }
24969 
24970     return DAG.getBuildVector(VT, dl, Elts);
24971   }
24972 
24973   return DAG.getNode(Opc, dl, VT, SrcOp,
24974                      DAG.getTargetConstant(ShiftAmt, dl, MVT::i8));
24975 }
24976 
24977 /// Handle vector element shifts where the shift amount may or may not be a
24978 /// constant. Takes immediate version of shift as input.
24979 static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
24980                                    SDValue SrcOp, SDValue ShAmt,
24981                                    const X86Subtarget &Subtarget,
24982                                    SelectionDAG &DAG) {
24983   MVT SVT = ShAmt.getSimpleValueType();
24984   assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
24985 
24986   // Catch shift-by-constant.
24987   if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
24988     return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
24989                                       CShAmt->getZExtValue(), DAG);
24990 
24991   // Change opcode to non-immediate version.
24992   Opc = getTargetVShiftUniformOpcode(Opc, true);
24993 
24994   // Need to build a vector containing the shift amount.
24995   // SSE/AVX packed shifts only use the lower 64 bits of the shift count.
24996   // +====================+============+=======================================+
24997   // | ShAmt is           | HasSSE4.1? | Construct ShAmt vector as             |
24998   // +====================+============+=======================================+
24999   // | i64                | Yes, No    | Use ShAmt as lowest elt               |
25000   // | i32                | Yes        | zero-extend in-reg                    |
25001   // | (i32 zext(i16/i8)) | Yes        | zero-extend in-reg                    |
25002   // | (i32 zext(i16/i8)) | No         | byte-shift-in-reg                     |
25003   // | i16/i32            | No         | v4i32 build_vector(ShAmt, 0, ud, ud)  |
25004   // +====================+============+=======================================+
25005 
25006   if (SVT == MVT::i64)
25007     ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt);
25008   else if (ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
25009            ShAmt.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
25010            (ShAmt.getOperand(0).getSimpleValueType() == MVT::i16 ||
25011             ShAmt.getOperand(0).getSimpleValueType() == MVT::i8)) {
25012     ShAmt = ShAmt.getOperand(0);
25013     MVT AmtTy = ShAmt.getSimpleValueType() == MVT::i8 ? MVT::v16i8 : MVT::v8i16;
25014     ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), AmtTy, ShAmt);
25015     if (Subtarget.hasSSE41())
25016       ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
25017                           MVT::v2i64, ShAmt);
25018     else {
25019       SDValue ByteShift = DAG.getTargetConstant(
25020           (128 - AmtTy.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
25021       ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt);
25022       ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
25023                           ByteShift);
25024       ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
25025                           ByteShift);
25026     }
25027   } else if (Subtarget.hasSSE41() &&
25028              ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
25029     ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);
25030     ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
25031                         MVT::v2i64, ShAmt);
25032   } else {
25033     SDValue ShOps[4] = {ShAmt, DAG.getConstant(0, dl, SVT), DAG.getUNDEF(SVT),
25034                         DAG.getUNDEF(SVT)};
25035     ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps);
25036   }
25037 
25038   // The return type has to be a 128-bit type with the same element
25039   // type as the input type.
25040   MVT EltVT = VT.getVectorElementType();
25041   MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());
25042 
25043   ShAmt = DAG.getBitcast(ShVT, ShAmt);
25044   return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
25045 }
25046 
25047 /// Return Mask with the necessary casting or extending
25048 /// for \p Mask according to \p MaskVT when lowering masking intrinsics
25049 static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
25050                            const X86Subtarget &Subtarget, SelectionDAG &DAG,
25051                            const SDLoc &dl) {
25052 
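  // Fast paths for constant masks: all-ones selects every element and zero
  // selects none, so no bitcasting is needed.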
25053   if (isAllOnesConstant(Mask))
25054     return DAG.getConstant(1, dl, MaskVT);
25055   if (X86::isZeroNode(Mask))
25056     return DAG.getConstant(0, dl, MaskVT);
25057 
25058   assert(MaskVT.bitsLE(Mask.getSimpleValueType()) && "Unexpected mask size!");
25059 
25060   if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
25061     assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
25062     assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
25063     // In 32-bit mode, bitcasting i64 is illegal, so split the mask into i32 halves.
25064     SDValue Lo, Hi;
25065     Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
25066                         DAG.getConstant(0, dl, MVT::i32));
25067     Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
25068                         DAG.getConstant(1, dl, MVT::i32));
25069 
25070     Lo = DAG.getBitcast(MVT::v32i1, Lo);
25071     Hi = DAG.getBitcast(MVT::v32i1, Hi);
25072 
25073     return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
25074   } else {
25075     MVT BitcastVT = MVT::getVectorVT(MVT::i1,
25076                                      Mask.getSimpleValueType().getSizeInBits());
25077     // When MaskVT is v2i1 or v4i1, the low 2 or 4 elements are extracted
25078     // by EXTRACT_SUBVECTOR.
25079     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
25080                        DAG.getBitcast(BitcastVT, Mask),
25081                        DAG.getIntPtrConstant(0, dl));
25082   }
25083 }
25084 
25085 /// Return (and \p Op, \p Mask) for compare instructions or
25086 /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
25087 /// necessary casting or extending for \p Mask when lowering masking intrinsics
25088 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
25089                   SDValue PreservedSrc,
25090                   const X86Subtarget &Subtarget,
25091                   SelectionDAG &DAG) {
25092   MVT VT = Op.getSimpleValueType();
25093   MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
25094   unsigned OpcodeSelect = ISD::VSELECT;
25095   SDLoc dl(Op);
25096 
25097   if (isAllOnesConstant(Mask))
25098     return Op;
25099 
25100   SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
25101 
25102   if (PreservedSrc.isUndef())
25103     PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
25104   return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
25105 }
25106 
25107 /// Creates an SDNode for a predicated scalar operation.
25108 /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
25109 /// The mask arrives as MVT::i8 and must be converted
25110 /// to MVT::v1i1 while lowering masking intrinsics.
25111 /// The main difference between ScalarMaskingNode and VectorMaskingNode is using
25112 /// "X86select" instead of "vselect". We just can't create the "vselect" node
25113 /// for a scalar instruction.
25114 static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
25115                                     SDValue PreservedSrc,
25116                                     const X86Subtarget &Subtarget,
25117                                     SelectionDAG &DAG) {
25118 
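  // If bit 0 of a constant mask is set, the operation is unconditional and no
  // select is needed.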
25119   if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
25120     if (MaskConst->getZExtValue() & 0x1)
25121       return Op;
25122 
25123   MVT VT = Op.getSimpleValueType();
25124   SDLoc dl(Op);
25125 
25126   assert(Mask.getValueType() == MVT::i8 && "Unexpected type");
25127   SDValue IMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i1,
25128                               DAG.getBitcast(MVT::v8i1, Mask),
25129                               DAG.getIntPtrConstant(0, dl));
25130   if (Op.getOpcode() == X86ISD::FSETCCM ||
25131       Op.getOpcode() == X86ISD::FSETCCM_SAE ||
25132       Op.getOpcode() == X86ISD::VFPCLASSS)
25133     return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
25134 
25135   if (PreservedSrc.isUndef())
25136     PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
25137   return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
25138 }
25139 
25140 static int getSEHRegistrationNodeSize(const Function *Fn) {
25141   if (!Fn->hasPersonalityFn())
25142     report_fatal_error(
25143         "querying registration node size for function without personality");
25144   // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
25145   // WinEHStatePass for the full struct definition.
25146   switch (classifyEHPersonality(Fn->getPersonalityFn())) {
25147   case EHPersonality::MSVC_X86SEH: return 24;
25148   case EHPersonality::MSVC_CXX: return 16;
25149   default: break;
25150   }
25151   report_fatal_error(
25152       "can only recover FP for 32-bit MSVC EH personality functions");
25153 }
25154 
25155 /// When the MSVC runtime transfers control to us, either to an outlined
25156 /// function or when returning to a parent frame after catching an exception, we
25157 /// recover the parent frame pointer by doing arithmetic on the incoming EBP.
25158 /// Here's the math:
25159 ///   RegNodeBase = EntryEBP - RegNodeSize
25160 ///   ParentFP = RegNodeBase - ParentFrameOffset
25161 /// Subtracting RegNodeSize takes us to the offset of the registration node, and
25162 /// subtracting the offset (negative on x86) takes us back to the parent FP.
25163 static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
25164                                    SDValue EntryEBP) {
25165   MachineFunction &MF = DAG.getMachineFunction();
25166   SDLoc dl;
25167 
25168   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25169   MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
25170 
25171   // It's possible that the parent function no longer has a personality function
25172   // if the exceptional code was optimized away, in which case we just return
25173   // the incoming EBP.
25174   if (!Fn->hasPersonalityFn())
25175     return EntryEBP;
25176 
25177   // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
25178   // registration, or the .set_setframe offset.
25179   MCSymbol *OffsetSym =
25180       MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
25181           GlobalValue::dropLLVMManglingEscape(Fn->getName()));
25182   SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
25183   SDValue ParentFrameOffset =
25184       DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
25185 
25186   // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
25187   // prologue to RBP in the parent function.
25188   const X86Subtarget &Subtarget =
25189       static_cast<const X86Subtarget &>(DAG.getSubtarget());
25190   if (Subtarget.is64Bit())
25191     return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
25192 
25193   int RegNodeSize = getSEHRegistrationNodeSize(Fn);
25194   // RegNodeBase = EntryEBP - RegNodeSize
25195   // ParentFP = RegNodeBase - ParentFrameOffset
25196   SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
25197                                     DAG.getConstant(RegNodeSize, dl, PtrVT));
25198   return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
25199 }
25200 
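/// Lower chain-free X86 intrinsics. Intrinsics with an IntrinsicData entry are
/// mapped directly onto target nodes, adding AVX-512 rounding-mode/SAE
/// operands and mask predication where present; the remaining intrinsics are
/// handled case by case below.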
25201 SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
25202                                                    SelectionDAG &DAG) const {
25203   // Helper to detect if the operand is CUR_DIRECTION rounding mode.
25204   auto isRoundModeCurDirection = [](SDValue Rnd) {
25205     if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
25206       return C->getAPIntValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;
25207 
25208     return false;
25209   };
25210   auto isRoundModeSAE = [](SDValue Rnd) {
25211     if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
25212       unsigned RC = C->getZExtValue();
25213       if (RC & X86::STATIC_ROUNDING::NO_EXC) {
25214         // Clear the NO_EXC bit and check remaining bits.
25215         RC ^= X86::STATIC_ROUNDING::NO_EXC;
25216         // As a convenience, allow either no other bits set or an explicit
25217         // CUR_DIRECTION.
25218         return RC == 0 || RC == X86::STATIC_ROUNDING::CUR_DIRECTION;
25219       }
25220     }
25221 
25222     return false;
25223   };
25224   auto isRoundModeSAEToX = [](SDValue Rnd, unsigned &RC) {
25225     if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
25226       RC = C->getZExtValue();
25227       if (RC & X86::STATIC_ROUNDING::NO_EXC) {
25228         // Clear the NO_EXC bit and check remaining bits.
25229         RC ^= X86::STATIC_ROUNDING::NO_EXC;
25230         return RC == X86::STATIC_ROUNDING::TO_NEAREST_INT ||
25231                RC == X86::STATIC_ROUNDING::TO_NEG_INF ||
25232                RC == X86::STATIC_ROUNDING::TO_POS_INF ||
25233                RC == X86::STATIC_ROUNDING::TO_ZERO;
25234       }
25235     }
25236 
25237     return false;
25238   };
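  // Note: NO_EXC (suppress-all-exceptions) is a separate flag bit in the
  // STATIC_ROUNDING immediate; the helpers above strip it before comparing the
  // remaining bits against the rounding-direction values.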
25239 
25240   SDLoc dl(Op);
25241   unsigned IntNo = Op.getConstantOperandVal(0);
25242   MVT VT = Op.getSimpleValueType();
25243   const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
25244 
25245   // Propagate flags from original node to transformed node(s).
25246   SelectionDAG::FlagInserter FlagsInserter(DAG, Op->getFlags());
25247 
25248   if (IntrData) {
25249     switch(IntrData->Type) {
25250     case INTR_TYPE_1OP: {
25251       // We specify 2 possible opcodes for intrinsics with rounding modes.
25252       // First, we check if the intrinsic may have non-default rounding mode,
25253       // (IntrData->Opc1 != 0), then we check the rounding mode operand.
25254       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25255       if (IntrWithRoundingModeOpcode != 0) {
25256         SDValue Rnd = Op.getOperand(2);
25257         unsigned RC = 0;
25258         if (isRoundModeSAEToX(Rnd, RC))
25259           return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
25260                              Op.getOperand(1),
25261                              DAG.getTargetConstant(RC, dl, MVT::i32));
25262         if (!isRoundModeCurDirection(Rnd))
25263           return SDValue();
25264       }
25265       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25266                          Op.getOperand(1));
25267     }
25268     case INTR_TYPE_1OP_SAE: {
25269       SDValue Sae = Op.getOperand(2);
25270 
25271       unsigned Opc;
25272       if (isRoundModeCurDirection(Sae))
25273         Opc = IntrData->Opc0;
25274       else if (isRoundModeSAE(Sae))
25275         Opc = IntrData->Opc1;
25276       else
25277         return SDValue();
25278 
25279       return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1));
25280     }
25281     case INTR_TYPE_2OP: {
25282       SDValue Src2 = Op.getOperand(2);
25283 
25284       // We specify 2 possible opcodes for intrinsics with rounding modes.
25285       // First, we check if the intrinsic may have non-default rounding mode,
25286       // (IntrData->Opc1 != 0), then we check the rounding mode operand.
25287       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25288       if (IntrWithRoundingModeOpcode != 0) {
25289         SDValue Rnd = Op.getOperand(3);
25290         unsigned RC = 0;
25291         if (isRoundModeSAEToX(Rnd, RC))
25292           return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
25293                              Op.getOperand(1), Src2,
25294                              DAG.getTargetConstant(RC, dl, MVT::i32));
25295         if (!isRoundModeCurDirection(Rnd))
25296           return SDValue();
25297       }
25298 
25299       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25300                          Op.getOperand(1), Src2);
25301     }
25302     case INTR_TYPE_2OP_SAE: {
25303       SDValue Sae = Op.getOperand(3);
25304 
25305       unsigned Opc;
25306       if (isRoundModeCurDirection(Sae))
25307         Opc = IntrData->Opc0;
25308       else if (isRoundModeSAE(Sae))
25309         Opc = IntrData->Opc1;
25310       else
25311         return SDValue();
25312 
25313       return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
25314                          Op.getOperand(2));
25315     }
25316     case INTR_TYPE_3OP:
25317     case INTR_TYPE_3OP_IMM8: {
25318       SDValue Src1 = Op.getOperand(1);
25319       SDValue Src2 = Op.getOperand(2);
25320       SDValue Src3 = Op.getOperand(3);
25321 
25322       if (IntrData->Type == INTR_TYPE_3OP_IMM8 &&
25323           Src3.getValueType() != MVT::i8) {
25324         Src3 = DAG.getTargetConstant(
25325             cast<ConstantSDNode>(Src3)->getZExtValue() & 0xff, dl, MVT::i8);
25326       }
25327 
25328       // We specify 2 possible opcodes for intrinsics with rounding modes.
25329       // First, we check if the intrinsic may have non-default rounding mode,
25330       // (IntrData->Opc1 != 0), then we check the rounding mode operand.
25331       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25332       if (IntrWithRoundingModeOpcode != 0) {
25333         SDValue Rnd = Op.getOperand(4);
25334         unsigned RC = 0;
25335         if (isRoundModeSAEToX(Rnd, RC))
25336           return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
25337                              Src1, Src2, Src3,
25338                              DAG.getTargetConstant(RC, dl, MVT::i32));
25339         if (!isRoundModeCurDirection(Rnd))
25340           return SDValue();
25341       }
25342 
25343       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25344                          {Src1, Src2, Src3});
25345     }
25346     case INTR_TYPE_4OP_IMM8: {
25347       assert(Op.getOperand(4)->getOpcode() == ISD::TargetConstant);
25348       SDValue Src4 = Op.getOperand(4);
25349       if (Src4.getValueType() != MVT::i8) {
25350         Src4 = DAG.getTargetConstant(
25351             cast<ConstantSDNode>(Src4)->getZExtValue() & 0xff, dl, MVT::i8);
25352       }
25353 
25354       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25355                          Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
25356                          Src4);
25357     }
25358     case INTR_TYPE_1OP_MASK: {
25359       SDValue Src = Op.getOperand(1);
25360       SDValue PassThru = Op.getOperand(2);
25361       SDValue Mask = Op.getOperand(3);
25362       // We add the rounding mode to the node when
25363       //   - an RC opcode is specified and
25364       //   - RC is not "current direction".
25365       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25366       if (IntrWithRoundingModeOpcode != 0) {
25367         SDValue Rnd = Op.getOperand(4);
25368         unsigned RC = 0;
25369         if (isRoundModeSAEToX(Rnd, RC))
25370           return getVectorMaskingNode(
25371               DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
25372                           Src, DAG.getTargetConstant(RC, dl, MVT::i32)),
25373               Mask, PassThru, Subtarget, DAG);
25374         if (!isRoundModeCurDirection(Rnd))
25375           return SDValue();
25376       }
25377       return getVectorMaskingNode(
25378           DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru,
25379           Subtarget, DAG);
25380     }
25381     case INTR_TYPE_1OP_MASK_SAE: {
25382       SDValue Src = Op.getOperand(1);
25383       SDValue PassThru = Op.getOperand(2);
25384       SDValue Mask = Op.getOperand(3);
25385       SDValue Rnd = Op.getOperand(4);
25386 
25387       unsigned Opc;
25388       if (isRoundModeCurDirection(Rnd))
25389         Opc = IntrData->Opc0;
25390       else if (isRoundModeSAE(Rnd))
25391         Opc = IntrData->Opc1;
25392       else
25393         return SDValue();
25394 
25395       return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), Mask, PassThru,
25396                                   Subtarget, DAG);
25397     }
25398     case INTR_TYPE_SCALAR_MASK: {
25399       SDValue Src1 = Op.getOperand(1);
25400       SDValue Src2 = Op.getOperand(2);
25401       SDValue passThru = Op.getOperand(3);
25402       SDValue Mask = Op.getOperand(4);
25403       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25404       // There are 2 kinds of intrinsics in this group:
25405       // (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
25406       // (2) With both rounding mode and sae - 7 operands.
25407       bool HasRounding = IntrWithRoundingModeOpcode != 0;
25408       if (Op.getNumOperands() == (5U + HasRounding)) {
25409         if (HasRounding) {
25410           SDValue Rnd = Op.getOperand(5);
25411           unsigned RC = 0;
25412           if (isRoundModeSAEToX(Rnd, RC))
25413             return getScalarMaskingNode(
25414                 DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2,
25415                             DAG.getTargetConstant(RC, dl, MVT::i32)),
25416                 Mask, passThru, Subtarget, DAG);
25417           if (!isRoundModeCurDirection(Rnd))
25418             return SDValue();
25419         }
25420         return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
25421                                                 Src2),
25422                                     Mask, passThru, Subtarget, DAG);
25423       }
25424 
25425       assert(Op.getNumOperands() == (6U + HasRounding) &&
25426              "Unexpected intrinsic form");
25427       SDValue RoundingMode = Op.getOperand(5);
25428       unsigned Opc = IntrData->Opc0;
25429       if (HasRounding) {
25430         SDValue Sae = Op.getOperand(6);
25431         if (isRoundModeSAE(Sae))
25432           Opc = IntrWithRoundingModeOpcode;
25433         else if (!isRoundModeCurDirection(Sae))
25434           return SDValue();
25435       }
25436       return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1,
25437                                               Src2, RoundingMode),
25438                                   Mask, passThru, Subtarget, DAG);
25439     }
25440     case INTR_TYPE_SCALAR_MASK_RND: {
25441       SDValue Src1 = Op.getOperand(1);
25442       SDValue Src2 = Op.getOperand(2);
25443       SDValue passThru = Op.getOperand(3);
25444       SDValue Mask = Op.getOperand(4);
25445       SDValue Rnd = Op.getOperand(5);
25446 
25447       SDValue NewOp;
25448       unsigned RC = 0;
25449       if (isRoundModeCurDirection(Rnd))
25450         NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
25451       else if (isRoundModeSAEToX(Rnd, RC))
25452         NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
25453                             DAG.getTargetConstant(RC, dl, MVT::i32));
25454       else
25455         return SDValue();
25456 
25457       return getScalarMaskingNode(NewOp, Mask, passThru, Subtarget, DAG);
25458     }
25459     case INTR_TYPE_SCALAR_MASK_SAE: {
25460       SDValue Src1 = Op.getOperand(1);
25461       SDValue Src2 = Op.getOperand(2);
25462       SDValue passThru = Op.getOperand(3);
25463       SDValue Mask = Op.getOperand(4);
25464       SDValue Sae = Op.getOperand(5);
25465       unsigned Opc;
25466       if (isRoundModeCurDirection(Sae))
25467         Opc = IntrData->Opc0;
25468       else if (isRoundModeSAE(Sae))
25469         Opc = IntrData->Opc1;
25470       else
25471         return SDValue();
25472 
25473       return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
25474                                   Mask, passThru, Subtarget, DAG);
25475     }
25476     case INTR_TYPE_2OP_MASK: {
25477       SDValue Src1 = Op.getOperand(1);
25478       SDValue Src2 = Op.getOperand(2);
25479       SDValue PassThru = Op.getOperand(3);
25480       SDValue Mask = Op.getOperand(4);
25481       SDValue NewOp;
25482       if (IntrData->Opc1 != 0) {
25483         SDValue Rnd = Op.getOperand(5);
25484         unsigned RC = 0;
25485         if (isRoundModeSAEToX(Rnd, RC))
25486           NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
25487                               DAG.getTargetConstant(RC, dl, MVT::i32));
25488         else if (!isRoundModeCurDirection(Rnd))
25489           return SDValue();
25490       }
25491       if (!NewOp)
25492         NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
25493       return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
25494     }
25495     case INTR_TYPE_2OP_MASK_SAE: {
25496       SDValue Src1 = Op.getOperand(1);
25497       SDValue Src2 = Op.getOperand(2);
25498       SDValue PassThru = Op.getOperand(3);
25499       SDValue Mask = Op.getOperand(4);
25500 
25501       unsigned Opc = IntrData->Opc0;
25502       if (IntrData->Opc1 != 0) {
25503         SDValue Sae = Op.getOperand(5);
25504         if (isRoundModeSAE(Sae))
25505           Opc = IntrData->Opc1;
25506         else if (!isRoundModeCurDirection(Sae))
25507           return SDValue();
25508       }
25509 
25510       return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
25511                                   Mask, PassThru, Subtarget, DAG);
25512     }
25513     case INTR_TYPE_3OP_SCALAR_MASK_SAE: {
25514       SDValue Src1 = Op.getOperand(1);
25515       SDValue Src2 = Op.getOperand(2);
25516       SDValue Src3 = Op.getOperand(3);
25517       SDValue PassThru = Op.getOperand(4);
25518       SDValue Mask = Op.getOperand(5);
25519       SDValue Sae = Op.getOperand(6);
25520       unsigned Opc;
25521       if (isRoundModeCurDirection(Sae))
25522         Opc = IntrData->Opc0;
25523       else if (isRoundModeSAE(Sae))
25524         Opc = IntrData->Opc1;
25525       else
25526         return SDValue();
25527 
25528       return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
25529                                   Mask, PassThru, Subtarget, DAG);
25530     }
25531     case INTR_TYPE_3OP_MASK_SAE: {
25532       SDValue Src1 = Op.getOperand(1);
25533       SDValue Src2 = Op.getOperand(2);
25534       SDValue Src3 = Op.getOperand(3);
25535       SDValue PassThru = Op.getOperand(4);
25536       SDValue Mask = Op.getOperand(5);
25537 
25538       unsigned Opc = IntrData->Opc0;
25539       if (IntrData->Opc1 != 0) {
25540         SDValue Sae = Op.getOperand(6);
25541         if (isRoundModeSAE(Sae))
25542           Opc = IntrData->Opc1;
25543         else if (!isRoundModeCurDirection(Sae))
25544           return SDValue();
25545       }
25546       return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
25547                                   Mask, PassThru, Subtarget, DAG);
25548     }
25549     case BLENDV: {
25550       SDValue Src1 = Op.getOperand(1);
25551       SDValue Src2 = Op.getOperand(2);
25552       SDValue Src3 = Op.getOperand(3);
25553 
25554       EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger();
25555       Src3 = DAG.getBitcast(MaskVT, Src3);
25556 
25557       // Reverse the operands to match VSELECT order.
25558       return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
25559     }
25560     case VPERM_2OP : {
25561       SDValue Src1 = Op.getOperand(1);
25562       SDValue Src2 = Op.getOperand(2);
25563 
25564       // Swap Src1 and Src2 in the node creation
25565       return DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1);
25566     }
25567     case IFMA_OP:
25568       // NOTE: We need to swizzle the operands to pass the multiply operands
25569       // first.
25570       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25571                          Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
25572     case FPCLASSS: {
25573       SDValue Src1 = Op.getOperand(1);
25574       SDValue Imm = Op.getOperand(2);
25575       SDValue Mask = Op.getOperand(3);
25576       SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
25577       SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
25578                                                  Subtarget, DAG);
25579       // Need to fill with zeros to ensure the bitcast will produce zeroes
25580       // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
25581       SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
25582                                 DAG.getConstant(0, dl, MVT::v8i1),
25583                                 FPclassMask, DAG.getIntPtrConstant(0, dl));
25584       return DAG.getBitcast(MVT::i8, Ins);
25585     }
25586 
25587     case CMP_MASK_CC: {
25588       MVT MaskVT = Op.getSimpleValueType();
25589       SDValue CC = Op.getOperand(3);
25590       SDValue Mask = Op.getOperand(4);
25591       // We specify 2 possible opcodes for intrinsics with rounding modes.
25592       // First, we check if the intrinsic may have non-default rounding mode,
25593       // (IntrData->Opc1 != 0), then we check the rounding mode operand.
25594       if (IntrData->Opc1 != 0) {
25595         SDValue Sae = Op.getOperand(5);
25596         if (isRoundModeSAE(Sae))
25597           return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
25598                              Op.getOperand(2), CC, Mask, Sae);
25599         if (!isRoundModeCurDirection(Sae))
25600           return SDValue();
25601       }
25602       // Default rounding mode.
25603       return DAG.getNode(IntrData->Opc0, dl, MaskVT,
25604                          {Op.getOperand(1), Op.getOperand(2), CC, Mask});
25605     }
25606     case CMP_MASK_SCALAR_CC: {
25607       SDValue Src1 = Op.getOperand(1);
25608       SDValue Src2 = Op.getOperand(2);
25609       SDValue CC = Op.getOperand(3);
25610       SDValue Mask = Op.getOperand(4);
25611 
25612       SDValue Cmp;
25613       if (IntrData->Opc1 != 0) {
25614         SDValue Sae = Op.getOperand(5);
25615         if (isRoundModeSAE(Sae))
25616           Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae);
25617         else if (!isRoundModeCurDirection(Sae))
25618           return SDValue();
25619       }
25620       // Default rounding mode.
25621       if (!Cmp.getNode())
25622         Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
25623 
25624       SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
25625                                              Subtarget, DAG);
25626       // Need to fill with zeros to ensure the bitcast will produce zeroes
25627       // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
25628       SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
25629                                 DAG.getConstant(0, dl, MVT::v8i1),
25630                                 CmpMask, DAG.getIntPtrConstant(0, dl));
25631       return DAG.getBitcast(MVT::i8, Ins);
25632     }
25633     case COMI: { // Comparison intrinsics
25634       ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
25635       SDValue LHS = Op.getOperand(1);
25636       SDValue RHS = Op.getOperand(2);
25637       // Some conditions require the operands to be swapped.
25638       if (CC == ISD::SETLT || CC == ISD::SETLE)
25639         std::swap(LHS, RHS);
25640 
25641       SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
25642       SDValue SetCC;
25643       switch (CC) {
25644       case ISD::SETEQ: { // (ZF = 0 and PF = 0)
25645         SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
25646         SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
25647         SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
25648         break;
25649       }
25650       case ISD::SETNE: { // (ZF = 1 or PF = 1)
25651         SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
25652         SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
25653         SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
25654         break;
25655       }
25656       case ISD::SETGT: // (CF = 0 and ZF = 0)
25657       case ISD::SETLT: { // Condition opposite to GT. Operands swapped above.
25658         SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
25659         break;
25660       }
25661       case ISD::SETGE: // CF = 0
25662       case ISD::SETLE: // Condition opposite to GE. Operands swapped above.
25663         SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
25664         break;
25665       default:
25666         llvm_unreachable("Unexpected illegal condition!");
25667       }
25668       return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
25669     }
25670     case COMI_RM: { // Comparison intrinsics with Sae
25671       SDValue LHS = Op.getOperand(1);
25672       SDValue RHS = Op.getOperand(2);
25673       unsigned CondVal = Op.getConstantOperandVal(3);
25674       SDValue Sae = Op.getOperand(4);
25675 
25676       SDValue FCmp;
25677       if (isRoundModeCurDirection(Sae))
25678         FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
25679                            DAG.getTargetConstant(CondVal, dl, MVT::i8));
25680       else if (isRoundModeSAE(Sae))
25681         FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS,
25682                            DAG.getTargetConstant(CondVal, dl, MVT::i8), Sae);
25683       else
25684         return SDValue();
25685       // Need to fill with zeros to ensure the bitcast will produce zeroes
25686       // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
25687       SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
25688                                 DAG.getConstant(0, dl, MVT::v16i1),
25689                                 FCmp, DAG.getIntPtrConstant(0, dl));
25690       return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
25691                          DAG.getBitcast(MVT::i16, Ins));
25692     }
25693     case VSHIFT:
25694       return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
25695                                  Op.getOperand(1), Op.getOperand(2), Subtarget,
25696                                  DAG);
25697     case COMPRESS_EXPAND_IN_REG: {
25698       SDValue Mask = Op.getOperand(3);
25699       SDValue DataToCompress = Op.getOperand(1);
25700       SDValue PassThru = Op.getOperand(2);
25701       if (ISD::isBuildVectorAllOnes(Mask.getNode())) // return data as is
25702         return Op.getOperand(1);
25703 
25704       // Avoid false dependency.
25705       if (PassThru.isUndef())
25706         PassThru = DAG.getConstant(0, dl, VT);
25707 
25708       return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru,
25709                          Mask);
25710     }
25711     case FIXUPIMM:
25712     case FIXUPIMM_MASKZ: {
25713       SDValue Src1 = Op.getOperand(1);
25714       SDValue Src2 = Op.getOperand(2);
25715       SDValue Src3 = Op.getOperand(3);
25716       SDValue Imm = Op.getOperand(4);
25717       SDValue Mask = Op.getOperand(5);
25718       SDValue Passthru = (IntrData->Type == FIXUPIMM)
25719                              ? Src1
25720                              : getZeroVector(VT, Subtarget, DAG, dl);
25721 
25722       unsigned Opc = IntrData->Opc0;
25723       if (IntrData->Opc1 != 0) {
25724         SDValue Sae = Op.getOperand(6);
25725         if (isRoundModeSAE(Sae))
25726           Opc = IntrData->Opc1;
25727         else if (!isRoundModeCurDirection(Sae))
25728           return SDValue();
25729       }
25730 
25731       SDValue FixupImm = DAG.getNode(Opc, dl, VT, Src1, Src2, Src3, Imm);
25732 
25733       if (Opc == X86ISD::VFIXUPIMM || Opc == X86ISD::VFIXUPIMM_SAE)
25734         return getVectorMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
25735 
25736       return getScalarMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
25737     }
25738     case ROUNDP: {
25739       assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
25740       // Clear the upper bits of the rounding immediate so that the legacy
25741       // intrinsic can't trigger the scaling behavior of VRNDSCALE.
25742       auto Round = cast<ConstantSDNode>(Op.getOperand(2));
25743       SDValue RoundingMode =
25744           DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
25745       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25746                          Op.getOperand(1), RoundingMode);
25747     }
25748     case ROUNDS: {
25749       assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
25750       // Clear the upper bits of the rounding immediate so that the legacy
25751       // intrinsic can't trigger the scaling behavior of VRNDSCALE.
25752       auto Round = cast<ConstantSDNode>(Op.getOperand(3));
25753       SDValue RoundingMode =
25754           DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
25755       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25756                          Op.getOperand(1), Op.getOperand(2), RoundingMode);
25757     }
25758     case BEXTRI: {
25759       assert(IntrData->Opc0 == X86ISD::BEXTRI && "Unexpected opcode");
25760 
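      // BEXTR interprets only the low 16 bits of the control (start bit in
      // bits [7:0], length in bits [15:8]), so mask off anything above them.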
25761       uint64_t Imm = Op.getConstantOperandVal(2);
25762       SDValue Control = DAG.getTargetConstant(Imm & 0xffff, dl,
25763                                               Op.getValueType());
25764       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25765                          Op.getOperand(1), Control);
25766     }
25767     // ADC/ADCX/SBB
25768     case ADX: {
25769       SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
25770       SDVTList VTs = DAG.getVTList(Op.getOperand(2).getValueType(), MVT::i32);
25771 
25772       SDValue Res;
25773       // If the carry in is zero, then we should just use ADD/SUB instead of
25774       // ADC/SBB.
25775       if (isNullConstant(Op.getOperand(1))) {
25776         Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2),
25777                           Op.getOperand(3));
25778       } else {
25779         SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(1),
25780                                     DAG.getConstant(-1, dl, MVT::i8));
25781         Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2),
25782                           Op.getOperand(3), GenCF.getValue(1));
25783       }
25784       SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
25785       SDValue Results[] = { SetCC, Res };
25786       return DAG.getMergeValues(Results, dl);
25787     }
25788     case CVTPD2PS_MASK:
25789     case CVTPD2DQ_MASK:
25790     case CVTQQ2PS_MASK:
25791     case TRUNCATE_TO_REG: {
25792       SDValue Src = Op.getOperand(1);
25793       SDValue PassThru = Op.getOperand(2);
25794       SDValue Mask = Op.getOperand(3);
25795 
25796       if (isAllOnesConstant(Mask))
25797         return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
25798 
25799       MVT SrcVT = Src.getSimpleValueType();
25800       MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
25801       Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
25802       return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
25803                          {Src, PassThru, Mask});
25804     }
25805     case CVTPS2PH_MASK: {
25806       SDValue Src = Op.getOperand(1);
25807       SDValue Rnd = Op.getOperand(2);
25808       SDValue PassThru = Op.getOperand(3);
25809       SDValue Mask = Op.getOperand(4);
25810 
25811       if (isAllOnesConstant(Mask))
25812         return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src, Rnd);
25813 
25814       MVT SrcVT = Src.getSimpleValueType();
25815       MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
25816       Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
25817       return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, Rnd,
25818                          PassThru, Mask);
25819 
25820     }
25821     case CVTNEPS2BF16_MASK: {
25822       SDValue Src = Op.getOperand(1);
25823       SDValue PassThru = Op.getOperand(2);
25824       SDValue Mask = Op.getOperand(3);
25825 
25826       if (ISD::isBuildVectorAllOnes(Mask.getNode()))
25827         return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
25828 
25829       // Break false dependency.
25830       if (PassThru.isUndef())
25831         PassThru = DAG.getConstant(0, dl, PassThru.getValueType());
25832 
25833       return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
25834                          Mask);
25835     }
25836     default:
25837       break;
25838     }
25839   }
25840 
25841   switch (IntNo) {
25842   default: return SDValue();    // Don't custom lower most intrinsics.
25843 
25844   // ptest and testp intrinsics. The intrinsics these come from are designed to
25845   // return an integer value, not just an instruction, so lower them to the
25846   // ptest or testp pattern and a setcc for the result.
25847   case Intrinsic::x86_avx512_ktestc_b:
25848   case Intrinsic::x86_avx512_ktestc_w:
25849   case Intrinsic::x86_avx512_ktestc_d:
25850   case Intrinsic::x86_avx512_ktestc_q:
25851   case Intrinsic::x86_avx512_ktestz_b:
25852   case Intrinsic::x86_avx512_ktestz_w:
25853   case Intrinsic::x86_avx512_ktestz_d:
25854   case Intrinsic::x86_avx512_ktestz_q:
25855   case Intrinsic::x86_sse41_ptestz:
25856   case Intrinsic::x86_sse41_ptestc:
25857   case Intrinsic::x86_sse41_ptestnzc:
25858   case Intrinsic::x86_avx_ptestz_256:
25859   case Intrinsic::x86_avx_ptestc_256:
25860   case Intrinsic::x86_avx_ptestnzc_256:
25861   case Intrinsic::x86_avx_vtestz_ps:
25862   case Intrinsic::x86_avx_vtestc_ps:
25863   case Intrinsic::x86_avx_vtestnzc_ps:
25864   case Intrinsic::x86_avx_vtestz_pd:
25865   case Intrinsic::x86_avx_vtestc_pd:
25866   case Intrinsic::x86_avx_vtestnzc_pd:
25867   case Intrinsic::x86_avx_vtestz_ps_256:
25868   case Intrinsic::x86_avx_vtestc_ps_256:
25869   case Intrinsic::x86_avx_vtestnzc_ps_256:
25870   case Intrinsic::x86_avx_vtestz_pd_256:
25871   case Intrinsic::x86_avx_vtestc_pd_256:
25872   case Intrinsic::x86_avx_vtestnzc_pd_256: {
25873     unsigned TestOpc = X86ISD::PTEST;
25874     X86::CondCode X86CC;
25875     switch (IntNo) {
25876     default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
25877     case Intrinsic::x86_avx512_ktestc_b:
25878     case Intrinsic::x86_avx512_ktestc_w:
25879     case Intrinsic::x86_avx512_ktestc_d:
25880     case Intrinsic::x86_avx512_ktestc_q:
25881       // CF = 1
25882       TestOpc = X86ISD::KTEST;
25883       X86CC = X86::COND_B;
25884       break;
25885     case Intrinsic::x86_avx512_ktestz_b:
25886     case Intrinsic::x86_avx512_ktestz_w:
25887     case Intrinsic::x86_avx512_ktestz_d:
25888     case Intrinsic::x86_avx512_ktestz_q:
25889       TestOpc = X86ISD::KTEST;
25890       X86CC = X86::COND_E;
25891       break;
25892     case Intrinsic::x86_avx_vtestz_ps:
25893     case Intrinsic::x86_avx_vtestz_pd:
25894     case Intrinsic::x86_avx_vtestz_ps_256:
25895     case Intrinsic::x86_avx_vtestz_pd_256:
25896       TestOpc = X86ISD::TESTP;
25897       LLVM_FALLTHROUGH;
25898     case Intrinsic::x86_sse41_ptestz:
25899     case Intrinsic::x86_avx_ptestz_256:
25900       // ZF = 1
25901       X86CC = X86::COND_E;
25902       break;
25903     case Intrinsic::x86_avx_vtestc_ps:
25904     case Intrinsic::x86_avx_vtestc_pd:
25905     case Intrinsic::x86_avx_vtestc_ps_256:
25906     case Intrinsic::x86_avx_vtestc_pd_256:
25907       TestOpc = X86ISD::TESTP;
25908       LLVM_FALLTHROUGH;
25909     case Intrinsic::x86_sse41_ptestc:
25910     case Intrinsic::x86_avx_ptestc_256:
25911       // CF = 1
25912       X86CC = X86::COND_B;
25913       break;
25914     case Intrinsic::x86_avx_vtestnzc_ps:
25915     case Intrinsic::x86_avx_vtestnzc_pd:
25916     case Intrinsic::x86_avx_vtestnzc_ps_256:
25917     case Intrinsic::x86_avx_vtestnzc_pd_256:
25918       TestOpc = X86ISD::TESTP;
25919       LLVM_FALLTHROUGH;
25920     case Intrinsic::x86_sse41_ptestnzc:
25921     case Intrinsic::x86_avx_ptestnzc_256:
25922       // ZF and CF = 0
25923       X86CC = X86::COND_A;
25924       break;
25925     }
25926 
25927     SDValue LHS = Op.getOperand(1);
25928     SDValue RHS = Op.getOperand(2);
25929     SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
25930     SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
25931     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
25932   }
25933 
25934   case Intrinsic::x86_sse42_pcmpistria128:
25935   case Intrinsic::x86_sse42_pcmpestria128:
25936   case Intrinsic::x86_sse42_pcmpistric128:
25937   case Intrinsic::x86_sse42_pcmpestric128:
25938   case Intrinsic::x86_sse42_pcmpistrio128:
25939   case Intrinsic::x86_sse42_pcmpestrio128:
25940   case Intrinsic::x86_sse42_pcmpistris128:
25941   case Intrinsic::x86_sse42_pcmpestris128:
25942   case Intrinsic::x86_sse42_pcmpistriz128:
25943   case Intrinsic::x86_sse42_pcmpestriz128: {
25944     unsigned Opcode;
25945     X86::CondCode X86CC;
25946     switch (IntNo) {
25947     default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
25948     case Intrinsic::x86_sse42_pcmpistria128:
25949       Opcode = X86ISD::PCMPISTR;
25950       X86CC = X86::COND_A;
25951       break;
25952     case Intrinsic::x86_sse42_pcmpestria128:
25953       Opcode = X86ISD::PCMPESTR;
25954       X86CC = X86::COND_A;
25955       break;
25956     case Intrinsic::x86_sse42_pcmpistric128:
25957       Opcode = X86ISD::PCMPISTR;
25958       X86CC = X86::COND_B;
25959       break;
25960     case Intrinsic::x86_sse42_pcmpestric128:
25961       Opcode = X86ISD::PCMPESTR;
25962       X86CC = X86::COND_B;
25963       break;
25964     case Intrinsic::x86_sse42_pcmpistrio128:
25965       Opcode = X86ISD::PCMPISTR;
25966       X86CC = X86::COND_O;
25967       break;
25968     case Intrinsic::x86_sse42_pcmpestrio128:
25969       Opcode = X86ISD::PCMPESTR;
25970       X86CC = X86::COND_O;
25971       break;
25972     case Intrinsic::x86_sse42_pcmpistris128:
25973       Opcode = X86ISD::PCMPISTR;
25974       X86CC = X86::COND_S;
25975       break;
25976     case Intrinsic::x86_sse42_pcmpestris128:
25977       Opcode = X86ISD::PCMPESTR;
25978       X86CC = X86::COND_S;
25979       break;
25980     case Intrinsic::x86_sse42_pcmpistriz128:
25981       Opcode = X86ISD::PCMPISTR;
25982       X86CC = X86::COND_E;
25983       break;
25984     case Intrinsic::x86_sse42_pcmpestriz128:
25985       Opcode = X86ISD::PCMPESTR;
25986       X86CC = X86::COND_E;
25987       break;
25988     }
25989     SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
25990     SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
25991     SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
25992     SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
25993     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
25994   }
25995 
25996   case Intrinsic::x86_sse42_pcmpistri128:
25997   case Intrinsic::x86_sse42_pcmpestri128: {
25998     unsigned Opcode;
25999     if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
26000       Opcode = X86ISD::PCMPISTR;
26001     else
26002       Opcode = X86ISD::PCMPESTR;
26003 
26004     SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
26005     SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
26006     return DAG.getNode(Opcode, dl, VTs, NewOps);
26007   }
26008 
26009   case Intrinsic::x86_sse42_pcmpistrm128:
26010   case Intrinsic::x86_sse42_pcmpestrm128: {
26011     unsigned Opcode;
26012     if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
26013       Opcode = X86ISD::PCMPISTR;
26014     else
26015       Opcode = X86ISD::PCMPESTR;
26016 
26017     SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
26018     SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
26019     return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
26020   }
26021 
26022   case Intrinsic::eh_sjlj_lsda: {
26023     MachineFunction &MF = DAG.getMachineFunction();
26024     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26025     MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
26026     auto &Context = MF.getMMI().getContext();
26027     MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
26028                                             Twine(MF.getFunctionNumber()));
26029     return DAG.getNode(getGlobalWrapperKind(), dl, VT,
26030                        DAG.getMCSymbol(S, PtrVT));
26031   }
26032 
26033   case Intrinsic::x86_seh_lsda: {
26034     // Compute the symbol for the LSDA. We know it'll get emitted later.
26035     MachineFunction &MF = DAG.getMachineFunction();
26036     SDValue Op1 = Op.getOperand(1);
26037     auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
26038     MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
26039         GlobalValue::dropLLVMManglingEscape(Fn->getName()));
26040 
26041     // Generate a simple absolute symbol reference. This intrinsic is only
26042     // supported on 32-bit Windows, which isn't PIC.
26043     SDValue Result = DAG.getMCSymbol(LSDASym, VT);
26044     return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
26045   }
26046 
26047   case Intrinsic::eh_recoverfp: {
26048     SDValue FnOp = Op.getOperand(1);
26049     SDValue IncomingFPOp = Op.getOperand(2);
26050     GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
26051     auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
26052     if (!Fn)
26053       report_fatal_error(
26054           "llvm.eh.recoverfp must take a function as the first argument");
26055     return recoverFramePointer(DAG, Fn, IncomingFPOp);
26056   }
26057 
26058   case Intrinsic::localaddress: {
26059     // Returns one of the stack, base, or frame pointer registers, depending on
26060     // which is used to reference local variables.
26061     MachineFunction &MF = DAG.getMachineFunction();
26062     const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26063     unsigned Reg;
26064     if (RegInfo->hasBasePointer(MF))
26065       Reg = RegInfo->getBaseRegister();
26066     else { // Handles the SP or FP case.
26067       bool CantUseFP = RegInfo->hasStackRealignment(MF);
26068       if (CantUseFP)
26069         Reg = RegInfo->getPtrSizedStackRegister(MF);
26070       else
26071         Reg = RegInfo->getPtrSizedFrameRegister(MF);
26072     }
26073     return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
26074   }
26075   case Intrinsic::swift_async_context_addr: {
26076     auto &MF = DAG.getMachineFunction();
26077     auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
26078     if (Subtarget.is64Bit()) {
26079       MF.getFrameInfo().setFrameAddressIsTaken(true);
26080       X86FI->setHasSwiftAsyncContext(true);
26081       return SDValue(
26082           DAG.getMachineNode(
26083               X86::SUB64ri8, dl, MVT::i64,
26084               DAG.getCopyFromReg(DAG.getEntryNode(), dl, X86::RBP, MVT::i64),
26085               DAG.getTargetConstant(8, dl, MVT::i32)),
26086           0);
26087     } else {
26088       // 32-bit targets have no special extended frame; create or reuse an
26089       // existing stack slot.
26090       if (!X86FI->getSwiftAsyncContextFrameIdx())
26091         X86FI->setSwiftAsyncContextFrameIdx(
26092             MF.getFrameInfo().CreateStackObject(4, Align(4), false));
26093       return DAG.getFrameIndex(*X86FI->getSwiftAsyncContextFrameIdx(), MVT::i32);
26094     }
26095   }
26096   case Intrinsic::x86_avx512_vp2intersect_q_512:
26097   case Intrinsic::x86_avx512_vp2intersect_q_256:
26098   case Intrinsic::x86_avx512_vp2intersect_q_128:
26099   case Intrinsic::x86_avx512_vp2intersect_d_512:
26100   case Intrinsic::x86_avx512_vp2intersect_d_256:
26101   case Intrinsic::x86_avx512_vp2intersect_d_128: {
26102     MVT MaskVT = Op.getSimpleValueType();
26103 
26104     SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
26105     SDLoc DL(Op);
26106 
26107     SDValue Operation =
26108         DAG.getNode(X86ISD::VP2INTERSECT, DL, VTs,
26109                     Op->getOperand(1), Op->getOperand(2));
26110 
26111     SDValue Result0 = DAG.getTargetExtractSubreg(X86::sub_mask_0, DL,
26112                                                  MaskVT, Operation);
26113     SDValue Result1 = DAG.getTargetExtractSubreg(X86::sub_mask_1, DL,
26114                                                  MaskVT, Operation);
26115     return DAG.getMergeValues({Result0, Result1}, DL);
26116   }
26117   case Intrinsic::x86_mmx_pslli_w:
26118   case Intrinsic::x86_mmx_pslli_d:
26119   case Intrinsic::x86_mmx_pslli_q:
26120   case Intrinsic::x86_mmx_psrli_w:
26121   case Intrinsic::x86_mmx_psrli_d:
26122   case Intrinsic::x86_mmx_psrli_q:
26123   case Intrinsic::x86_mmx_psrai_w:
26124   case Intrinsic::x86_mmx_psrai_d: {
26125     SDLoc DL(Op);
26126     SDValue ShAmt = Op.getOperand(2);
26127     // If the argument is a constant, convert it to a target constant.
26128     if (auto *C = dyn_cast<ConstantSDNode>(ShAmt)) {
26129       // Clamp out-of-bounds shift amounts, since they will otherwise be masked
26130       // to 8 bits, which could bring them back into bounds.
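      // Illustrative example (not in the original source): a constant shift
      // amount of 256 (0x100) would be truncated to 0 by the 8-bit immediate
      // encoding and leave the vector unchanged; clamping it to 255 keeps the
      // amount out of range, so the shift still yields all zeros / sign fill.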
26131       unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255);
26132       if (ShiftAmount == 0)
26133         return Op.getOperand(1);
26134 
26135       return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
26136                          Op.getOperand(0), Op.getOperand(1),
26137                          DAG.getTargetConstant(ShiftAmount, DL, MVT::i32));
26138     }
26139 
26140     unsigned NewIntrinsic;
26141     switch (IntNo) {
26142     default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
26143     case Intrinsic::x86_mmx_pslli_w:
26144       NewIntrinsic = Intrinsic::x86_mmx_psll_w;
26145       break;
26146     case Intrinsic::x86_mmx_pslli_d:
26147       NewIntrinsic = Intrinsic::x86_mmx_psll_d;
26148       break;
26149     case Intrinsic::x86_mmx_pslli_q:
26150       NewIntrinsic = Intrinsic::x86_mmx_psll_q;
26151       break;
26152     case Intrinsic::x86_mmx_psrli_w:
26153       NewIntrinsic = Intrinsic::x86_mmx_psrl_w;
26154       break;
26155     case Intrinsic::x86_mmx_psrli_d:
26156       NewIntrinsic = Intrinsic::x86_mmx_psrl_d;
26157       break;
26158     case Intrinsic::x86_mmx_psrli_q:
26159       NewIntrinsic = Intrinsic::x86_mmx_psrl_q;
26160       break;
26161     case Intrinsic::x86_mmx_psrai_w:
26162       NewIntrinsic = Intrinsic::x86_mmx_psra_w;
26163       break;
26164     case Intrinsic::x86_mmx_psrai_d:
26165       NewIntrinsic = Intrinsic::x86_mmx_psra_d;
26166       break;
26167     }
26168 
26169     // The vector shift intrinsics with scalar shift amounts take a 32-bit value,
26170     // but the SSE2/MMX shift instructions read 64 bits. Copy the 32 bits into an
26171     // MMX register.
26172     ShAmt = DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, ShAmt);
26173     return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
26174                        DAG.getTargetConstant(NewIntrinsic, DL,
26175                                              getPointerTy(DAG.getDataLayout())),
26176                        Op.getOperand(1), ShAmt);
26177   }
26178   }
26179 }
26180 
26181 static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
26182                                  SDValue Src, SDValue Mask, SDValue Base,
26183                                  SDValue Index, SDValue ScaleOp, SDValue Chain,
26184                                  const X86Subtarget &Subtarget) {
26185   SDLoc dl(Op);
26186   auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
26187   // Scale must be constant.
26188   if (!C)
26189     return SDValue();
26190   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26191   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26192                                         TLI.getPointerTy(DAG.getDataLayout()));
26193   EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger();
26194   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
26195   // If source is undef or we know it won't be used, use a zero vector
26196   // to break register dependency.
26197   // TODO: use undef instead and let BreakFalseDeps deal with it?
26198   if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
26199     Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
26200 
26201   // Cast mask to an integer type.
26202   Mask = DAG.getBitcast(MaskVT, Mask);
26203 
26204   MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26205 
26206   SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
26207   SDValue Res =
26208       DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
26209                               MemIntr->getMemoryVT(), MemIntr->getMemOperand());
26210   return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
26211 }
26212 
26213 static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
26214                              SDValue Src, SDValue Mask, SDValue Base,
26215                              SDValue Index, SDValue ScaleOp, SDValue Chain,
26216                              const X86Subtarget &Subtarget) {
26217   MVT VT = Op.getSimpleValueType();
26218   SDLoc dl(Op);
26219   auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
26220   // Scale must be constant.
26221   if (!C)
26222     return SDValue();
26223   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26224   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26225                                         TLI.getPointerTy(DAG.getDataLayout()));
26226   unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
26227                               VT.getVectorNumElements());
26228   MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
26229 
26230   // We support two versions of the gather intrinsics. One with scalar mask and
26231   // one with vXi1 mask. Convert scalar to vXi1 if necessary.
26232   if (Mask.getValueType() != MaskVT)
26233     Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26234 
26235   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
26236   // If source is undef or we know it won't be used, use a zero vector
26237   // to break register dependency.
26238   // TODO: use undef instead and let BreakFalseDeps deal with it?
26239   if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
26240     Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
26241 
26242   MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26243 
26244   SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
26245   SDValue Res =
26246       DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
26247                               MemIntr->getMemoryVT(), MemIntr->getMemOperand());
26248   return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
26249 }
26250 
26251 static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
26252                                SDValue Src, SDValue Mask, SDValue Base,
26253                                SDValue Index, SDValue ScaleOp, SDValue Chain,
26254                                const X86Subtarget &Subtarget) {
26255   SDLoc dl(Op);
26256   auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
26257   // Scale must be constant.
26258   if (!C)
26259     return SDValue();
26260   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26261   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26262                                         TLI.getPointerTy(DAG.getDataLayout()));
26263   unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
26264                               Src.getSimpleValueType().getVectorNumElements());
26265   MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
26266 
26267   // We support two versions of the scatter intrinsics. One with scalar mask and
26268   // one with vXi1 mask. Convert scalar to vXi1 if necessary.
26269   if (Mask.getValueType() != MaskVT)
26270     Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26271 
26272   MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26273 
26274   SDVTList VTs = DAG.getVTList(MVT::Other);
26275   SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
26276   SDValue Res =
26277       DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
26278                               MemIntr->getMemoryVT(), MemIntr->getMemOperand());
26279   return Res;
26280 }
26281 
26282 static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
26283                                SDValue Mask, SDValue Base, SDValue Index,
26284                                SDValue ScaleOp, SDValue Chain,
26285                                const X86Subtarget &Subtarget) {
26286   SDLoc dl(Op);
26287   auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
26288   // Scale must be constant.
26289   if (!C)
26290     return SDValue();
26291   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26292   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26293                                         TLI.getPointerTy(DAG.getDataLayout()));
26294   SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
26295   SDValue Segment = DAG.getRegister(0, MVT::i32);
26296   MVT MaskVT =
26297     MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
26298   SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26299   SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
26300   SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
26301   return SDValue(Res, 0);
26302 }
26303 
26304 /// Handles the lowering of builtin intrinsics with chain that return their
26305 /// value into registers EDX:EAX.
26306 /// If operand SrcReg is a valid register identifier, then operand 2 of N is
26307 /// copied to SrcReg. The assumption is that SrcReg is an implicit input to
26308 /// TargetOpcode.
26309 /// Returns a Glue value which can be used to add extra copy-from-reg if the
26310 /// expanded intrinsic implicitly defines extra registers (i.e. not just
26311 /// EDX:EAX).
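/// For example (an illustrative note, not part of the original comment): RDTSC
/// is expanded through this helper with SrcReg == 0 (no implicit input), while
/// RDPMC and XGETBV pass X86::ECX so the counter/XCR index is copied into ECX
/// before the instruction executes (see LowerINTRINSIC_W_CHAIN below).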
26312 static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL,
26313                                         SelectionDAG &DAG,
26314                                         unsigned TargetOpcode,
26315                                         unsigned SrcReg,
26316                                         const X86Subtarget &Subtarget,
26317                                         SmallVectorImpl<SDValue> &Results) {
26318   SDValue Chain = N->getOperand(0);
26319   SDValue Glue;
26320 
26321   if (SrcReg) {
26322     assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
26323     Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue);
26324     Glue = Chain.getValue(1);
26325   }
26326 
26327   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
26328   SDValue N1Ops[] = {Chain, Glue};
26329   SDNode *N1 = DAG.getMachineNode(
26330       TargetOpcode, DL, Tys, ArrayRef<SDValue>(N1Ops, Glue.getNode() ? 2 : 1));
26331   Chain = SDValue(N1, 0);
26332 
26333   // The expanded instruction returns its result in registers EDX:EAX.
26334   SDValue LO, HI;
26335   if (Subtarget.is64Bit()) {
26336     LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
26337     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
26338                             LO.getValue(2));
26339   } else {
26340     LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
26341     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
26342                             LO.getValue(2));
26343   }
26344   Chain = HI.getValue(1);
26345   Glue = HI.getValue(2);
26346 
26347   if (Subtarget.is64Bit()) {
26348     // Merge the two 32-bit values into a 64-bit one.
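    // E.g. (illustrative) EDX:EAX = 0x00000001:0x23456789 becomes the i64
    // value 0x0000000123456789 = (HI << 32) | LO.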
26349     SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
26350                               DAG.getConstant(32, DL, MVT::i8));
26351     Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
26352     Results.push_back(Chain);
26353     return Glue;
26354   }
26355 
26356   // Use a buildpair to merge the two 32-bit values into a 64-bit one.
26357   SDValue Ops[] = { LO, HI };
26358   SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
26359   Results.push_back(Pair);
26360   Results.push_back(Chain);
26361   return Glue;
26362 }
26363 
26364 /// Handles the lowering of builtin intrinsics that read the time stamp counter
26365 /// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
26366 /// READCYCLECOUNTER nodes.
26367 static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
26368                                     SelectionDAG &DAG,
26369                                     const X86Subtarget &Subtarget,
26370                                     SmallVectorImpl<SDValue> &Results) {
26371   // The processor's time-stamp counter (a 64-bit MSR) is stored into the
26372   // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
26373   // and the EAX register is loaded with the low-order 32 bits.
26374   SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode,
26375                                              /* NoRegister */0, Subtarget,
26376                                              Results);
26377   if (Opcode != X86::RDTSCP)
26378     return;
26379 
26380   SDValue Chain = Results[1];
26381   // The RDTSCP instruction also loads the IA32_TSC_AUX MSR (address C0000103H)
26382   // into the ECX register. Add 'ecx' explicitly to the chain.
26383   SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, Glue);
26384   Results[1] = ecx;
26385   Results.push_back(ecx.getValue(1));
26386 }
26387 
26388 static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
26389                                      SelectionDAG &DAG) {
26390   SmallVector<SDValue, 3> Results;
26391   SDLoc DL(Op);
26392   getReadTimeStampCounter(Op.getNode(), DL, X86::RDTSC, DAG, Subtarget,
26393                           Results);
26394   return DAG.getMergeValues(Results, DL);
26395 }
26396 
26397 static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
26398   MachineFunction &MF = DAG.getMachineFunction();
26399   SDValue Chain = Op.getOperand(0);
26400   SDValue RegNode = Op.getOperand(2);
26401   WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
26402   if (!EHInfo)
26403     report_fatal_error("EH registrations only live in functions using WinEH");
26404 
26405   // Cast the operand to an alloca, and remember the frame index.
26406   auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
26407   if (!FINode)
26408     report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
26409   EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
26410 
26411   // Return the chain operand without making any DAG nodes.
26412   return Chain;
26413 }
26414 
26415 static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
26416   MachineFunction &MF = DAG.getMachineFunction();
26417   SDValue Chain = Op.getOperand(0);
26418   SDValue EHGuard = Op.getOperand(2);
26419   WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
26420   if (!EHInfo)
26421     report_fatal_error("EH guards only live in functions using WinEH");
26422 
26423   // Cast the operand to an alloca, and remember the frame index.
26424   auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
26425   if (!FINode)
26426     report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
26427   EHInfo->EHGuardFrameIndex = FINode->getIndex();
26428 
26429   // Return the chain operand without making any DAG nodes.
26430   return Chain;
26431 }
26432 
26433 /// Emit Truncating Store with signed or unsigned saturation.
26434 static SDValue
26435 EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
26436                 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
26437                 SelectionDAG &DAG) {
26438   SDVTList VTs = DAG.getVTList(MVT::Other);
26439   SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
26440   SDValue Ops[] = { Chain, Val, Ptr, Undef };
26441   unsigned Opc = SignedSat ? X86ISD::VTRUNCSTORES : X86ISD::VTRUNCSTOREUS;
26442   return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO);
26443 }
26444 
26445 /// Emit Masked Truncating Store with signed or unsigned saturation.
26446 static SDValue
26447 EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
26448                       SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
26449                       MachineMemOperand *MMO, SelectionDAG &DAG) {
26450   SDVTList VTs = DAG.getVTList(MVT::Other);
26451   SDValue Ops[] = { Chain, Val, Ptr, Mask };
26452   unsigned Opc = SignedSat ? X86ISD::VMTRUNCSTORES : X86ISD::VMTRUNCSTOREUS;
26453   return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO);
26454 }
26455 
26456 static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
26457                                       SelectionDAG &DAG) {
26458   unsigned IntNo = Op.getConstantOperandVal(1);
26459   const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
26460   if (!IntrData) {
26461     switch (IntNo) {
26462     case llvm::Intrinsic::x86_seh_ehregnode:
26463       return MarkEHRegistrationNode(Op, DAG);
26464     case llvm::Intrinsic::x86_seh_ehguard:
26465       return MarkEHGuard(Op, DAG);
26466     case llvm::Intrinsic::x86_rdpkru: {
26467       SDLoc dl(Op);
26468       SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
26469       // Create a RDPKRU node and pass 0 to the ECX parameter.
26470       return DAG.getNode(X86ISD::RDPKRU, dl, VTs, Op.getOperand(0),
26471                          DAG.getConstant(0, dl, MVT::i32));
26472     }
26473     case llvm::Intrinsic::x86_wrpkru: {
26474       SDLoc dl(Op);
26475       // Create a WRPKRU node, pass the input to the EAX parameter,  and pass 0
26476       // to the EDX and ECX parameters.
26477       return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other,
26478                          Op.getOperand(0), Op.getOperand(2),
26479                          DAG.getConstant(0, dl, MVT::i32),
26480                          DAG.getConstant(0, dl, MVT::i32));
26481     }
26482     case llvm::Intrinsic::x86_flags_read_u32:
26483     case llvm::Intrinsic::x86_flags_read_u64:
26484     case llvm::Intrinsic::x86_flags_write_u32:
26485     case llvm::Intrinsic::x86_flags_write_u64: {
26486       // We need a frame pointer because this will get lowered to a PUSH/POP
26487       // sequence.
26488       MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
26489       MFI.setHasCopyImplyingStackAdjustment(true);
26490       // Don't do anything here, we will expand these intrinsics out later
26491       // during FinalizeISel in EmitInstrWithCustomInserter.
26492       return Op;
26493     }
26494     case Intrinsic::x86_lwpins32:
26495     case Intrinsic::x86_lwpins64:
26496     case Intrinsic::x86_umwait:
26497     case Intrinsic::x86_tpause: {
26498       SDLoc dl(Op);
26499       SDValue Chain = Op->getOperand(0);
26500       SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
26501       unsigned Opcode;
26502 
26503       switch (IntNo) {
26504       default: llvm_unreachable("Impossible intrinsic");
26505       case Intrinsic::x86_umwait:
26506         Opcode = X86ISD::UMWAIT;
26507         break;
26508       case Intrinsic::x86_tpause:
26509         Opcode = X86ISD::TPAUSE;
26510         break;
26511       case Intrinsic::x86_lwpins32:
26512       case Intrinsic::x86_lwpins64:
26513         Opcode = X86ISD::LWPINS;
26514         break;
26515       }
26516 
26517       SDValue Operation =
26518           DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
26519                       Op->getOperand(3), Op->getOperand(4));
26520       SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
26521       return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
26522                          Operation.getValue(1));
26523     }
26524     case Intrinsic::x86_enqcmd:
26525     case Intrinsic::x86_enqcmds: {
26526       SDLoc dl(Op);
26527       SDValue Chain = Op.getOperand(0);
26528       SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
26529       unsigned Opcode;
26530       switch (IntNo) {
26531       default: llvm_unreachable("Impossible intrinsic!");
26532       case Intrinsic::x86_enqcmd:
26533         Opcode = X86ISD::ENQCMD;
26534         break;
26535       case Intrinsic::x86_enqcmds:
26536         Opcode = X86ISD::ENQCMDS;
26537         break;
26538       }
26539       SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op.getOperand(2),
26540                                       Op.getOperand(3));
26541       SDValue SetCC = getSETCC(X86::COND_E, Operation.getValue(0), dl, DAG);
26542       return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
26543                          Operation.getValue(1));
26544     }
26545     case Intrinsic::x86_aesenc128kl:
26546     case Intrinsic::x86_aesdec128kl:
26547     case Intrinsic::x86_aesenc256kl:
26548     case Intrinsic::x86_aesdec256kl: {
26549       SDLoc DL(Op);
26550       SDVTList VTs = DAG.getVTList(MVT::v2i64, MVT::i32, MVT::Other);
26551       SDValue Chain = Op.getOperand(0);
26552       unsigned Opcode;
26553 
26554       switch (IntNo) {
26555       default: llvm_unreachable("Impossible intrinsic");
26556       case Intrinsic::x86_aesenc128kl:
26557         Opcode = X86ISD::AESENC128KL;
26558         break;
26559       case Intrinsic::x86_aesdec128kl:
26560         Opcode = X86ISD::AESDEC128KL;
26561         break;
26562       case Intrinsic::x86_aesenc256kl:
26563         Opcode = X86ISD::AESENC256KL;
26564         break;
26565       case Intrinsic::x86_aesdec256kl:
26566         Opcode = X86ISD::AESDEC256KL;
26567         break;
26568       }
26569 
26570       MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26571       MachineMemOperand *MMO = MemIntr->getMemOperand();
26572       EVT MemVT = MemIntr->getMemoryVT();
26573       SDValue Operation = DAG.getMemIntrinsicNode(
26574           Opcode, DL, VTs, {Chain, Op.getOperand(2), Op.getOperand(3)}, MemVT,
26575           MMO);
26576       SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(1), DL, DAG);
26577 
26578       return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
26579                          {ZF, Operation.getValue(0), Operation.getValue(2)});
26580     }
26581     case Intrinsic::x86_aesencwide128kl:
26582     case Intrinsic::x86_aesdecwide128kl:
26583     case Intrinsic::x86_aesencwide256kl:
26584     case Intrinsic::x86_aesdecwide256kl: {
26585       SDLoc DL(Op);
26586       SDVTList VTs = DAG.getVTList(
26587           {MVT::i32, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64,
26588            MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::Other});
26589       SDValue Chain = Op.getOperand(0);
26590       unsigned Opcode;
26591 
26592       switch (IntNo) {
26593       default: llvm_unreachable("Impossible intrinsic");
26594       case Intrinsic::x86_aesencwide128kl:
26595         Opcode = X86ISD::AESENCWIDE128KL;
26596         break;
26597       case Intrinsic::x86_aesdecwide128kl:
26598         Opcode = X86ISD::AESDECWIDE128KL;
26599         break;
26600       case Intrinsic::x86_aesencwide256kl:
26601         Opcode = X86ISD::AESENCWIDE256KL;
26602         break;
26603       case Intrinsic::x86_aesdecwide256kl:
26604         Opcode = X86ISD::AESDECWIDE256KL;
26605         break;
26606       }
26607 
26608       MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26609       MachineMemOperand *MMO = MemIntr->getMemOperand();
26610       EVT MemVT = MemIntr->getMemoryVT();
26611       SDValue Operation = DAG.getMemIntrinsicNode(
26612           Opcode, DL, VTs,
26613           {Chain, Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
26614            Op.getOperand(5), Op.getOperand(6), Op.getOperand(7),
26615            Op.getOperand(8), Op.getOperand(9), Op.getOperand(10)},
26616           MemVT, MMO);
26617       SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(0), DL, DAG);
26618 
26619       return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
26620                          {ZF, Operation.getValue(1), Operation.getValue(2),
26621                           Operation.getValue(3), Operation.getValue(4),
26622                           Operation.getValue(5), Operation.getValue(6),
26623                           Operation.getValue(7), Operation.getValue(8),
26624                           Operation.getValue(9)});
26625     }
26626     case Intrinsic::x86_testui: {
26627       SDLoc dl(Op);
26628       SDValue Chain = Op.getOperand(0);
26629       SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
26630       SDValue Operation = DAG.getNode(X86ISD::TESTUI, dl, VTs, Chain);
26631       SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
26632       return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
26633                          Operation.getValue(1));
26634     }
26635     }
26636     return SDValue();
26637   }
26638 
26639   SDLoc dl(Op);
26640   switch(IntrData->Type) {
26641   default: llvm_unreachable("Unknown Intrinsic Type");
26642   case RDSEED:
26643   case RDRAND: {
26644     // Emit the node with the right value type.
26645     SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
26646     SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
26647 
26648     // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
26649     // Otherwise return the value from the instruction (always 0), cast to i32.
26650     SDValue Ops[] = {DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
26651                      DAG.getConstant(1, dl, Op->getValueType(1)),
26652                      DAG.getTargetConstant(X86::COND_B, dl, MVT::i8),
26653                      SDValue(Result.getNode(), 1)};
26654     SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
26655 
26656     // Return { result, isValid, chain }.
26657     return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
26658                        SDValue(Result.getNode(), 2));
26659   }
26660   case GATHER_AVX2: {
26661     SDValue Chain = Op.getOperand(0);
26662     SDValue Src   = Op.getOperand(2);
26663     SDValue Base  = Op.getOperand(3);
26664     SDValue Index = Op.getOperand(4);
26665     SDValue Mask  = Op.getOperand(5);
26666     SDValue Scale = Op.getOperand(6);
26667     return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
26668                              Scale, Chain, Subtarget);
26669   }
26670   case GATHER: {
26671   //gather(v1, mask, index, base, scale);
26672     SDValue Chain = Op.getOperand(0);
26673     SDValue Src   = Op.getOperand(2);
26674     SDValue Base  = Op.getOperand(3);
26675     SDValue Index = Op.getOperand(4);
26676     SDValue Mask  = Op.getOperand(5);
26677     SDValue Scale = Op.getOperand(6);
26678     return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale,
26679                          Chain, Subtarget);
26680   }
26681   case SCATTER: {
26682   //scatter(base, mask, index, v1, scale);
26683     SDValue Chain = Op.getOperand(0);
26684     SDValue Base  = Op.getOperand(2);
26685     SDValue Mask  = Op.getOperand(3);
26686     SDValue Index = Op.getOperand(4);
26687     SDValue Src   = Op.getOperand(5);
26688     SDValue Scale = Op.getOperand(6);
26689     return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
26690                           Scale, Chain, Subtarget);
26691   }
26692   case PREFETCH: {
26693     const APInt &HintVal = Op.getConstantOperandAPInt(6);
26694     assert((HintVal == 2 || HintVal == 3) &&
26695            "Wrong prefetch hint in intrinsic: should be 2 or 3");
26696     unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
26697     SDValue Chain = Op.getOperand(0);
26698     SDValue Mask  = Op.getOperand(2);
26699     SDValue Index = Op.getOperand(3);
26700     SDValue Base  = Op.getOperand(4);
26701     SDValue Scale = Op.getOperand(5);
26702     return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
26703                            Subtarget);
26704   }
26705   // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
26706   case RDTSC: {
26707     SmallVector<SDValue, 2> Results;
26708     getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
26709                             Results);
26710     return DAG.getMergeValues(Results, dl);
26711   }
26712   // Read Performance Monitoring Counters.
26713   case RDPMC:
26714   // GetExtended Control Register.
26715   case XGETBV: {
26716     SmallVector<SDValue, 2> Results;
26717 
26718     // RDPMC uses ECX to select the index of the performance counter to read.
26719     // XGETBV uses ECX to select the index of the XCR register to return.
26720     // The result is stored into registers EDX:EAX.
26721     expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX,
26722                                 Subtarget, Results);
26723     return DAG.getMergeValues(Results, dl);
26724   }
26725   // XTEST intrinsics.
26726   case XTEST: {
26727     SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
26728     SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
26729 
26730     SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
26731     SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
26732     return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
26733                        Ret, SDValue(InTrans.getNode(), 1));
26734   }
26735   case TRUNCATE_TO_MEM_VI8:
26736   case TRUNCATE_TO_MEM_VI16:
26737   case TRUNCATE_TO_MEM_VI32: {
26738     SDValue Mask = Op.getOperand(4);
26739     SDValue DataToTruncate = Op.getOperand(3);
26740     SDValue Addr = Op.getOperand(2);
26741     SDValue Chain = Op.getOperand(0);
26742 
26743     MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
26744     assert(MemIntr && "Expected MemIntrinsicSDNode!");
26745 
26746     EVT MemVT  = MemIntr->getMemoryVT();
26747 
26748     uint16_t TruncationOp = IntrData->Opc0;
26749     switch (TruncationOp) {
26750     case X86ISD::VTRUNC: {
26751       if (isAllOnesConstant(Mask)) // return just a truncate store
26752         return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
26753                                  MemIntr->getMemOperand());
26754 
26755       MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
26756       SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26757       SDValue Offset = DAG.getUNDEF(VMask.getValueType());
26758 
26759       return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, Offset, VMask,
26760                                 MemVT, MemIntr->getMemOperand(), ISD::UNINDEXED,
26761                                 true /* truncating */);
26762     }
26763     case X86ISD::VTRUNCUS:
26764     case X86ISD::VTRUNCS: {
26765       bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
26766       if (isAllOnesConstant(Mask))
26767         return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
26768                                MemIntr->getMemOperand(), DAG);
26769 
26770       MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
26771       SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26772 
26773       return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
26774                                    VMask, MemVT, MemIntr->getMemOperand(), DAG);
26775     }
26776     default:
26777       llvm_unreachable("Unsupported truncstore intrinsic");
26778     }
26779   }
26780   }
26781 }
26782 
26783 SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
26784                                            SelectionDAG &DAG) const {
26785   MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
26786   MFI.setReturnAddressIsTaken(true);
26787 
26788   if (verifyReturnAddressArgumentIsConstant(Op, DAG))
26789     return SDValue();
26790 
26791   unsigned Depth = Op.getConstantOperandVal(0);
26792   SDLoc dl(Op);
26793   EVT PtrVT = getPointerTy(DAG.getDataLayout());
26794 
26795   if (Depth > 0) {
26796     SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
26797     const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26798     SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
26799     return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
26800                        DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
26801                        MachinePointerInfo());
26802   }
26803 
26804   // Just load the return address.
26805   SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
26806   return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
26807                      MachinePointerInfo());
26808 }
26809 
26810 SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
26811                                                  SelectionDAG &DAG) const {
26812   DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
26813   return getReturnAddressFrameIndex(DAG);
26814 }
26815 
26816 SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
26817   MachineFunction &MF = DAG.getMachineFunction();
26818   MachineFrameInfo &MFI = MF.getFrameInfo();
26819   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
26820   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26821   EVT VT = Op.getValueType();
26822 
26823   MFI.setFrameAddressIsTaken(true);
26824 
26825   if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
26826     // Depth > 0 makes no sense on targets which use Windows unwind codes.  It
26827     // is not possible to crawl up the stack without looking at the unwind codes
26828     // simultaneously.
26829     int FrameAddrIndex = FuncInfo->getFAIndex();
26830     if (!FrameAddrIndex) {
26831       // Set up a frame object for the return address.
26832       unsigned SlotSize = RegInfo->getSlotSize();
26833       FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
26834           SlotSize, /*SPOffset=*/0, /*IsImmutable=*/false);
26835       FuncInfo->setFAIndex(FrameAddrIndex);
26836     }
26837     return DAG.getFrameIndex(FrameAddrIndex, VT);
26838   }
26839 
26840   unsigned FrameReg =
26841       RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
26842   SDLoc dl(Op);  // FIXME probably not meaningful
26843   unsigned Depth = Op.getConstantOperandVal(0);
26844   assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
26845           (FrameReg == X86::EBP && VT == MVT::i32)) &&
26846          "Invalid Frame Register!");
26847   SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
26848   while (Depth--)
26849     FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
26850                             MachinePointerInfo());
26851   return FrameAddr;
26852 }
26853 
26854 // FIXME? Maybe this could be a TableGen attribute on some registers and
26855 // this table could be generated automatically from RegInfo.
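// Usage sketch (an illustrative note, not part of the original comment): this
// hook backs the llvm.read_register / llvm.write_register intrinsics, e.g.
// Clang named-register globals such as `register uintptr_t sp __asm__("rsp");`.
// Only the stack- and frame-pointer names are recognized below.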
26856 Register X86TargetLowering::getRegisterByName(const char* RegName, LLT VT,
26857                                               const MachineFunction &MF) const {
26858   const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
26859 
26860   Register Reg = StringSwitch<unsigned>(RegName)
26861                        .Case("esp", X86::ESP)
26862                        .Case("rsp", X86::RSP)
26863                        .Case("ebp", X86::EBP)
26864                        .Case("rbp", X86::RBP)
26865                        .Default(0);
26866 
26867   if (Reg == X86::EBP || Reg == X86::RBP) {
26868     if (!TFI.hasFP(MF))
26869       report_fatal_error("register " + StringRef(RegName) +
26870                          " is allocatable: function has no frame pointer");
26871 #ifndef NDEBUG
26872     else {
26873       const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26874       Register FrameReg = RegInfo->getPtrSizedFrameRegister(MF);
26875       assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
26876              "Invalid Frame Register!");
26877     }
26878 #endif
26879   }
26880 
26881   if (Reg)
26882     return Reg;
26883 
26884   report_fatal_error("Invalid register name global variable");
26885 }
26886 
26887 SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
26888                                                      SelectionDAG &DAG) const {
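  // Layout sketch (an illustrative note, not part of the original source): the
  // first incoming argument sits above the saved frame pointer and the return
  // address, i.e. 2 * SlotSize bytes from the frame pointer (16 bytes on
  // x86-64, 8 bytes on 32-bit x86).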
26889   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26890   return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
26891 }
26892 
26893 Register X86TargetLowering::getExceptionPointerRegister(
26894     const Constant *PersonalityFn) const {
26895   if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
26896     return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
26897 
26898   return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
26899 }
26900 
26901 Register X86TargetLowering::getExceptionSelectorRegister(
26902     const Constant *PersonalityFn) const {
26903   // Funclet personalities don't use selectors (the runtime does the selection).
26904   if (isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)))
26905     return X86::NoRegister;
26906   return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
26907 }
26908 
26909 bool X86TargetLowering::needsFixedCatchObjects() const {
26910   return Subtarget.isTargetWin64();
26911 }
26912 
26913 SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
26914   SDValue Chain     = Op.getOperand(0);
26915   SDValue Offset    = Op.getOperand(1);
26916   SDValue Handler   = Op.getOperand(2);
26917   SDLoc dl      (Op);
26918 
26919   EVT PtrVT = getPointerTy(DAG.getDataLayout());
26920   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26921   Register FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
26922   assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
26923           (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
26924          "Invalid Frame Register!");
26925   SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
26926   Register StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
26927 
26928   SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
26929                                  DAG.getIntPtrConstant(RegInfo->getSlotSize(),
26930                                                        dl));
26931   StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
26932   Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
26933   Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
26934 
26935   return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
26936                      DAG.getRegister(StoreAddrReg, PtrVT));
26937 }
26938 
26939 SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
26940                                                SelectionDAG &DAG) const {
26941   SDLoc DL(Op);
26942   // If the subtarget is not 64bit, we may need the global base reg
26943   // after isel expand pseudo, i.e., after CGBR pass ran.
26944   // Therefore, ask for the GlobalBaseReg now, so that the pass
26945   // inserts the code for us in case we need it.
26946   // Otherwise, we will end up in a situation where we will
26947   // reference a virtual register that is not defined!
26948   if (!Subtarget.is64Bit()) {
26949     const X86InstrInfo *TII = Subtarget.getInstrInfo();
26950     (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
26951   }
26952   return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
26953                      DAG.getVTList(MVT::i32, MVT::Other),
26954                      Op.getOperand(0), Op.getOperand(1));
26955 }
26956 
26957 SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
26958                                                 SelectionDAG &DAG) const {
26959   SDLoc DL(Op);
26960   return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
26961                      Op.getOperand(0), Op.getOperand(1));
26962 }
26963 
26964 SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
26965                                                        SelectionDAG &DAG) const {
26966   SDLoc DL(Op);
26967   return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
26968                      Op.getOperand(0));
26969 }
26970 
26971 static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
26972   return Op.getOperand(0);
26973 }
26974 
26975 SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
26976                                                 SelectionDAG &DAG) const {
26977   SDValue Root = Op.getOperand(0);
26978   SDValue Trmp = Op.getOperand(1); // trampoline
26979   SDValue FPtr = Op.getOperand(2); // nested function
26980   SDValue Nest = Op.getOperand(3); // 'nest' parameter value
26981   SDLoc dl (Op);
26982 
26983   const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
26984   const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
26985 
26986   if (Subtarget.is64Bit()) {
26987     SDValue OutChains[6];
26988 
26989     // Large code-model.
26990     const unsigned char JMP64r  = 0xFF; // 64-bit jmp through register opcode.
26991     const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
26992 
26993     const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
26994     const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
26995 
26996     const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
26997 
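    // Resulting trampoline layout (a sketch derived from the encodings above,
    // for illustration only):
    //   offset  0: 49 BB <8-byte FPtr>   movabsq $FPtr, %r11
    //   offset 10: 49 BA <8-byte Nest>   movabsq $Nest, %r10
    //   offset 20: 49 FF E3              jmpq    *%r11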
26998     // Load the pointer to the nested function into R11.
26999     unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
27000     SDValue Addr = Trmp;
27001     OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
27002                                 Addr, MachinePointerInfo(TrmpAddr));
27003 
27004     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27005                        DAG.getConstant(2, dl, MVT::i64));
27006     OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
27007                                 MachinePointerInfo(TrmpAddr, 2), Align(2));
27008 
27009     // Load the 'nest' parameter value into R10.
27010     // R10 is specified in X86CallingConv.td
27011     OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
27012     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27013                        DAG.getConstant(10, dl, MVT::i64));
27014     OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
27015                                 Addr, MachinePointerInfo(TrmpAddr, 10));
27016 
27017     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27018                        DAG.getConstant(12, dl, MVT::i64));
27019     OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
27020                                 MachinePointerInfo(TrmpAddr, 12), Align(2));
27021 
27022     // Jump to the nested function.
27023     OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
27024     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27025                        DAG.getConstant(20, dl, MVT::i64));
27026     OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
27027                                 Addr, MachinePointerInfo(TrmpAddr, 20));
27028 
27029     unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
27030     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27031                        DAG.getConstant(22, dl, MVT::i64));
27032     OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
27033                                 Addr, MachinePointerInfo(TrmpAddr, 22));
27034 
27035     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
27036   } else {
27037     const Function *Func =
27038       cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
27039     CallingConv::ID CC = Func->getCallingConv();
27040     unsigned NestReg;
27041 
27042     switch (CC) {
27043     default:
27044       llvm_unreachable("Unsupported calling convention");
27045     case CallingConv::C:
27046     case CallingConv::X86_StdCall: {
27047       // Pass 'nest' parameter in ECX.
27048       // Must be kept in sync with X86CallingConv.td
27049       NestReg = X86::ECX;
27050 
27051       // Check that ECX wasn't needed by an 'inreg' parameter.
27052       FunctionType *FTy = Func->getFunctionType();
27053       const AttributeList &Attrs = Func->getAttributes();
27054 
27055       if (!Attrs.isEmpty() && !Func->isVarArg()) {
27056         unsigned InRegCount = 0;
27057         unsigned Idx = 1;
27058 
27059         for (FunctionType::param_iterator I = FTy->param_begin(),
27060              E = FTy->param_end(); I != E; ++I, ++Idx)
27061           if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
27062             const DataLayout &DL = DAG.getDataLayout();
27063             // FIXME: should only count parameters that are lowered to integers.
27064             InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
27065           }
27066 
27067         if (InRegCount > 2) {
27068           report_fatal_error("Nest register in use - reduce number of inreg"
27069                              " parameters!");
27070         }
27071       }
27072       break;
27073     }
27074     case CallingConv::X86_FastCall:
27075     case CallingConv::X86_ThisCall:
27076     case CallingConv::Fast:
27077     case CallingConv::Tail:
27078     case CallingConv::SwiftTail:
27079       // Pass 'nest' parameter in EAX.
27080       // Must be kept in sync with X86CallingConv.td
27081       NestReg = X86::EAX;
27082       break;
27083     }
27084 
27085     SDValue OutChains[4];
27086     SDValue Addr, Disp;
27087 
27088     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
27089                        DAG.getConstant(10, dl, MVT::i32));
27090     Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
27091 
27092     // This is storing the opcode for MOV32ri.
27093     const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
27094     const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
27095     OutChains[0] =
27096         DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
27097                      Trmp, MachinePointerInfo(TrmpAddr));
27098 
27099     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
27100                        DAG.getConstant(1, dl, MVT::i32));
27101     OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
27102                                 MachinePointerInfo(TrmpAddr, 1), Align(1));
27103 
27104     const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
27105     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
27106                        DAG.getConstant(5, dl, MVT::i32));
27107     OutChains[2] =
27108         DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), Addr,
27109                      MachinePointerInfo(TrmpAddr, 5), Align(1));
27110 
27111     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
27112                        DAG.getConstant(6, dl, MVT::i32));
27113     OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
27114                                 MachinePointerInfo(TrmpAddr, 6), Align(1));
27115 
27116     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
27117   }
27118 }
27119 
27120 SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
27121                                             SelectionDAG &DAG) const {
27122   /*
27123    The rounding mode is in bits 11:10 of FPSR, and has the following
27124    settings:
27125      00 Round to nearest
27126      01 Round to -inf
27127      10 Round to +inf
27128      11 Round to 0
27129 
27130   FLT_ROUNDS, on the other hand, expects the following:
27131     -1 Undefined
27132      0 Round to 0
27133      1 Round to nearest
27134      2 Round to +inf
27135      3 Round to -inf
27136 
27137   To perform the conversion, we use a packed lookup table of the four 2-bit
27138   values that we can index by FPSR[11:10]
27139     0x2d --> (0b00,10,11,01) --> (0,2,3,1) >> FPSR[11:10]
27140 
27141     (0x2d >> ((FPSR & 0xc00) >> 9)) & 3
27142   */
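  // Worked example (illustrative): if FPSR[11:10] == 01 (round toward -inf),
  // then FPSR & 0xc00 == 0x400, (0x400 >> 9) == 2, and (0x2d >> 2) & 3 == 3,
  // which is FLT_ROUNDS' encoding for "round to -inf". The other fields map
  // the same way: 00 -> 1, 10 -> 2, 11 -> 0.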
27143 
27144   MachineFunction &MF = DAG.getMachineFunction();
27145   MVT VT = Op.getSimpleValueType();
27146   SDLoc DL(Op);
27147 
27148   // Save FP Control Word to stack slot
27149   int SSFI = MF.getFrameInfo().CreateStackObject(2, Align(2), false);
27150   SDValue StackSlot =
27151       DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
27152 
27153   MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
27154 
27155   SDValue Chain = Op.getOperand(0);
27156   SDValue Ops[] = {Chain, StackSlot};
27157   Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
27158                                   DAG.getVTList(MVT::Other), Ops, MVT::i16, MPI,
27159                                   Align(2), MachineMemOperand::MOStore);
27160 
27161   // Load FP Control Word from stack slot
27162   SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI, Align(2));
27163   Chain = CWD.getValue(1);
27164 
27165   // Mask and turn the control bits into a shift for the lookup table.
27166   SDValue Shift =
27167     DAG.getNode(ISD::SRL, DL, MVT::i16,
27168                 DAG.getNode(ISD::AND, DL, MVT::i16,
27169                             CWD, DAG.getConstant(0xc00, DL, MVT::i16)),
27170                 DAG.getConstant(9, DL, MVT::i8));
27171   Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Shift);
27172 
27173   SDValue LUT = DAG.getConstant(0x2d, DL, MVT::i32);
27174   SDValue RetVal =
27175     DAG.getNode(ISD::AND, DL, MVT::i32,
27176                 DAG.getNode(ISD::SRL, DL, MVT::i32, LUT, Shift),
27177                 DAG.getConstant(3, DL, MVT::i32));
27178 
27179   RetVal = DAG.getZExtOrTrunc(RetVal, DL, VT);
27180 
27181   return DAG.getMergeValues({RetVal, Chain}, DL);
27182 }
27183 
27184 SDValue X86TargetLowering::LowerSET_ROUNDING(SDValue Op,
27185                                              SelectionDAG &DAG) const {
27186   MachineFunction &MF = DAG.getMachineFunction();
27187   SDLoc DL(Op);
27188   SDValue Chain = Op.getNode()->getOperand(0);
27189 
27190   // The FP control word can be set only from data in memory, so we need to
27191   // allocate stack space to save/load it.
27192   int OldCWFrameIdx = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
27193   SDValue StackSlot =
27194       DAG.getFrameIndex(OldCWFrameIdx, getPointerTy(DAG.getDataLayout()));
27195   MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, OldCWFrameIdx);
27196   MachineMemOperand *MMO =
27197       MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 2, Align(2));
27198 
27199   // Store FP control word into memory.
27200   SDValue Ops[] = {Chain, StackSlot};
27201   Chain = DAG.getMemIntrinsicNode(
27202       X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other), Ops, MVT::i16, MMO);
27203 
27204   // Load FP Control Word from stack slot and clear RM field (bits 11:10).
27205   SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI);
27206   Chain = CWD.getValue(1);
27207   CWD = DAG.getNode(ISD::AND, DL, MVT::i16, CWD.getValue(0),
27208                     DAG.getConstant(0xf3ff, DL, MVT::i16));
27209 
27210   // Calculate new rounding mode.
27211   SDValue NewRM = Op.getNode()->getOperand(1);
27212   SDValue RMBits;
27213   if (auto *CVal = dyn_cast<ConstantSDNode>(NewRM)) {
27214     uint64_t RM = CVal->getZExtValue();
27215     int FieldVal;
27216     switch (static_cast<RoundingMode>(RM)) {
27217     case RoundingMode::NearestTiesToEven: FieldVal = X86::rmToNearest; break;
27218     case RoundingMode::TowardNegative:    FieldVal = X86::rmDownward; break;
27219     case RoundingMode::TowardPositive:    FieldVal = X86::rmUpward; break;
27220     case RoundingMode::TowardZero:        FieldVal = X86::rmTowardZero; break;
27221     default:
27222       llvm_unreachable("rounding mode is not supported by X86 hardware");
27223     }
27224     RMBits = DAG.getConstant(FieldVal, DL, MVT::i16);
27225   } else {
27226     // Need to convert argument into bits of control word:
27227     //    0 Round to 0       -> 11
27228     //    1 Round to nearest -> 00
27229     //    2 Round to +inf    -> 10
27230     //    3 Round to -inf    -> 01
27231     // The 2-bit value then needs to be shifted so that it occupies bits 11:10.
27232     // To make the conversion, pack all four field values into the constant 0xc9
27233     // and shift it left depending on the rounding mode:
27234     //    (0xc9 << 4) & 0xc00 = X86::rmTowardZero
27235     //    (0xc9 << 6) & 0xc00 = X86::rmToNearest
27236     //    ...
27237     // (0xc9 << (2 * NewRM + 4)) & 0xc00
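    // For example, NewRM == 3 (round toward -inf) gives a shift of
    // 2 * 3 + 4 == 10, and (0xc9 << 10) & 0xc00 == 0x400, i.e. RM field 01
    // (X86::rmDownward).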
27238     SDValue ShiftValue =
27239         DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
27240                     DAG.getNode(ISD::ADD, DL, MVT::i32,
27241                                 DAG.getNode(ISD::SHL, DL, MVT::i32, NewRM,
27242                                             DAG.getConstant(1, DL, MVT::i8)),
27243                                 DAG.getConstant(4, DL, MVT::i32)));
27244     SDValue Shifted =
27245         DAG.getNode(ISD::SHL, DL, MVT::i16, DAG.getConstant(0xc9, DL, MVT::i16),
27246                     ShiftValue);
27247     RMBits = DAG.getNode(ISD::AND, DL, MVT::i16, Shifted,
27248                          DAG.getConstant(0xc00, DL, MVT::i16));
27249   }
27250 
27251   // Update rounding mode bits and store the new FP Control Word into stack.
27252   CWD = DAG.getNode(ISD::OR, DL, MVT::i16, CWD, RMBits);
27253   Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, /* Alignment = */ 2);
27254 
27255   // Load FP control word from the slot.
27256   SDValue OpsLD[] = {Chain, StackSlot};
27257   MachineMemOperand *MMOL =
27258       MF.getMachineMemOperand(MPI, MachineMemOperand::MOLoad, 2, Align(2));
27259   Chain = DAG.getMemIntrinsicNode(
27260       X86ISD::FLDCW16m, DL, DAG.getVTList(MVT::Other), OpsLD, MVT::i16, MMOL);
27261 
27262   // If target supports SSE, set MXCSR as well. Rounding mode is encoded in the
27263   // same way but in bits 14:13.
27264   if (Subtarget.hasSSE1()) {
27265     // Store MXCSR into memory.
27266     Chain = DAG.getNode(
27267         ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
27268         DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
27269         StackSlot);
27270 
27271     // Load MXCSR from stack slot and clear RM field (bits 14:13).
27272     SDValue CWD = DAG.getLoad(MVT::i32, DL, Chain, StackSlot, MPI);
27273     Chain = CWD.getValue(1);
27274     CWD = DAG.getNode(ISD::AND, DL, MVT::i32, CWD.getValue(0),
27275                       DAG.getConstant(0xffff9fff, DL, MVT::i32));
27276 
27277     // Shift X87 RM bits from 11:10 to 14:13.
27278     RMBits = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, RMBits);
27279     RMBits = DAG.getNode(ISD::SHL, DL, MVT::i32, RMBits,
27280                          DAG.getConstant(3, DL, MVT::i8));
27281 
27282     // Update rounding mode bits and store the new FP Control Word into stack.
27283     CWD = DAG.getNode(ISD::OR, DL, MVT::i32, CWD, RMBits);
27284     Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, /* Alignment = */ 4);
27285 
27286     // Load MXCSR from the slot.
27287     Chain = DAG.getNode(
27288         ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
27289         DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
27290         StackSlot);
27291   }
27292 
27293   return Chain;
27294 }
27295 
27296 /// Lower a vector CTLZ using the natively supported vector CTLZ instruction.
27297 //
27298 // i8/i16 vectors are implemented using the dword LZCNT vector instruction
27299 // ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
27300 // split the vector, perform the operation on its Lo and Hi parts and
27301 // concatenate the results.
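// For example, for an i8 element x == 0x10: lzcnt32(zext32(x)) == 27 and
// 27 - (32 - 8) == 3 == ctlz8(x).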
27302 static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
27303                                          const X86Subtarget &Subtarget) {
27304   assert(Op.getOpcode() == ISD::CTLZ);
27305   SDLoc dl(Op);
27306   MVT VT = Op.getSimpleValueType();
27307   MVT EltVT = VT.getVectorElementType();
27308   unsigned NumElems = VT.getVectorNumElements();
27309 
27310   assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
27311           "Unsupported element type");
27312 
27313   // Split the vector; its Lo and Hi parts will be handled in the next iteration.
27314   if (NumElems > 16 ||
27315       (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
27316     return splitVectorIntUnary(Op, DAG);
27317 
27318   MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
27319   assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
27320           "Unsupported value type for operation");
27321 
27322   // Use the natively supported vector instruction vplzcntd.
27323   Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
27324   SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
27325   SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
27326   SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
27327 
27328   return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
27329 }
27330 
27331 // Lower CTLZ using a PSHUFB lookup table implementation.
27332 static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
27333                                        const X86Subtarget &Subtarget,
27334                                        SelectionDAG &DAG) {
27335   MVT VT = Op.getSimpleValueType();
27336   int NumElts = VT.getVectorNumElements();
27337   int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
27338   MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
27339 
27340   // Per-nibble leading zero PSHUFB lookup table.
27341   const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
27342                        /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
27343                        /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
27344                        /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
27345 
27346   SmallVector<SDValue, 64> LUTVec;
27347   for (int i = 0; i < NumBytes; ++i)
27348     LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
27349   SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
27350 
27351   // Begin by bitcasting the input to a byte vector, then split those bytes
27352   // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
27353   // If the hi input nibble is zero then we add both results together, otherwise
27354   // we just take the hi result (by masking the lo result to zero before the
27355   // add).
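  // Worked example for one byte: for 0x1c the hi nibble is 0x1 (LUT gives 3)
  // and is non-zero, so the lo result is masked away and the answer is
  // 3 == ctlz8(0x1c). For 0x05 the hi nibble is zero (LUT gives 4), so the lo
  // result 1 is added, giving 5 == ctlz8(0x05).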
27356   SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
27357   SDValue Zero = DAG.getConstant(0, DL, CurrVT);
27358 
27359   SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
27360   SDValue Lo = Op0;
27361   SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
27362   SDValue HiZ;
27363   if (CurrVT.is512BitVector()) {
27364     MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
27365     HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
27366     HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
27367   } else {
27368     HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
27369   }
27370 
27371   Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
27372   Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
27373   Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
27374   SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
27375 
27376   // Merge the result from vXi8 back to VT, working on the lo/hi halves
27377   // of the current vector width in the same way we did for the nibbles.
27378   // If the upper half of the input element is zero then add the halves'
27379   // leading zero counts together, otherwise just use the upper half's.
27380   // Double the width of the result until we are at target width.
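  // E.g. when widening i8 counts to i16: for an i16 element 0x00f0 the upper
  // byte is zero, so the result is 8 (the upper byte's count) plus
  // ctlz8(0xf0) == 0, giving 8 == ctlz16(0x00f0).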
27381   while (CurrVT != VT) {
27382     int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
27383     int CurrNumElts = CurrVT.getVectorNumElements();
27384     MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
27385     MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
27386     SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
27387 
27388     // Check if the upper half of the input element is zero.
27389     if (CurrVT.is512BitVector()) {
27390       MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
27391       HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
27392                          DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
27393       HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
27394     } else {
27395       HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
27396                          DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
27397     }
27398     HiZ = DAG.getBitcast(NextVT, HiZ);
27399 
27400     // Move the upper/lower halves to the lower bits as we'll be extending to
27401     // NextVT. Mask the lower result to zero if HiZ is true and add the results
27402     // together.
27403     SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
27404     SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
27405     SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
27406     R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
27407     Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
27408     CurrVT = NextVT;
27409   }
27410 
27411   return Res;
27412 }
27413 
27414 static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
27415                                const X86Subtarget &Subtarget,
27416                                SelectionDAG &DAG) {
27417   MVT VT = Op.getSimpleValueType();
27418 
27419   if (Subtarget.hasCDI() &&
27420       // vXi8 vectors need to be promoted to vXi32, which requires 512-bit vectors.
27421       (Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
27422     return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);
27423 
27424   // Decompose 256-bit ops into smaller 128-bit ops.
27425   if (VT.is256BitVector() && !Subtarget.hasInt256())
27426     return splitVectorIntUnary(Op, DAG);
27427 
27428   // Decompose 512-bit ops into smaller 256-bit ops.
27429   if (VT.is512BitVector() && !Subtarget.hasBWI())
27430     return splitVectorIntUnary(Op, DAG);
27431 
27432   assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
27433   return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
27434 }
27435 
27436 static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
27437                          SelectionDAG &DAG) {
27438   MVT VT = Op.getSimpleValueType();
27439   MVT OpVT = VT;
27440   unsigned NumBits = VT.getSizeInBits();
27441   SDLoc dl(Op);
27442   unsigned Opc = Op.getOpcode();
27443 
27444   if (VT.isVector())
27445     return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
27446 
27447   Op = Op.getOperand(0);
27448   if (VT == MVT::i8) {
27449     // Zero extend to i32 since there is no i8 bsr.
27450     OpVT = MVT::i32;
27451     Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
27452   }
27453 
27454   // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
27455   SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
27456   Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
27457 
27458   if (Opc == ISD::CTLZ) {
27459     // If src is zero (i.e. bsr sets ZF), select 2*NumBits-1 (XORed to NumBits below).
27460     SDValue Ops[] = {Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
27461                      DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
27462                      Op.getValue(1)};
27463     Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
27464   }
27465 
27466   // Finally xor with NumBits-1.
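  // E.g. for an i32 input of 0x00008000, BSR returns 15 and 15 ^ 31 == 16,
  // which is ctlz(0x00008000).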
27467   Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
27468                    DAG.getConstant(NumBits - 1, dl, OpVT));
27469 
27470   if (VT == MVT::i8)
27471     Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
27472   return Op;
27473 }
27474 
27475 static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
27476                          SelectionDAG &DAG) {
27477   MVT VT = Op.getSimpleValueType();
27478   unsigned NumBits = VT.getScalarSizeInBits();
27479   SDValue N0 = Op.getOperand(0);
27480   SDLoc dl(Op);
27481 
27482   assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
27483          "Only scalar CTTZ requires custom lowering");
27484 
27485   // Issue a bsf (scan bits forward) which also sets EFLAGS.
27486   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
27487   Op = DAG.getNode(X86ISD::BSF, dl, VTs, N0);
27488 
27489   // If src is zero (i.e. bsf sets ZF), returns NumBits.
27490   SDValue Ops[] = {Op, DAG.getConstant(NumBits, dl, VT),
27491                    DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
27492                    Op.getValue(1)};
27493   return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
27494 }
27495 
27496 static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG,
27497                            const X86Subtarget &Subtarget) {
27498   MVT VT = Op.getSimpleValueType();
27499   if (VT == MVT::i16 || VT == MVT::i32)
27500     return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
27501 
27502   if (VT == MVT::v32i16 || VT == MVT::v64i8)
27503     return splitVectorIntBinary(Op, DAG);
27504 
27505   assert(Op.getSimpleValueType().is256BitVector() &&
27506          Op.getSimpleValueType().isInteger() &&
27507          "Only handle AVX 256-bit vector integer operation");
27508   return splitVectorIntBinary(Op, DAG);
27509 }
27510 
27511 static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG,
27512                                   const X86Subtarget &Subtarget) {
27513   MVT VT = Op.getSimpleValueType();
27514   SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
27515   unsigned Opcode = Op.getOpcode();
27516   SDLoc DL(Op);
27517 
27518   if (VT == MVT::v32i16 || VT == MVT::v64i8 ||
27519       (VT.is256BitVector() && !Subtarget.hasInt256())) {
27520     assert(Op.getSimpleValueType().isInteger() &&
27521            "Only handle AVX vector integer operation");
27522     return splitVectorIntBinary(Op, DAG);
27523   }
27524 
27525   // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*.
27526   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27527   EVT SetCCResultType =
27528       TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
27529 
27530   if (Opcode == ISD::USUBSAT && !TLI.isOperationLegal(ISD::UMAX, VT)) {
27531     // usubsat X, Y --> (X >u Y) ? X - Y : 0
27532     SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
27533     SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT);
27534     // TODO: Move this to DAGCombiner?
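    // If the setcc already produces an all-ones/zero mask of the same width,
    // the select collapses to a plain AND with the subtraction result.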
27535     if (SetCCResultType == VT &&
27536         DAG.ComputeNumSignBits(Cmp) == VT.getScalarSizeInBits())
27537       return DAG.getNode(ISD::AND, DL, VT, Cmp, Sub);
27538     return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT));
27539   }
27540 
27541   // Use default expansion.
27542   return SDValue();
27543 }
27544 
27545 static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
27546                         SelectionDAG &DAG) {
27547   MVT VT = Op.getSimpleValueType();
27548   if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
27549     // Since X86 does not have CMOV for 8-bit integers, we don't convert
27550     // 8-bit integer abs to NEG and CMOV.
27551     SDLoc DL(Op);
27552     SDValue N0 = Op.getOperand(0);
27553     SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
27554                               DAG.getConstant(0, DL, VT), N0);
27555     SDValue Ops[] = {N0, Neg, DAG.getTargetConstant(X86::COND_GE, DL, MVT::i8),
27556                      SDValue(Neg.getNode(), 1)};
27557     return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
27558   }
27559 
27560   // ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X).
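  // BLENDV selects per element based on the sign bit of the mask (here X
  // itself), so negative elements take 0-X and non-negative elements pass
  // through unchanged.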
27561   if ((VT == MVT::v2i64 || VT == MVT::v4i64) && Subtarget.hasSSE41()) {
27562     SDLoc DL(Op);
27563     SDValue Src = Op.getOperand(0);
27564     SDValue Sub =
27565         DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Src);
27566     return DAG.getNode(X86ISD::BLENDV, DL, VT, Src, Sub, Src);
27567   }
27568 
27569   if (VT.is256BitVector() && !Subtarget.hasInt256()) {
27570     assert(VT.isInteger() &&
27571            "Only handle AVX 256-bit vector integer operation");
27572     return splitVectorIntUnary(Op, DAG);
27573   }
27574 
27575   if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
27576     return splitVectorIntUnary(Op, DAG);
27577 
27578   // Default to expand.
27579   return SDValue();
27580 }
27581 
27582 static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
27583   MVT VT = Op.getSimpleValueType();
27584 
27585   // For AVX1 cases, split to use legal ops (everything but v4i64).
27586   if (VT.getScalarType() != MVT::i64 && VT.is256BitVector())
27587     return splitVectorIntBinary(Op, DAG);
27588 
27589   if (VT == MVT::v32i16 || VT == MVT::v64i8)
27590     return splitVectorIntBinary(Op, DAG);
27591 
27592   // Default to expand.
27593   return SDValue();
27594 }
27595 
27596 static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
27597                         SelectionDAG &DAG) {
27598   SDLoc dl(Op);
27599   MVT VT = Op.getSimpleValueType();
27600 
27601   // Decompose 256-bit ops into 128-bit ops.
27602   if (VT.is256BitVector() && !Subtarget.hasInt256())
27603     return splitVectorIntBinary(Op, DAG);
27604 
27605   if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
27606     return splitVectorIntBinary(Op, DAG);
27607 
27608   SDValue A = Op.getOperand(0);
27609   SDValue B = Op.getOperand(1);
27610 
27611   // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
27612   // vector pairs, multiply and truncate.
27613   if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
27614     unsigned NumElts = VT.getVectorNumElements();
27615 
27616     if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
27617         (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
27618       MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
27619       return DAG.getNode(
27620           ISD::TRUNCATE, dl, VT,
27621           DAG.getNode(ISD::MUL, dl, ExVT,
27622                       DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, A),
27623                       DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, B)));
27624     }
27625 
27626     MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
27627 
27628     // Extract the lo/hi parts and any-extend them to i16.
27629     // We're only going to keep the low byte of each result element of the
27630     // pmullw, so it doesn't matter what's in the high byte of each 16-bit
27631     // input element.
27632     SDValue Undef = DAG.getUNDEF(VT);
27633     SDValue ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Undef));
27634     SDValue AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Undef));
27635 
27636     SDValue BLo, BHi;
27637     if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
27638       // If the RHS is a constant, manually unpackl/unpackh.
27639       SmallVector<SDValue, 16> LoOps, HiOps;
27640       for (unsigned i = 0; i != NumElts; i += 16) {
27641         for (unsigned j = 0; j != 8; ++j) {
27642           LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl,
27643                                                MVT::i16));
27644           HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl,
27645                                                MVT::i16));
27646         }
27647       }
27648 
27649       BLo = DAG.getBuildVector(ExVT, dl, LoOps);
27650       BHi = DAG.getBuildVector(ExVT, dl, HiOps);
27651     } else {
27652       BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef));
27653       BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef));
27654     }
27655 
27656     // Multiply, mask to the lower 8 bits of the lo/hi results and pack.
27657     SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
27658     SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
27659     RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
27660     RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
27661     return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
27662   }
27663 
27664   // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
27665   if (VT == MVT::v4i32) {
27666     assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
27667            "Should not custom lower when pmulld is available!");
27668 
27669     // Extract the odd parts.
27670     static const int UnpackMask[] = { 1, -1, 3, -1 };
27671     SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
27672     SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
27673 
27674     // Multiply the even parts.
27675     SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
27676                                 DAG.getBitcast(MVT::v2i64, A),
27677                                 DAG.getBitcast(MVT::v2i64, B));
27678     // Now multiply odd parts.
27679     SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
27680                                DAG.getBitcast(MVT::v2i64, Aodds),
27681                                DAG.getBitcast(MVT::v2i64, Bodds));
27682 
27683     Evens = DAG.getBitcast(VT, Evens);
27684     Odds = DAG.getBitcast(VT, Odds);
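    // After the bitcasts, lanes 0 and 2 of Evens hold the low 32 bits of a*e
    // and c*g, and lanes 0 and 2 of Odds hold b*f and d*h; the {0,4,2,6}
    // shuffle below interleaves them back into <ae|bf|cg|dh>.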
27685 
27686     // Merge the two vectors back together with a shuffle. This expands into 2
27687     // shuffles.
27688     static const int ShufMask[] = { 0, 4, 2, 6 };
27689     return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
27690   }
27691 
27692   assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
27693          "Only know how to lower V2I64/V4I64/V8I64 multiply");
27694   assert(!Subtarget.hasDQI() && "DQI should use MULLQ");
27695 
27696   //  Ahi = psrlqi(a, 32);
27697   //  Bhi = psrlqi(b, 32);
27698   //
27699   //  AloBlo = pmuludq(a, b);
27700   //  AloBhi = pmuludq(a, Bhi);
27701   //  AhiBlo = pmuludq(Ahi, b);
27702   //
27703   //  Hi = psllqi(AloBhi + AhiBlo, 32);
27704   //  return AloBlo + Hi;
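  // This is the usual decomposition: writing a = 2^32*ahi + alo and
  // b = 2^32*bhi + blo, a*b mod 2^64 == alo*blo + ((alo*bhi + ahi*blo) << 32);
  // the ahi*bhi term lands entirely above bit 63 and is dropped.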
27705   KnownBits AKnown = DAG.computeKnownBits(A);
27706   KnownBits BKnown = DAG.computeKnownBits(B);
27707 
27708   APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
27709   bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero);
27710   bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero);
27711 
27712   APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
27713   bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
27714   bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);
27715 
27716   SDValue Zero = DAG.getConstant(0, dl, VT);
27717 
27718   // Only multiply lo/hi halves that aren't known to be zero.
27719   SDValue AloBlo = Zero;
27720   if (!ALoIsZero && !BLoIsZero)
27721     AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
27722 
27723   SDValue AloBhi = Zero;
27724   if (!ALoIsZero && !BHiIsZero) {
27725     SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
27726     AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
27727   }
27728 
27729   SDValue AhiBlo = Zero;
27730   if (!AHiIsZero && !BLoIsZero) {
27731     SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
27732     AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
27733   }
27734 
27735   SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
27736   Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
27737 
27738   return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
27739 }
27740 
27741 static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl,
27742                                      MVT VT, bool IsSigned,
27743                                      const X86Subtarget &Subtarget,
27744                                      SelectionDAG &DAG,
27745                                      SDValue *Low = nullptr) {
27746   unsigned NumElts = VT.getVectorNumElements();
27747 
27748   // For vXi8 we will unpack the low and high half of each 128 bit lane to widen
27749   // to a vXi16 type. Do the multiplies, shift the results and pack the half
27750   // lane results back together.
27751 
27752   // We'll take different approaches for signed and unsigned.
27753   // For unsigned we'll use punpcklbw/punpckhbw to zero extend the bytes to
27754   // words and use pmullw to calculate the full 16-bit product.
27755   // For signed we'll use punpcklbw/punpckhbw to extend the bytes to words and
27756   // shift them left into the upper byte of each word. This allows us to use
27757   // pmulhw to calculate the full 16-bit product. This trick means we don't
27758   // need to sign extend the bytes to use pmullw.
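  // The signed trick works because (a << 8) * (b << 8) == (a * b) << 16, so
  // pmulhw on the byte-in-high-half words returns exactly the 16-bit product
  // a * b (which always fits in 16 bits for two signed bytes).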
27759 
27760   MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
27761   SDValue Zero = DAG.getConstant(0, dl, VT);
27762 
27763   SDValue ALo, AHi;
27764   if (IsSigned) {
27765     ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, A));
27766     AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A));
27767   } else {
27768     ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero));
27769     AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero));
27770   }
27771 
27772   SDValue BLo, BHi;
27773   if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
27774     // If the RHS is a constant, manually unpackl/unpackh and extend.
27775     SmallVector<SDValue, 16> LoOps, HiOps;
27776     for (unsigned i = 0; i != NumElts; i += 16) {
27777       for (unsigned j = 0; j != 8; ++j) {
27778         SDValue LoOp = B.getOperand(i + j);
27779         SDValue HiOp = B.getOperand(i + j + 8);
27780 
27781         if (IsSigned) {
27782           LoOp = DAG.getAnyExtOrTrunc(LoOp, dl, MVT::i16);
27783           HiOp = DAG.getAnyExtOrTrunc(HiOp, dl, MVT::i16);
27784           LoOp = DAG.getNode(ISD::SHL, dl, MVT::i16, LoOp,
27785                              DAG.getConstant(8, dl, MVT::i16));
27786           HiOp = DAG.getNode(ISD::SHL, dl, MVT::i16, HiOp,
27787                              DAG.getConstant(8, dl, MVT::i16));
27788         } else {
27789           LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
27790           HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
27791         }
27792 
27793         LoOps.push_back(LoOp);
27794         HiOps.push_back(HiOp);
27795       }
27796     }
27797 
27798     BLo = DAG.getBuildVector(ExVT, dl, LoOps);
27799     BHi = DAG.getBuildVector(ExVT, dl, HiOps);
27800   } else if (IsSigned) {
27801     BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, B));
27802     BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, B));
27803   } else {
27804     BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Zero));
27805     BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Zero));
27806   }
27807 
27808   // Multiply, lshr the upper 8 bits down into the lower 8 bits of the lo/hi
27809   // results and pack back to vXi8.
27810   unsigned MulOpc = IsSigned ? ISD::MULHS : ISD::MUL;
27811   SDValue RLo = DAG.getNode(MulOpc, dl, ExVT, ALo, BLo);
27812   SDValue RHi = DAG.getNode(MulOpc, dl, ExVT, AHi, BHi);
27813 
27814   if (Low) {
27815     // Mask the lower bits and pack the results to rejoin the halves.
27816     SDValue Mask = DAG.getConstant(255, dl, ExVT);
27817     SDValue LLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, Mask);
27818     SDValue LHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, Mask);
27819     *Low = DAG.getNode(X86ISD::PACKUS, dl, VT, LLo, LHi);
27820   }
27821 
27822   RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, RLo, 8, DAG);
27823   RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, RHi, 8, DAG);
27824 
27825   // Bitcast back to VT and then pack all the even elements from Lo and Hi.
27826   return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
27827 }
27828 
27829 static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
27830                          SelectionDAG &DAG) {
27831   SDLoc dl(Op);
27832   MVT VT = Op.getSimpleValueType();
27833   bool IsSigned = Op->getOpcode() == ISD::MULHS;
27834   unsigned NumElts = VT.getVectorNumElements();
27835   SDValue A = Op.getOperand(0);
27836   SDValue B = Op.getOperand(1);
27837 
27838   // Decompose 256-bit ops into 128-bit ops.
27839   if (VT.is256BitVector() && !Subtarget.hasInt256())
27840     return splitVectorIntBinary(Op, DAG);
27841 
27842   if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
27843     return splitVectorIntBinary(Op, DAG);
27844 
27845   if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
27846     assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
27847            (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
27848            (VT == MVT::v16i32 && Subtarget.hasAVX512()));
27849 
27850     // PMULxD operations multiply each even value (starting at 0) of LHS with
27851     // the corresponding value of RHS and produce a widened result.
27852     // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
27853     // => <2 x i64> <ae|cg>
27854     //
27855     // In other words, to have all the results, we need to perform two PMULxD:
27856     // 1. one with the even values.
27857     // 2. one with the odd values.
27858     // To achieve #2, we need to place the odd values at an even position.
27859     //
27860     // Place the odd value at an even position (basically, shift all values 1
27861     // step to the left):
27862     const int Mask[] = {1, -1,  3, -1,  5, -1,  7, -1,
27863                         9, -1, 11, -1, 13, -1, 15, -1};
27864     // <a|b|c|d> => <b|undef|d|undef>
27865     SDValue Odd0 = DAG.getVectorShuffle(VT, dl, A, A,
27866                                         makeArrayRef(&Mask[0], NumElts));
27867     // <e|f|g|h> => <f|undef|h|undef>
27868     SDValue Odd1 = DAG.getVectorShuffle(VT, dl, B, B,
27869                                         makeArrayRef(&Mask[0], NumElts));
27870 
27871     // Emit two multiplies, one for the lower 2 ints and one for the higher 2
27872     // ints.
27873     MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
27874     unsigned Opcode =
27875         (IsSigned && Subtarget.hasSSE41()) ? X86ISD::PMULDQ : X86ISD::PMULUDQ;
27876     // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
27877     // => <2 x i64> <ae|cg>
27878     SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
27879                                                   DAG.getBitcast(MulVT, A),
27880                                                   DAG.getBitcast(MulVT, B)));
27881     // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
27882     // => <2 x i64> <bf|dh>
27883     SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
27884                                                   DAG.getBitcast(MulVT, Odd0),
27885                                                   DAG.getBitcast(MulVT, Odd1)));
27886 
27887     // Shuffle it back into the right order.
27888     SmallVector<int, 16> ShufMask(NumElts);
27889     for (int i = 0; i != (int)NumElts; ++i)
27890       ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
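    // E.g. for v4i32 this mask is {1, 5, 3, 7}, picking the high halves
    // hi(a*e), hi(b*f), hi(c*g), hi(d*h) in order.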
27891 
27892     SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask);
27893 
27894     // If we have a signed multiply but no PMULDQ fix up the result of an
27895     // unsigned multiply.
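    // This uses the identity
    //   mulhs(a, b) == mulhu(a, b) - (a < 0 ? b : 0) - (b < 0 ? a : 0).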
27896     if (IsSigned && !Subtarget.hasSSE41()) {
27897       SDValue Zero = DAG.getConstant(0, dl, VT);
27898       SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
27899                                DAG.getSetCC(dl, VT, Zero, A, ISD::SETGT), B);
27900       SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
27901                                DAG.getSetCC(dl, VT, Zero, B, ISD::SETGT), A);
27902 
27903       SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
27904       Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup);
27905     }
27906 
27907     return Res;
27908   }
27909 
27910   // Only i8 vectors should need custom lowering after this.
27911   assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
27912          (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
27913          "Unsupported vector type");
27914 
27915   // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
27916   // logical shift down the upper half and pack back to i8.
27917 
27918   // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
27919   // and then ashr/lshr the upper bits down to the lower bits before multiply.
27920 
27921   if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
27922       (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
27923     MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
27924     unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
27925     SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
27926     SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
27927     SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
27928     Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
27929     return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
27930   }
27931 
27932   return LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG);
27933 }
27934 
27935 // Custom lowering for SMULO/UMULO.
27936 static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget,
27937                          SelectionDAG &DAG) {
27938   MVT VT = Op.getSimpleValueType();
27939 
27940   // Scalars defer to LowerXALUO.
27941   if (!VT.isVector())
27942     return LowerXALUO(Op, DAG);
27943 
27944   SDLoc dl(Op);
27945   bool IsSigned = Op->getOpcode() == ISD::SMULO;
27946   SDValue A = Op.getOperand(0);
27947   SDValue B = Op.getOperand(1);
27948   EVT OvfVT = Op->getValueType(1);
27949 
27950   if ((VT == MVT::v32i8 && !Subtarget.hasInt256()) ||
27951       (VT == MVT::v64i8 && !Subtarget.hasBWI())) {
27952     // Extract the LHS Lo/Hi vectors
27953     SDValue LHSLo, LHSHi;
27954     std::tie(LHSLo, LHSHi) = splitVector(A, DAG, dl);
27955 
27956     // Extract the RHS Lo/Hi vectors
27957     SDValue RHSLo, RHSHi;
27958     std::tie(RHSLo, RHSHi) = splitVector(B, DAG, dl);
27959 
27960     EVT LoOvfVT, HiOvfVT;
27961     std::tie(LoOvfVT, HiOvfVT) = DAG.GetSplitDestVTs(OvfVT);
27962     SDVTList LoVTs = DAG.getVTList(LHSLo.getValueType(), LoOvfVT);
27963     SDVTList HiVTs = DAG.getVTList(LHSHi.getValueType(), HiOvfVT);
27964 
27965     // Issue the split operations.
27966     SDValue Lo = DAG.getNode(Op.getOpcode(), dl, LoVTs, LHSLo, RHSLo);
27967     SDValue Hi = DAG.getNode(Op.getOpcode(), dl, HiVTs, LHSHi, RHSHi);
27968 
27969     // Join the separate data results and the overflow results.
27970     SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
27971     SDValue Ovf = DAG.getNode(ISD::CONCAT_VECTORS, dl, OvfVT, Lo.getValue(1),
27972                               Hi.getValue(1));
27973 
27974     return DAG.getMergeValues({Res, Ovf}, dl);
27975   }
27976 
27977   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27978   EVT SetccVT =
27979       TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
27980 
27981   if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
27982       (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
27983     unsigned NumElts = VT.getVectorNumElements();
27984     MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
27985     unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
27986     SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
27987     SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
27988     SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
27989 
27990     SDValue Low = DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
27991 
27992     SDValue Ovf;
27993     if (IsSigned) {
27994       SDValue High, LowSign;
27995       if (OvfVT.getVectorElementType() == MVT::i1 &&
27996           (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
27997         // Rather than truncating, try to do the compare on vXi16 or vXi32.
27998         // Shift the high down filling with sign bits.
27999         High = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Mul, 8, DAG);
28000         // Fill all 16 bits with the sign bit from the low.
28001         LowSign =
28002             getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExVT, Mul, 8, DAG);
28003         LowSign = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, LowSign,
28004                                              15, DAG);
28005         SetccVT = OvfVT;
28006         if (!Subtarget.hasBWI()) {
28007           // We can't do a vXi16 compare so sign extend to v16i32.
28008           High = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, High);
28009           LowSign = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, LowSign);
28010         }
28011       } else {
28012         // Otherwise do the compare at vXi8.
28013         High = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
28014         High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
28015         LowSign =
28016             DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
28017       }
28018 
28019       Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
28020     } else {
28021       SDValue High =
28022           getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
28023       if (OvfVT.getVectorElementType() == MVT::i1 &&
28024           (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
28025         // Rather than truncating, try to do the compare on vXi16 or vXi32.
28026         SetccVT = OvfVT;
28027         if (!Subtarget.hasBWI()) {
28028           // We can't do a vXi16 compare so sign extend to v16i32.
28029           High = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, High);
28030         }
28031       } else {
28032         // Otherwise do the compare at vXi8.
28033         High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
28034       }
28035 
28036       Ovf =
28037           DAG.getSetCC(dl, SetccVT, High,
28038                        DAG.getConstant(0, dl, High.getValueType()), ISD::SETNE);
28039     }
28040 
28041     Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
28042 
28043     return DAG.getMergeValues({Low, Ovf}, dl);
28044   }
28045 
28046   SDValue Low;
28047   SDValue High =
28048       LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG, &Low);
28049 
28050   SDValue Ovf;
28051   if (IsSigned) {
28052     // SMULO overflows if the high bits don't match the sign of the low.
28053     SDValue LowSign =
28054         DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
28055     Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
28056   } else {
28057     // UMULO overflows if the high bits are non-zero.
28058     Ovf =
28059         DAG.getSetCC(dl, SetccVT, High, DAG.getConstant(0, dl, VT), ISD::SETNE);
28060   }
28061 
28062   Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
28063 
28064   return DAG.getMergeValues({Low, Ovf}, dl);
28065 }
28066 
28067 SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
28068   assert(Subtarget.isTargetWin64() && "Unexpected target");
28069   EVT VT = Op.getValueType();
28070   assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
28071          "Unexpected return type for lowering");
28072 
28073   RTLIB::Libcall LC;
28074   bool isSigned;
28075   switch (Op->getOpcode()) {
28076   default: llvm_unreachable("Unexpected request for libcall!");
28077   case ISD::SDIV:      isSigned = true;  LC = RTLIB::SDIV_I128;    break;
28078   case ISD::UDIV:      isSigned = false; LC = RTLIB::UDIV_I128;    break;
28079   case ISD::SREM:      isSigned = true;  LC = RTLIB::SREM_I128;    break;
28080   case ISD::UREM:      isSigned = false; LC = RTLIB::UREM_I128;    break;
28081   }
28082 
28083   SDLoc dl(Op);
28084   SDValue InChain = DAG.getEntryNode();
28085 
28086   TargetLowering::ArgListTy Args;
28087   TargetLowering::ArgListEntry Entry;
28088   for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
28089     EVT ArgVT = Op->getOperand(i).getValueType();
28090     assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
28091            "Unexpected argument type for lowering");
28092     SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
28093     int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
28094     MachinePointerInfo MPI =
28095         MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
28096     Entry.Node = StackPtr;
28097     InChain =
28098         DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MPI, Align(16));
28099     Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
28100     Entry.Ty = PointerType::get(ArgTy, 0);
28101     Entry.IsSExt = false;
28102     Entry.IsZExt = false;
28103     Args.push_back(Entry);
28104   }
28105 
28106   SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
28107                                          getPointerTy(DAG.getDataLayout()));
28108 
28109   TargetLowering::CallLoweringInfo CLI(DAG);
28110   CLI.setDebugLoc(dl)
28111       .setChain(InChain)
28112       .setLibCallee(
28113           getLibcallCallingConv(LC),
28114           static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
28115           std::move(Args))
28116       .setInRegister()
28117       .setSExtResult(isSigned)
28118       .setZExtResult(!isSigned);
28119 
28120   std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
28121   return DAG.getBitcast(VT, CallInfo.first);
28122 }
28123 
28124 // Return true if the required (according to Opcode) shift-imm form is natively
28125 // supported by the Subtarget
28126 static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
28127                                         unsigned Opcode) {
28128   if (VT.getScalarSizeInBits() < 16)
28129     return false;
28130 
28131   if (VT.is512BitVector() && Subtarget.hasAVX512() &&
28132       (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
28133     return true;
28134 
28135   bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
28136                 (VT.is256BitVector() && Subtarget.hasInt256());
28137 
28138   bool AShift = LShift && (Subtarget.hasAVX512() ||
28139                            (VT != MVT::v2i64 && VT != MVT::v4i64));
28140   return (Opcode == ISD::SRA) ? AShift : LShift;
28141 }
28142 
28143 // The shift amount is a variable, but it is the same for all vector lanes.
28144 // These instructions are defined together with shift-immediate.
28145 static
28146 bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
28147                                       unsigned Opcode) {
28148   return SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
28149 }
28150 
28151 // Return true if the required (according to Opcode) variable-shift form is
28152 // natively supported by the Subtarget
28153 static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
28154                                     unsigned Opcode) {
28155 
28156   if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
28157     return false;
28158 
28159   // vXi16 supported only on AVX-512, BWI
28160   if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
28161     return false;
28162 
28163   if (Subtarget.hasAVX512())
28164     return true;
28165 
28166   bool LShift = VT.is128BitVector() || VT.is256BitVector();
28167   bool AShift = LShift &&  VT != MVT::v2i64 && VT != MVT::v4i64;
28168   return (Opcode == ISD::SRA) ? AShift : LShift;
28169 }
28170 
28171 static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
28172                                          const X86Subtarget &Subtarget) {
28173   MVT VT = Op.getSimpleValueType();
28174   SDLoc dl(Op);
28175   SDValue R = Op.getOperand(0);
28176   SDValue Amt = Op.getOperand(1);
28177   unsigned X86Opc = getTargetVShiftUniformOpcode(Op.getOpcode(), false);
28178 
28179   auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
28180     assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
28181     MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
28182     SDValue Ex = DAG.getBitcast(ExVT, R);
28183 
28184     // ashr(R, 63) === cmp_slt(R, 0)
28185     if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
28186       assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
28187              "Unsupported PCMPGT op");
28188       return DAG.getNode(X86ISD::PCMPGT, dl, VT, DAG.getConstant(0, dl, VT), R);
28189     }
28190 
28191     if (ShiftAmt >= 32) {
28192       // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
28193       SDValue Upper =
28194           getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
28195       SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
28196                                                  ShiftAmt - 32, DAG);
28197       if (VT == MVT::v2i64)
28198         Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
28199       if (VT == MVT::v4i64)
28200         Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
28201                                   {9, 1, 11, 3, 13, 5, 15, 7});
28202     } else {
28203       // SRA upper i32, SRL whole i64 and select lower i32.
28204       SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
28205                                                  ShiftAmt, DAG);
28206       SDValue Lower =
28207           getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
28208       Lower = DAG.getBitcast(ExVT, Lower);
28209       if (VT == MVT::v2i64)
28210         Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
28211       if (VT == MVT::v4i64)
28212         Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
28213                                   {8, 1, 10, 3, 12, 5, 14, 7});
28214     }
28215     return DAG.getBitcast(VT, Ex);
28216   };
28217 
28218   // Optimize shl/srl/sra with constant shift amount.
28219   APInt APIntShiftAmt;
28220   if (!X86::isConstantSplat(Amt, APIntShiftAmt))
28221     return SDValue();
28222 
28223   // If the shift amount is out of range, return undef.
28224   if (APIntShiftAmt.uge(VT.getScalarSizeInBits()))
28225     return DAG.getUNDEF(VT);
28226 
28227   uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
28228 
28229   if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
28230     return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
28231 
28232   // i64 SRA needs to be performed as partial shifts.
28233   if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
28234        (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
28235       Op.getOpcode() == ISD::SRA)
28236     return ArithmeticShiftRight64(ShiftAmt);
28237 
28238   if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) ||
28239       (Subtarget.hasBWI() && VT == MVT::v64i8)) {
28240     unsigned NumElts = VT.getVectorNumElements();
28241     MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
28242 
28243     // Simple i8 add case
28244     if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
28245       return DAG.getNode(ISD::ADD, dl, VT, R, R);
28246 
28247     // ashr(R, 7)  === cmp_slt(R, 0)
28248     if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
28249       SDValue Zeros = DAG.getConstant(0, dl, VT);
28250       if (VT.is512BitVector()) {
28251         assert(VT == MVT::v64i8 && "Unexpected element type!");
28252         SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R, ISD::SETGT);
28253         return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
28254       }
28255       return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
28256     }
28257 
28258     // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
28259     if (VT == MVT::v16i8 && Subtarget.hasXOP())
28260       return SDValue();
28261 
28262     if (Op.getOpcode() == ISD::SHL) {
28263       // Make a large shift.
28264       SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R,
28265                                                ShiftAmt, DAG);
28266       SHL = DAG.getBitcast(VT, SHL);
28267       // Zero out the rightmost bits.
28268       APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt);
28269       return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(Mask, dl, VT));
28270     }
28271     if (Op.getOpcode() == ISD::SRL) {
28272       // Make a large shift.
28273       SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R,
28274                                                ShiftAmt, DAG);
28275       SRL = DAG.getBitcast(VT, SRL);
28276       // Zero out the leftmost bits.
28277       APInt Mask = APInt::getLowBitsSet(8, 8 - ShiftAmt);
28278       return DAG.getNode(ISD::AND, dl, VT, SRL, DAG.getConstant(Mask, dl, VT));
28279     }
28280     if (Op.getOpcode() == ISD::SRA) {
28281       // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
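      // With Mask == 1 << (7 - Amt), the logical shift leaves the original
      // sign bit at that position; (x ^ Mask) - Mask then sign-extends it
      // through the upper bits.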
28282       SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
28283 
28284       SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
28285       Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
28286       Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
28287       return Res;
28288     }
28289     llvm_unreachable("Unknown shift opcode.");
28290   }
28291 
28292   return SDValue();
28293 }
28294 
28295 static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
28296                                         const X86Subtarget &Subtarget) {
28297   MVT VT = Op.getSimpleValueType();
28298   SDLoc dl(Op);
28299   SDValue R = Op.getOperand(0);
28300   SDValue Amt = Op.getOperand(1);
28301   unsigned Opcode = Op.getOpcode();
28302   unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false);
28303   unsigned X86OpcV = getTargetVShiftUniformOpcode(Opcode, true);
28304 
28305   if (SDValue BaseShAmt = DAG.getSplatValue(Amt)) {
28306     if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode)) {
28307       MVT EltVT = VT.getVectorElementType();
28308       assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
28309       if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
28310         BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
28311       else if (EltVT.bitsLT(MVT::i32))
28312         BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
28313 
28314       return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, Subtarget, DAG);
28315     }
28316 
28317     // vXi8 shifts - shift as v8i16 + mask result.
28318     if (((VT == MVT::v16i8 && !Subtarget.canExtendTo512DQ()) ||
28319          (VT == MVT::v32i8 && !Subtarget.canExtendTo512BW()) ||
28320          VT == MVT::v64i8) &&
28321         !Subtarget.hasXOP()) {
28322       unsigned NumElts = VT.getVectorNumElements();
28323       MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
28324       if (SupportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) {
28325         unsigned LogicalOp = (Opcode == ISD::SHL ? ISD::SHL : ISD::SRL);
28326         unsigned LogicalX86Op = getTargetVShiftUniformOpcode(LogicalOp, false);
28327         BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
28328 
28329         // Create the mask using vXi16 shifts. For shift-rights we need to move
28330         // the upper byte down before splatting the vXi8 mask.
28331         SDValue BitMask = DAG.getConstant(-1, dl, ExtVT);
28332         BitMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, BitMask,
28333                                       BaseShAmt, Subtarget, DAG);
28334         if (Opcode != ISD::SHL)
28335           BitMask = getTargetVShiftByConstNode(LogicalX86Op, dl, ExtVT, BitMask,
28336                                                8, DAG);
28337         BitMask = DAG.getBitcast(VT, BitMask);
28338         BitMask = DAG.getVectorShuffle(VT, dl, BitMask, BitMask,
28339                                        SmallVector<int, 64>(NumElts, 0));
28340 
28341         SDValue Res = getTargetVShiftNode(LogicalX86Op, dl, ExtVT,
28342                                           DAG.getBitcast(ExtVT, R), BaseShAmt,
28343                                           Subtarget, DAG);
28344         Res = DAG.getBitcast(VT, Res);
28345         Res = DAG.getNode(ISD::AND, dl, VT, Res, BitMask);
28346 
28347         if (Opcode == ISD::SRA) {
28348           // ashr(R, Amt) === sub(xor(lshr(R, Amt), SignMask), SignMask)
28349           // SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW.
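          // Illustrative example: for a uniform shift by 3 each i16 lane is
          // 0x8080 >> 3 == 0x1010, i.e. 0x10 per byte - the sign bit in its
          // shifted position, which the xor/sub below extends upwards.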
28350           SDValue SignMask = DAG.getConstant(0x8080, dl, ExtVT);
28351           SignMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask,
28352                                          BaseShAmt, Subtarget, DAG);
28353           SignMask = DAG.getBitcast(VT, SignMask);
28354           Res = DAG.getNode(ISD::XOR, dl, VT, Res, SignMask);
28355           Res = DAG.getNode(ISD::SUB, dl, VT, Res, SignMask);
28356         }
28357         return Res;
28358       }
28359     }
28360   }
28361 
28362   // Check cases (mainly 32-bit) where i64 is expanded into high and low parts.
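  // e.g. (illustrative) on 32-bit targets a splatted i64 amount is often a
  // v4i32 build_vector <Lo, Hi, Lo, Hi> bitcast to v2i64; the loop below
  // checks that every group of Ratio operands repeats the first group.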
28363   if (VT == MVT::v2i64 && Amt.getOpcode() == ISD::BITCAST &&
28364       Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
28365     Amt = Amt.getOperand(0);
28366     unsigned Ratio = 64 / Amt.getScalarValueSizeInBits();
28367     std::vector<SDValue> Vals(Ratio);
28368     for (unsigned i = 0; i != Ratio; ++i)
28369       Vals[i] = Amt.getOperand(i);
28370     for (unsigned i = Ratio, e = Amt.getNumOperands(); i != e; i += Ratio) {
28371       for (unsigned j = 0; j != Ratio; ++j)
28372         if (Vals[j] != Amt.getOperand(i + j))
28373           return SDValue();
28374     }
28375 
28376     if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
28377       return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
28378   }
28379   return SDValue();
28380 }
28381 
28382 // Convert a shift/rotate left amount to a multiplication scale factor.
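// Illustrative example: a v4i32 amount of <1, 2, 3, 4> becomes the scale
// vector <2, 4, 8, 16>, so that shl(X, Amt) can be lowered as
// mul(X, 1 << Amt) instead.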
28383 static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
28384                                        const X86Subtarget &Subtarget,
28385                                        SelectionDAG &DAG) {
28386   MVT VT = Amt.getSimpleValueType();
28387   if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
28388         (Subtarget.hasInt256() && VT == MVT::v16i16) ||
28389         (Subtarget.hasVBMI2() && VT == MVT::v32i16) ||
28390         (!Subtarget.hasAVX512() && VT == MVT::v16i8)))
28391     return SDValue();
28392 
28393   if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
28394     SmallVector<SDValue, 8> Elts;
28395     MVT SVT = VT.getVectorElementType();
28396     unsigned SVTBits = SVT.getSizeInBits();
28397     APInt One(SVTBits, 1);
28398     unsigned NumElems = VT.getVectorNumElements();
28399 
28400     for (unsigned i = 0; i != NumElems; ++i) {
28401       SDValue Op = Amt->getOperand(i);
28402       if (Op->isUndef()) {
28403         Elts.push_back(Op);
28404         continue;
28405       }
28406 
28407       ConstantSDNode *ND = cast<ConstantSDNode>(Op);
28408       APInt C(SVTBits, ND->getZExtValue());
28409       uint64_t ShAmt = C.getZExtValue();
28410       if (ShAmt >= SVTBits) {
28411         Elts.push_back(DAG.getUNDEF(SVT));
28412         continue;
28413       }
28414       Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
28415     }
28416     return DAG.getBuildVector(VT, dl, Elts);
28417   }
28418 
28419   // If the target doesn't support variable shifts, use either FP conversion
28420   // or integer multiplication to avoid shifting each element individually.
28421   if (VT == MVT::v4i32) {
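    // Shifting the amount into the exponent field and rebiasing by 1.0f
    // (0x3f800000) builds the float 2^Amt directly; e.g. (illustrative)
    // Amt == 5 gives (5 << 23) + 0x3f800000 == 0x42000000 == 32.0f, which
    // FP_TO_SINT turns back into the integer scale 1 << 5.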
28422     Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
28423     Amt = DAG.getNode(ISD::ADD, dl, VT, Amt,
28424                       DAG.getConstant(0x3f800000U, dl, VT));
28425     Amt = DAG.getBitcast(MVT::v4f32, Amt);
28426     return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt);
28427   }
28428 
28429   // AVX2 can more effectively perform this as a zext/trunc to/from v8i32.
28430   if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
28431     SDValue Z = DAG.getConstant(0, dl, VT);
28432     SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z));
28433     SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z));
28434     Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG);
28435     Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
28436     if (Subtarget.hasSSE41())
28437       return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
28438 
28439     return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, Lo),
28440                                         DAG.getBitcast(VT, Hi),
28441                                         {0, 2, 4, 6, 8, 10, 12, 14});
28442   }
28443 
28444   return SDValue();
28445 }
28446 
28447 static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
28448                           SelectionDAG &DAG) {
28449   MVT VT = Op.getSimpleValueType();
28450   SDLoc dl(Op);
28451   SDValue R = Op.getOperand(0);
28452   SDValue Amt = Op.getOperand(1);
28453   unsigned EltSizeInBits = VT.getScalarSizeInBits();
28454   bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
28455 
28456   unsigned Opc = Op.getOpcode();
28457   unsigned X86OpcV = getTargetVShiftUniformOpcode(Opc, true);
28458   unsigned X86OpcI = getTargetVShiftUniformOpcode(Opc, false);
28459 
28460   assert(VT.isVector() && "Custom lowering only for vector shifts!");
28461   assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
28462 
28463   if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
28464     return V;
28465 
28466   if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
28467     return V;
28468 
28469   if (SupportedVectorVarShift(VT, Subtarget, Opc))
28470     return Op;
28471 
28472   // XOP has 128-bit variable logical/arithmetic shifts.
28473   // +ve/-ve Amt = shift left/right.
28474   if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||
28475                              VT == MVT::v8i16 || VT == MVT::v16i8)) {
28476     if (Opc == ISD::SRL || Opc == ISD::SRA) {
28477       SDValue Zero = DAG.getConstant(0, dl, VT);
28478       Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
28479     }
28480     if (Opc == ISD::SHL || Opc == ISD::SRL)
28481       return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
28482     if (Opc == ISD::SRA)
28483       return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
28484   }
28485 
28486   // v2i64 vector logical shifts can efficiently avoid scalarization - do the
28487   // shifts per-lane and then shuffle the partial results back together.
28488   if (VT == MVT::v2i64 && Opc != ISD::SRA) {
28489     // Splat the shift amounts so the scalar shifts above will catch it.
28490     SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
28491     SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
28492     SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
28493     SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
28494     return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
28495   }
28496 
28497   // i64 vector arithmetic shift can be emulated with the transform:
28498   // M = lshr(SIGN_MASK, Amt)
28499   // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
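  // Illustrative example: for a lane with Amt == 4, M == SIGN_MASK >> 4 and
  // the xor/sub pair re-extends the shifted-down sign bit (now at position
  // 59) through the top four bits of the logically shifted value.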
28500   if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
28501       Opc == ISD::SRA) {
28502     SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
28503     SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
28504     R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
28505     R = DAG.getNode(ISD::XOR, dl, VT, R, M);
28506     R = DAG.getNode(ISD::SUB, dl, VT, R, M);
28507     return R;
28508   }
28509 
28510   // If possible, lower this shift as a sequence of two shifts by
28511   // constant plus a BLENDing shuffle instead of scalarizing it.
28512   // Example:
28513   //   (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
28514   //
28515   // Could be rewritten as:
28516   //   (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
28517   //
28518   // The advantage is that the two shifts from the example would be
28519   // lowered as X86ISD::VSRLI nodes in parallel before blending.
28520   if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
28521                       (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
28522     SDValue Amt1, Amt2;
28523     unsigned NumElts = VT.getVectorNumElements();
28524     SmallVector<int, 8> ShuffleMask;
28525     for (unsigned i = 0; i != NumElts; ++i) {
28526       SDValue A = Amt->getOperand(i);
28527       if (A.isUndef()) {
28528         ShuffleMask.push_back(SM_SentinelUndef);
28529         continue;
28530       }
28531       if (!Amt1 || Amt1 == A) {
28532         ShuffleMask.push_back(i);
28533         Amt1 = A;
28534         continue;
28535       }
28536       if (!Amt2 || Amt2 == A) {
28537         ShuffleMask.push_back(i + NumElts);
28538         Amt2 = A;
28539         continue;
28540       }
28541       break;
28542     }
28543 
28544     // Only perform this blend if we can perform it without loading a mask.
28545     if (ShuffleMask.size() == NumElts && Amt1 && Amt2 &&
28546         (VT != MVT::v16i16 ||
28547          is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
28548         (VT == MVT::v4i32 || Subtarget.hasSSE41() || Opc != ISD::SHL ||
28549          canWidenShuffleElements(ShuffleMask))) {
28550       auto *Cst1 = dyn_cast<ConstantSDNode>(Amt1);
28551       auto *Cst2 = dyn_cast<ConstantSDNode>(Amt2);
28552       if (Cst1 && Cst2 && Cst1->getAPIntValue().ult(EltSizeInBits) &&
28553           Cst2->getAPIntValue().ult(EltSizeInBits)) {
28554         SDValue Shift1 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
28555                                                     Cst1->getZExtValue(), DAG);
28556         SDValue Shift2 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
28557                                                     Cst2->getZExtValue(), DAG);
28558         return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
28559       }
28560     }
28561   }
28562 
28563   // If possible, lower this packed shift into a vector multiply instead of
28564   // expanding it into a sequence of scalar shifts.
28565   if (Opc == ISD::SHL)
28566     if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
28567       return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
28568 
28569   // Constant ISD::SRL can be performed efficiently on vXi16 vectors as we
28570   // can replace with ISD::MULHU, creating scale factor from (NumEltBits - Amt).
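  // Illustrative example: srl X, 3 on an i16 lane becomes
  // mulhu(X, 1 << (16 - 3)) == (X * 0x2000) >> 16 == X >> 3. A zero amount
  // would need the out-of-range scale 1 << 16, hence the select below that
  // returns R unchanged for those lanes.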
28571   if (Opc == ISD::SRL && ConstantAmt &&
28572       (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
28573     SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
28574     SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
28575     if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
28576       SDValue Zero = DAG.getConstant(0, dl, VT);
28577       SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ);
28578       SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale);
28579       return DAG.getSelect(dl, VT, ZAmt, R, Res);
28580     }
28581   }
28582 
28583   // Constant ISD::SRA can be performed efficiently on vXi16 vectors as we
28584   // can replace with ISD::MULHS, creating scale factor from (NumEltBits - Amt).
28585   // TODO: Special case handling for shift by 0/1, really we can afford either
28586   // of these cases in pre-SSE41/XOP/AVX512 but not both.
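  // Illustrative example: sra X, 3 becomes mulhs(X, 1 << 13). The selects
  // below patch up the awkward amounts: 0 would need the scale 1 << 16, and
  // 1 would need 1 << 15, which is negative as an i16 and flips the sign of
  // the MULHS result, so those lanes use R and sra(R, 1) directly.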
28587   if (Opc == ISD::SRA && ConstantAmt &&
28588       (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256())) &&
28589       ((Subtarget.hasSSE41() && !Subtarget.hasXOP() &&
28590         !Subtarget.hasAVX512()) ||
28591        DAG.isKnownNeverZero(Amt))) {
28592     SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
28593     SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
28594     if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
28595       SDValue Amt0 =
28596           DAG.getSetCC(dl, VT, Amt, DAG.getConstant(0, dl, VT), ISD::SETEQ);
28597       SDValue Amt1 =
28598           DAG.getSetCC(dl, VT, Amt, DAG.getConstant(1, dl, VT), ISD::SETEQ);
28599       SDValue Sra1 =
28600           getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, 1, DAG);
28601       SDValue Res = DAG.getNode(ISD::MULHS, dl, VT, R, Scale);
28602       Res = DAG.getSelect(dl, VT, Amt0, R, Res);
28603       return DAG.getSelect(dl, VT, Amt1, Sra1, Res);
28604     }
28605   }
28606 
28607   // v4i32 Non Uniform Shifts.
28608   // If the shift amount is constant we can shift each lane using the SSE2
28609   // immediate shifts, else we need to zero-extend each lane to the lower i64
28610   // and shift using the SSE2 variable shifts.
28611   // The separate results can then be blended together.
28612   if (VT == MVT::v4i32) {
28613     SDValue Amt0, Amt1, Amt2, Amt3;
28614     if (ConstantAmt) {
28615       Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
28616       Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
28617       Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
28618       Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
28619     } else {
28620       // The SSE2 shifts use the lower i64 as the same shift amount for
28621       // all lanes and the upper i64 is ignored. On AVX we're better off
28622       // just zero-extending, but for SSE just duplicating the top 16-bits is
28623       // cheaper and has the same effect for out of range values.
28624       if (Subtarget.hasAVX()) {
28625         SDValue Z = DAG.getConstant(0, dl, VT);
28626         Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
28627         Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
28628         Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
28629         Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
28630       } else {
28631         SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
28632         SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
28633                                              {4, 5, 6, 7, -1, -1, -1, -1});
28634         Amt0 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
28635                                     {0, 1, 1, 1, -1, -1, -1, -1});
28636         Amt1 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
28637                                     {2, 3, 3, 3, -1, -1, -1, -1});
28638         Amt2 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
28639                                     {0, 1, 1, 1, -1, -1, -1, -1});
28640         Amt3 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
28641                                     {2, 3, 3, 3, -1, -1, -1, -1});
28642       }
28643     }
28644 
28645     unsigned ShOpc = ConstantAmt ? Opc : X86OpcV;
28646     SDValue R0 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt0));
28647     SDValue R1 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt1));
28648     SDValue R2 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt2));
28649     SDValue R3 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt3));
28650 
28651     // Merge the shifted lane results optimally with/without PBLENDW.
28652     // TODO - ideally shuffle combining would handle this.
28653     if (Subtarget.hasSSE41()) {
28654       SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
28655       SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
28656       return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
28657     }
28658     SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
28659     SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
28660     return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
28661   }
28662 
28663   // It's worth extending once and using the vXi16/vXi32 shifts for smaller
28664   // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
28665   // make the existing SSE solution better.
28666   // NOTE: We honor the preferred vector width before promoting to 512-bits.
28667   if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
28668       (Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||
28669       (Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||
28670       (Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||
28671       (Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
28672     assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
28673            "Unexpected vector type");
28674     MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
28675     MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
28676     unsigned ExtOpc = Opc == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
28677     R = DAG.getNode(ExtOpc, dl, ExtVT, R);
28678     Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
28679     return DAG.getNode(ISD::TRUNCATE, dl, VT,
28680                        DAG.getNode(Opc, dl, ExtVT, R, Amt));
28681   }
28682 
28683   // Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
28684   // extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
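  // Illustrative example: an lshr by 3 on a byte lane uses the per-lane
  // multiplier 1 << (8 - 3) == 32, so (zext(X) * 32) >> 8 == X >> 3; for SRA
  // the byte is sign-extended into the i16 lane first so the same multiply
  // and shift-right-by-8 yield the arithmetic result.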
28685   if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) &&
28686       (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
28687        (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
28688       !Subtarget.hasXOP()) {
28689     int NumElts = VT.getVectorNumElements();
28690     SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8);
28691 
28692     // Extend constant shift amount to vXi16 (it doesn't matter if the type
28693     // isn't legal).
28694     MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
28695     Amt = DAG.getZExtOrTrunc(Amt, dl, ExVT);
28696     Amt = DAG.getNode(ISD::SUB, dl, ExVT, DAG.getConstant(8, dl, ExVT), Amt);
28697     Amt = DAG.getNode(ISD::SHL, dl, ExVT, DAG.getConstant(1, dl, ExVT), Amt);
28698     assert(ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()) &&
28699            "Constant build vector expected");
28700 
28701     if (VT == MVT::v16i8 && Subtarget.hasInt256()) {
28702       R = Opc == ISD::SRA ? DAG.getSExtOrTrunc(R, dl, ExVT)
28703                           : DAG.getZExtOrTrunc(R, dl, ExVT);
28704       R = DAG.getNode(ISD::MUL, dl, ExVT, R, Amt);
28705       R = DAG.getNode(X86ISD::VSRLI, dl, ExVT, R, Cst8);
28706       return DAG.getZExtOrTrunc(R, dl, VT);
28707     }
28708 
28709     SmallVector<SDValue, 16> LoAmt, HiAmt;
28710     for (int i = 0; i != NumElts; i += 16) {
28711       for (int j = 0; j != 8; ++j) {
28712         LoAmt.push_back(Amt.getOperand(i + j));
28713         HiAmt.push_back(Amt.getOperand(i + j + 8));
28714       }
28715     }
28716 
28717     MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2);
28718     SDValue LoA = DAG.getBuildVector(VT16, dl, LoAmt);
28719     SDValue HiA = DAG.getBuildVector(VT16, dl, HiAmt);
28720 
28721     SDValue LoR = DAG.getBitcast(VT16, getUnpackl(DAG, dl, VT, R, R));
28722     SDValue HiR = DAG.getBitcast(VT16, getUnpackh(DAG, dl, VT, R, R));
28723     LoR = DAG.getNode(X86OpcI, dl, VT16, LoR, Cst8);
28724     HiR = DAG.getNode(X86OpcI, dl, VT16, HiR, Cst8);
28725     LoR = DAG.getNode(ISD::MUL, dl, VT16, LoR, LoA);
28726     HiR = DAG.getNode(ISD::MUL, dl, VT16, HiR, HiA);
28727     LoR = DAG.getNode(X86ISD::VSRLI, dl, VT16, LoR, Cst8);
28728     HiR = DAG.getNode(X86ISD::VSRLI, dl, VT16, HiR, Cst8);
28729     return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);
28730   }
28731 
28732   if (VT == MVT::v16i8 ||
28733       (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
28734       (VT == MVT::v64i8 && Subtarget.hasBWI())) {
28735     MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
28736 
28737     auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
28738       if (VT.is512BitVector()) {
28739         // On AVX512BW targets we make use of the fact that VSELECT lowers
28740         // to a masked blend which selects bytes based just on the sign bit
28741         // extracted to a mask.
28742         MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
28743         V0 = DAG.getBitcast(VT, V0);
28744         V1 = DAG.getBitcast(VT, V1);
28745         Sel = DAG.getBitcast(VT, Sel);
28746         Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
28747                            ISD::SETGT);
28748         return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
28749       } else if (Subtarget.hasSSE41()) {
28750         // On SSE41 targets we can use PBLENDVB which selects bytes based just
28751         // on the sign bit.
28752         V0 = DAG.getBitcast(VT, V0);
28753         V1 = DAG.getBitcast(VT, V1);
28754         Sel = DAG.getBitcast(VT, Sel);
28755         return DAG.getBitcast(SelVT,
28756                               DAG.getNode(X86ISD::BLENDV, dl, VT, Sel, V0, V1));
28757       }
28758       // On pre-SSE41 targets we test for the sign bit by comparing to
28759       // zero - a negative value will set all bits of the lanes to true
28760       // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
28761       SDValue Z = DAG.getConstant(0, dl, SelVT);
28762       SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
28763       return DAG.getSelect(dl, SelVT, C, V0, V1);
28764     };
28765 
28766     // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
28767     // We can safely do this using i16 shifts as we're only interested in
28768     // the 3 lower bits of each byte.
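    // After the shift by 5, bit 2 of the original amount sits in each byte's
    // sign bit, so the select below applies the shift-by-4 step exactly for
    // those lanes; each subsequent 'a += a' moves the next lower amount bit
    // into the sign position for the shift-by-2 and shift-by-1 steps.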
28769     Amt = DAG.getBitcast(ExtVT, Amt);
28770     Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, Amt, 5, DAG);
28771     Amt = DAG.getBitcast(VT, Amt);
28772 
28773     if (Opc == ISD::SHL || Opc == ISD::SRL) {
28774       // r = VSELECT(r, shift(r, 4), a);
28775       SDValue M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(4, dl, VT));
28776       R = SignBitSelect(VT, Amt, M, R);
28777 
28778       // a += a
28779       Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
28780 
28781       // r = VSELECT(r, shift(r, 2), a);
28782       M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(2, dl, VT));
28783       R = SignBitSelect(VT, Amt, M, R);
28784 
28785       // a += a
28786       Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
28787 
28788       // return VSELECT(r, shift(r, 1), a);
28789       M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(1, dl, VT));
28790       R = SignBitSelect(VT, Amt, M, R);
28791       return R;
28792     }
28793 
28794     if (Opc == ISD::SRA) {
28795       // For SRA we need to unpack each byte to the higher byte of a i16 vector
28796       // so we can correctly sign extend. We don't care what happens to the
28797       // lower byte.
28798       SDValue ALo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
28799       SDValue AHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
28800       SDValue RLo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), R);
28801       SDValue RHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), R);
28802       ALo = DAG.getBitcast(ExtVT, ALo);
28803       AHi = DAG.getBitcast(ExtVT, AHi);
28804       RLo = DAG.getBitcast(ExtVT, RLo);
28805       RHi = DAG.getBitcast(ExtVT, RHi);
28806 
28807       // r = VSELECT(r, shift(r, 4), a);
28808       SDValue MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 4, DAG);
28809       SDValue MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 4, DAG);
28810       RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
28811       RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
28812 
28813       // a += a
28814       ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
28815       AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
28816 
28817       // r = VSELECT(r, shift(r, 2), a);
28818       MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 2, DAG);
28819       MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 2, DAG);
28820       RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
28821       RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
28822 
28823       // a += a
28824       ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
28825       AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
28826 
28827       // r = VSELECT(r, shift(r, 1), a);
28828       MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 1, DAG);
28829       MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 1, DAG);
28830       RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
28831       RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
28832 
28833       // Logical shift the result back to the lower byte, leaving a zero upper
28834       // byte meaning that we can safely pack with PACKUSWB.
28835       RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RLo, 8, DAG);
28836       RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RHi, 8, DAG);
28837       return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
28838     }
28839   }
28840 
28841   if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
28842     MVT ExtVT = MVT::v8i32;
28843     SDValue Z = DAG.getConstant(0, dl, VT);
28844     SDValue ALo = getUnpackl(DAG, dl, VT, Amt, Z);
28845     SDValue AHi = getUnpackh(DAG, dl, VT, Amt, Z);
28846     SDValue RLo = getUnpackl(DAG, dl, VT, Z, R);
28847     SDValue RHi = getUnpackh(DAG, dl, VT, Z, R);
28848     ALo = DAG.getBitcast(ExtVT, ALo);
28849     AHi = DAG.getBitcast(ExtVT, AHi);
28850     RLo = DAG.getBitcast(ExtVT, RLo);
28851     RHi = DAG.getBitcast(ExtVT, RHi);
28852     SDValue Lo = DAG.getNode(Opc, dl, ExtVT, RLo, ALo);
28853     SDValue Hi = DAG.getNode(Opc, dl, ExtVT, RHi, AHi);
28854     Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Lo, 16, DAG);
28855     Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Hi, 16, DAG);
28856     return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
28857   }
28858 
28859   if (VT == MVT::v8i16) {
28860     // If we have a constant shift amount, the non-SSE41 path is best as
28861     // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
28862     bool UseSSE41 = Subtarget.hasSSE41() &&
28863                     !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
28864 
28865     auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
28866       // On SSE41 targets we can use PBLENDVB which selects bytes based just on
28867       // the sign bit.
28868       if (UseSSE41) {
28869         MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
28870         V0 = DAG.getBitcast(ExtVT, V0);
28871         V1 = DAG.getBitcast(ExtVT, V1);
28872         Sel = DAG.getBitcast(ExtVT, Sel);
28873         return DAG.getBitcast(
28874             VT, DAG.getNode(X86ISD::BLENDV, dl, ExtVT, Sel, V0, V1));
28875       }
28876       // On pre-SSE41 targets we splat the sign bit - a negative value will
28877       // set all bits of the lanes to true and VSELECT uses that in
28878       // its OR(AND(V0,C),AND(V1,~C)) lowering.
28879       SDValue C =
28880           getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Sel, 15, DAG);
28881       return DAG.getSelect(dl, VT, C, V0, V1);
28882     };
28883 
28884     // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
28885     if (UseSSE41) {
28886       // On SSE41 targets we need to replicate the shift mask in both
28887       // bytes for PBLENDVB.
28888       Amt = DAG.getNode(
28889           ISD::OR, dl, VT,
28890           getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 4, DAG),
28891           getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG));
28892     } else {
28893       Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG);
28894     }
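    // Only the low 4 bits of each i16 amount matter: after the shift by 12,
    // bit 3 sits in the lane's sign bit and selects the shift-by-8 step, and
    // each 'a += a' below exposes bits 2, 1 and 0 in turn for the 4/2/1
    // steps.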
28895 
28896     // r = VSELECT(r, shift(r, 8), a);
28897     SDValue M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 8, DAG);
28898     R = SignBitSelect(Amt, M, R);
28899 
28900     // a += a
28901     Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
28902 
28903     // r = VSELECT(r, shift(r, 4), a);
28904     M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 4, DAG);
28905     R = SignBitSelect(Amt, M, R);
28906 
28907     // a += a
28908     Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
28909 
28910     // r = VSELECT(r, shift(r, 2), a);
28911     M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 2, DAG);
28912     R = SignBitSelect(Amt, M, R);
28913 
28914     // a += a
28915     Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
28916 
28917     // return VSELECT(r, shift(r, 1), a);
28918     M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 1, DAG);
28919     R = SignBitSelect(Amt, M, R);
28920     return R;
28921   }
28922 
28923   // Decompose 256-bit shifts into 128-bit shifts.
28924   if (VT.is256BitVector())
28925     return splitVectorIntBinary(Op, DAG);
28926 
28927   if (VT == MVT::v32i16 || VT == MVT::v64i8)
28928     return splitVectorIntBinary(Op, DAG);
28929 
28930   return SDValue();
28931 }
28932 
28933 static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
28934                            SelectionDAG &DAG) {
28935   MVT VT = Op.getSimpleValueType();
28936   assert(VT.isVector() && "Custom lowering only for vector rotates!");
28937 
28938   SDLoc DL(Op);
28939   SDValue R = Op.getOperand(0);
28940   SDValue Amt = Op.getOperand(1);
28941   unsigned Opcode = Op.getOpcode();
28942   unsigned EltSizeInBits = VT.getScalarSizeInBits();
28943   int NumElts = VT.getVectorNumElements();
28944 
28945   // Check for constant splat rotation amount.
28946   APInt CstSplatValue;
28947   bool IsCstSplat = X86::isConstantSplat(Amt, CstSplatValue);
28948 
28949   // Check for splat rotate by zero.
28950   if (IsCstSplat && CstSplatValue.urem(EltSizeInBits) == 0)
28951     return R;
28952 
28953   // AVX512 implicitly uses modulo rotation amounts.
28954   if (Subtarget.hasAVX512() && 32 <= EltSizeInBits) {
28955     // Attempt to rotate by immediate.
28956     if (IsCstSplat) {
28957       unsigned RotOpc = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI);
28958       uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
28959       return DAG.getNode(RotOpc, DL, VT, R,
28960                          DAG.getTargetConstant(RotAmt, DL, MVT::i8));
28961     }
28962 
28963     // Else, fall-back on VPROLV/VPRORV.
28964     return Op;
28965   }
28966 
28967   // AVX512 VBMI2 vXi16 - lower to funnel shifts.
28968   if (Subtarget.hasVBMI2() && 16 == EltSizeInBits) {
28969     unsigned FunnelOpc = (Opcode == ISD::ROTL ? ISD::FSHL : ISD::FSHR);
28970     return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
28971   }
28972 
28973   assert((Opcode == ISD::ROTL) && "Only ROTL supported");
28974 
28975   // XOP has 128-bit vector variable + immediate rotates.
28976   // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
28977   // XOP implicitly uses modulo rotation amounts.
28978   if (Subtarget.hasXOP()) {
28979     if (VT.is256BitVector())
28980       return splitVectorIntBinary(Op, DAG);
28981     assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
28982 
28983     // Attempt to rotate by immediate.
28984     if (IsCstSplat) {
28985       uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
28986       return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
28987                          DAG.getTargetConstant(RotAmt, DL, MVT::i8));
28988     }
28989 
28990     // Use general rotate by variable (per-element).
28991     return Op;
28992   }
28993 
28994   // Split 256-bit integers on pre-AVX2 targets.
28995   if (VT.is256BitVector() && !Subtarget.hasAVX2())
28996     return splitVectorIntBinary(Op, DAG);
28997 
28998   assert((VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||
28999           ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8 ||
29000             VT == MVT::v32i16) &&
29001            Subtarget.hasAVX2())) &&
29002          "Only vXi32/vXi16/vXi8 vector rotates supported");
29003 
29004   // Rotate by a uniform constant - expand back to shifts.
29005   if (IsCstSplat)
29006     return SDValue();
29007 
29008   bool IsSplatAmt = DAG.isSplatValue(Amt);
29009 
29010   // v16i8/v32i8: Split rotation into rot4/rot2/rot1 stages and select by
29011   // the amount bit.
29012   if (EltSizeInBits == 8 && !IsSplatAmt) {
29013     if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()))
29014       return SDValue();
29015 
29016     // We don't need ModuloAmt here as we just peek at individual bits.
29017     MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29018 
29019     auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
29020       if (Subtarget.hasSSE41()) {
29021         // On SSE41 targets we can use PBLENDVB which selects bytes based just
29022         // on the sign bit.
29023         V0 = DAG.getBitcast(VT, V0);
29024         V1 = DAG.getBitcast(VT, V1);
29025         Sel = DAG.getBitcast(VT, Sel);
29026         return DAG.getBitcast(SelVT,
29027                               DAG.getNode(X86ISD::BLENDV, DL, VT, Sel, V0, V1));
29028       }
29029       // On pre-SSE41 targets we test for the sign bit by comparing to
29030       // zero - a negative value will set all bits of the lanes to true
29031       // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
29032       SDValue Z = DAG.getConstant(0, DL, SelVT);
29033       SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel);
29034       return DAG.getSelect(DL, SelVT, C, V0, V1);
29035     };
29036 
29037     // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
29038     // We can safely do this using i16 shifts as we're only interested in
29039     // the 3 lower bits of each byte.
29040     Amt = DAG.getBitcast(ExtVT, Amt);
29041     Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT));
29042     Amt = DAG.getBitcast(VT, Amt);
29043 
29044     // r = VSELECT(r, rot(r, 4), a);
29045     SDValue M;
29046     M = DAG.getNode(
29047         ISD::OR, DL, VT,
29048         DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(4, DL, VT)),
29049         DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(4, DL, VT)));
29050     R = SignBitSelect(VT, Amt, M, R);
29051 
29052     // a += a
29053     Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
29054 
29055     // r = VSELECT(r, rot(r, 2), a);
29056     M = DAG.getNode(
29057         ISD::OR, DL, VT,
29058         DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(2, DL, VT)),
29059         DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(6, DL, VT)));
29060     R = SignBitSelect(VT, Amt, M, R);
29061 
29062     // a += a
29063     Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
29064 
29065     // return VSELECT(r, rot(r, 1), a);
29066     M = DAG.getNode(
29067         ISD::OR, DL, VT,
29068         DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(1, DL, VT)),
29069         DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(7, DL, VT)));
29070     return SignBitSelect(VT, Amt, M, R);
29071   }
29072 
29073   // ISD::ROT* uses modulo rotate amounts.
29074   if (SDValue BaseRotAmt = DAG.getSplatValue(Amt)) {
29075     // If the amount is a splat, perform the modulo BEFORE the splat,
29076     // this helps LowerScalarVariableShift to remove the splat later.
29077     Amt = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, BaseRotAmt);
29078     Amt = DAG.getNode(ISD::AND, DL, VT, Amt,
29079                       DAG.getConstant(EltSizeInBits - 1, DL, VT));
29080     Amt = DAG.getVectorShuffle(VT, DL, Amt, DAG.getUNDEF(VT),
29081                                SmallVector<int>(NumElts, 0));
29082   } else {
29083     Amt = DAG.getNode(ISD::AND, DL, VT, Amt,
29084                       DAG.getConstant(EltSizeInBits - 1, DL, VT));
29085   }
29086 
29087   bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
29088   bool LegalVarShifts = SupportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
29089                         SupportedVectorVarShift(VT, Subtarget, ISD::SRL);
29090 
29091   // Fallback for splats + all supported variable shifts.
29092   // Fallback for non-constants AVX2 vXi16 as well.
29093   if (IsSplatAmt || LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) {
29094     SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
29095     AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
29096     SDValue SHL = DAG.getNode(ISD::SHL, DL, VT, R, Amt);
29097     SDValue SRL = DAG.getNode(ISD::SRL, DL, VT, R, AmtR);
29098     return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
29099   }
29100 
29101   // As with shifts, convert the rotation amount to a multiplication factor.
29102   SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
29103   assert(Scale && "Failed to convert ROTL amount to scale");
29104 
29105   // v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
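  // The low half of R * (1 << Amt) is R shifted left and the high half is
  // the wrapped-out bits, i.e. R >> (16 - Amt), so OR-ing them gives the
  // rotate; e.g. (illustrative) rotl(X, 3) == (X * 8) | mulhu(X, 8).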
29106   if (EltSizeInBits == 16) {
29107     SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale);
29108     SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale);
29109     return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
29110   }
29111 
29112   // v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32
29113   // to v2i64 results at a time. The upper 32-bits contain the wrapped bits
29114   // that can then be OR'd with the lower 32-bits.
29115   assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected");
29116   static const int OddMask[] = {1, -1, 3, -1};
29117   SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask);
29118   SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask);
29119 
29120   SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
29121                               DAG.getBitcast(MVT::v2i64, R),
29122                               DAG.getBitcast(MVT::v2i64, Scale));
29123   SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
29124                               DAG.getBitcast(MVT::v2i64, R13),
29125                               DAG.getBitcast(MVT::v2i64, Scale13));
29126   Res02 = DAG.getBitcast(VT, Res02);
29127   Res13 = DAG.getBitcast(VT, Res13);
29128 
29129   return DAG.getNode(ISD::OR, DL, VT,
29130                      DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}),
29131                      DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7}));
29132 }
29133 
29134 /// Returns true if the operand type is exactly twice the native width, and
29135 /// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
29136 /// Used to know whether to use cmpxchg8/16b when expanding atomic operations
29137 /// (otherwise we leave them alone to become __sync_fetch_and_... calls).
29138 bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
29139   unsigned OpWidth = MemType->getPrimitiveSizeInBits();
29140 
29141   if (OpWidth == 64)
29142     return Subtarget.hasCmpxchg8b() && !Subtarget.is64Bit();
29143   if (OpWidth == 128)
29144     return Subtarget.hasCmpxchg16b();
29145 
29146   return false;
29147 }
29148 
29149 bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
29150   Type *MemType = SI->getValueOperand()->getType();
29151 
29152   bool NoImplicitFloatOps =
29153       SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
29154   if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
29155       !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
29156       (Subtarget.hasSSE1() || Subtarget.hasX87()))
29157     return false;
29158 
29159   return needsCmpXchgNb(MemType);
29160 }
29161 
29162 // Note: this turns large loads into lock cmpxchg8b/16b.
29163 // TODO: In 32-bit mode, use MOVLPS when SSE1 is available?
29164 TargetLowering::AtomicExpansionKind
29165 X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
29166   Type *MemType = LI->getType();
29167 
29168   // If this is a 64-bit atomic load on a 32-bit target and SSE2 is enabled, we
29169   // can use movq to do the load. If we have X87 we can load into an 80-bit
29170   // X87 register and store it to a stack temporary.
29171   bool NoImplicitFloatOps =
29172       LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
29173   if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
29174       !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
29175       (Subtarget.hasSSE1() || Subtarget.hasX87()))
29176     return AtomicExpansionKind::None;
29177 
29178   return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
29179                                  : AtomicExpansionKind::None;
29180 }
29181 
29182 TargetLowering::AtomicExpansionKind
29183 X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
29184   unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
29185   Type *MemType = AI->getType();
29186 
29187   // If the operand is too big, we must see if cmpxchg8/16b is available
29188   // and default to library calls otherwise.
29189   if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
29190     return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
29191                                    : AtomicExpansionKind::None;
29192   }
29193 
29194   AtomicRMWInst::BinOp Op = AI->getOperation();
29195   switch (Op) {
29196   default:
29197     llvm_unreachable("Unknown atomic operation");
29198   case AtomicRMWInst::Xchg:
29199   case AtomicRMWInst::Add:
29200   case AtomicRMWInst::Sub:
29201     // It's better to use xadd, xsub or xchg for these in all cases.
29202     return AtomicExpansionKind::None;
29203   case AtomicRMWInst::Or:
29204   case AtomicRMWInst::And:
29205   case AtomicRMWInst::Xor:
29206     // If the atomicrmw's result isn't actually used, we can just add a "lock"
29207     // prefix to a normal instruction for these operations.
29208     return !AI->use_empty() ? AtomicExpansionKind::CmpXChg
29209                             : AtomicExpansionKind::None;
29210   case AtomicRMWInst::Nand:
29211   case AtomicRMWInst::Max:
29212   case AtomicRMWInst::Min:
29213   case AtomicRMWInst::UMax:
29214   case AtomicRMWInst::UMin:
29215   case AtomicRMWInst::FAdd:
29216   case AtomicRMWInst::FSub:
29217     // These always require a non-trivial set of data operations on x86. We must
29218     // use a cmpxchg loop.
29219     return AtomicExpansionKind::CmpXChg;
29220   }
29221 }
29222 
29223 LoadInst *
29224 X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
29225   unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
29226   Type *MemType = AI->getType();
29227   // Accesses larger than the native width are turned into cmpxchg/libcalls, so
29228   // there is no benefit in turning such RMWs into loads, and it is actually
29229   // harmful as it introduces a mfence.
29230   if (MemType->getPrimitiveSizeInBits() > NativeWidth)
29231     return nullptr;
29232 
29233   // If this is a canonical idempotent atomicrmw w/no uses, we have a better
29234   // lowering available in lowerAtomicArith.
29235   // TODO: push more cases through this path.
29236   if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
29237     if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
29238         AI->use_empty())
29239       return nullptr;
29240 
29241   IRBuilder<> Builder(AI);
29242   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
29243   auto SSID = AI->getSyncScopeID();
29244   // We must restrict the ordering to avoid generating loads with Release or
29245   // ReleaseAcquire orderings.
29246   auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
29247 
29248   // Before the load we need a fence. Here is an example lifted from
29249   // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
29250   // is required:
29251   // Thread 0:
29252   //   x.store(1, relaxed);
29253   //   r1 = y.fetch_add(0, release);
29254   // Thread 1:
29255   //   y.fetch_add(42, acquire);
29256   //   r2 = x.load(relaxed);
29257   // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
29258   // lowered to just a load without a fence. A mfence flushes the store buffer,
29259   // making the optimization clearly correct.
29260   // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
29261   // otherwise, we might be able to be more aggressive on relaxed idempotent
29262   // rmw. In practice, they do not look useful, so we don't try to be
29263   // especially clever.
29264   if (SSID == SyncScope::SingleThread)
29265     // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
29266     // the IR level, so we must wrap it in an intrinsic.
29267     return nullptr;
29268 
29269   if (!Subtarget.hasMFence())
29270     // FIXME: it might make sense to use a locked operation here but on a
29271     // different cache-line to prevent cache-line bouncing. In practice it
29272     // is probably a small win, and x86 processors without mfence are rare
29273     // enough that we do not bother.
29274     return nullptr;
29275 
29276   Function *MFence =
29277       llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
29278   Builder.CreateCall(MFence, {});
29279 
29280   // Finally we can emit the atomic load.
29281   LoadInst *Loaded = Builder.CreateAlignedLoad(
29282       AI->getType(), AI->getPointerOperand(), AI->getAlign());
29283   Loaded->setAtomic(Order, SSID);
29284   AI->replaceAllUsesWith(Loaded);
29285   AI->eraseFromParent();
29286   return Loaded;
29287 }
29288 
29289 bool X86TargetLowering::lowerAtomicStoreAsStoreSDNode(const StoreInst &SI) const {
29290   if (!SI.isUnordered())
29291     return false;
29292   return ExperimentalUnorderedISEL;
29293 }
29294 bool X86TargetLowering::lowerAtomicLoadAsLoadSDNode(const LoadInst &LI) const {
29295   if (!LI.isUnordered())
29296     return false;
29297   return ExperimentalUnorderedISEL;
29298 }
29299 
29300 
29301 /// Emit a locked operation on a stack location which does not change any
29302 /// memory location, but does involve a lock prefix.  Location is chosen to be
29303 /// a) very likely accessed only by a single thread to minimize cache traffic,
29304 /// and b) definitely dereferenceable.  Returns the new Chain result.
29305 static SDValue emitLockedStackOp(SelectionDAG &DAG,
29306                                  const X86Subtarget &Subtarget, SDValue Chain,
29307                                  const SDLoc &DL) {
29308   // Implementation notes:
29309   // 1) LOCK prefix creates a full read/write reordering barrier for memory
29310   // operations issued by the current processor.  As such, the location
29311   // referenced is not relevant for the ordering properties of the instruction.
29312   // See: Intel® 64 and IA-32 Architectures Software Developer’s Manual,
29313   // 8.2.3.9  Loads and Stores Are Not Reordered with Locked Instructions
29314   // 2) Using an immediate operand appears to be the best encoding choice
29315   // here since it doesn't require an extra register.
29316   // 3) OR appears to be very slightly faster than ADD. (Though, the difference
29317   // is small enough it might just be measurement noise.)
29318   // 4) When choosing offsets, there are several contributing factors:
29319   //   a) If there's no redzone, we default to TOS.  (We could allocate a cache
29320   //      line aligned stack object to improve this case.)
29321   //   b) To minimize our chances of introducing a false dependence, we prefer
29322   //      to offset the stack usage from TOS slightly.
29323   //   c) To minimize concerns about cross thread stack usage - in particular,
29324   //      the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which
29325   //      captures state in the TOS frame and accesses it from many threads -
29326   //      we want to use an offset such that the offset is in a distinct cache
29327   //      line from the TOS frame.
29328   //
29329   // For a general discussion of the tradeoffs and benchmark results, see:
29330   // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
29331 
29332   auto &MF = DAG.getMachineFunction();
29333   auto &TFL = *Subtarget.getFrameLowering();
29334   const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0;
29335 
29336   if (Subtarget.is64Bit()) {
29337     SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
29338     SDValue Ops[] = {
29339       DAG.getRegister(X86::RSP, MVT::i64),                  // Base
29340       DAG.getTargetConstant(1, DL, MVT::i8),                // Scale
29341       DAG.getRegister(0, MVT::i64),                         // Index
29342       DAG.getTargetConstant(SPOffset, DL, MVT::i32),        // Disp
29343       DAG.getRegister(0, MVT::i16),                         // Segment.
29344       Zero,
29345       Chain};
29346     SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
29347                                      MVT::Other, Ops);
29348     return SDValue(Res, 1);
29349   }
29350 
29351   SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
29352   SDValue Ops[] = {
29353     DAG.getRegister(X86::ESP, MVT::i32),            // Base
29354     DAG.getTargetConstant(1, DL, MVT::i8),          // Scale
29355     DAG.getRegister(0, MVT::i32),                   // Index
29356     DAG.getTargetConstant(SPOffset, DL, MVT::i32),  // Disp
29357     DAG.getRegister(0, MVT::i16),                   // Segment.
29358     Zero,
29359     Chain
29360   };
29361   SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
29362                                    MVT::Other, Ops);
29363   return SDValue(Res, 1);
29364 }
29365 
29366 static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
29367                                  SelectionDAG &DAG) {
29368   SDLoc dl(Op);
29369   AtomicOrdering FenceOrdering =
29370       static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
29371   SyncScope::ID FenceSSID =
29372       static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
29373 
29374   // The only fence that needs an instruction is a sequentially-consistent
29375   // cross-thread fence.
29376   if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
29377       FenceSSID == SyncScope::System) {
29378     if (Subtarget.hasMFence())
29379       return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
29380 
29381     SDValue Chain = Op.getOperand(0);
29382     return emitLockedStackOp(DAG, Subtarget, Chain, dl);
29383   }
29384 
29385   // MEMBARRIER is a compiler barrier; it codegens to a no-op.
29386   return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
29387 }
29388 
29389 static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
29390                              SelectionDAG &DAG) {
29391   MVT T = Op.getSimpleValueType();
29392   SDLoc DL(Op);
29393   unsigned Reg = 0;
29394   unsigned size = 0;
29395   switch(T.SimpleTy) {
29396   default: llvm_unreachable("Invalid value type!");
29397   case MVT::i8:  Reg = X86::AL;  size = 1; break;
29398   case MVT::i16: Reg = X86::AX;  size = 2; break;
29399   case MVT::i32: Reg = X86::EAX; size = 4; break;
29400   case MVT::i64:
29401     assert(Subtarget.is64Bit() && "Node not type legal!");
29402     Reg = X86::RAX; size = 8;
29403     break;
29404   }
29405   SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
29406                                   Op.getOperand(2), SDValue());
29407   SDValue Ops[] = { cpIn.getValue(0),
29408                     Op.getOperand(1),
29409                     Op.getOperand(3),
29410                     DAG.getTargetConstant(size, DL, MVT::i8),
29411                     cpIn.getValue(1) };
29412   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
29413   MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
29414   SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
29415                                            Ops, T, MMO);
29416 
29417   SDValue cpOut =
29418     DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
29419   SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
29420                                       MVT::i32, cpOut.getValue(2));
29421   SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
29422 
29423   return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
29424                      cpOut, Success, EFLAGS.getValue(1));
29425 }
29426 
29427 // Create MOVMSKB, taking into account whether we need to split for AVX1.
29428 static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
29429                            const X86Subtarget &Subtarget) {
29430   MVT InVT = V.getSimpleValueType();
29431 
29432   if (InVT == MVT::v64i8) {
29433     SDValue Lo, Hi;
29434     std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
29435     Lo = getPMOVMSKB(DL, Lo, DAG, Subtarget);
29436     Hi = getPMOVMSKB(DL, Hi, DAG, Subtarget);
29437     Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo);
29438     Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi);
29439     Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
29440                      DAG.getConstant(32, DL, MVT::i8));
29441     return DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi);
29442   }
29443   if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
29444     SDValue Lo, Hi;
29445     std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
29446     Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
29447     Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
29448     Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
29449                      DAG.getConstant(16, DL, MVT::i8));
29450     return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
29451   }
29452 
29453   return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
29454 }
29455 
29456 static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
29457                             SelectionDAG &DAG) {
29458   SDValue Src = Op.getOperand(0);
29459   MVT SrcVT = Src.getSimpleValueType();
29460   MVT DstVT = Op.getSimpleValueType();
29461 
29462   // Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
29463   // half to v32i1 and concatenating the result.
29464   if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
29465     assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
29466     assert(Subtarget.hasBWI() && "Expected BWI target");
29467     SDLoc dl(Op);
29468     SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
29469                              DAG.getIntPtrConstant(0, dl));
29470     Lo = DAG.getBitcast(MVT::v32i1, Lo);
29471     SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
29472                              DAG.getIntPtrConstant(1, dl));
29473     Hi = DAG.getBitcast(MVT::v32i1, Hi);
29474     return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
29475   }
29476 
29477   // Use MOVMSK for vector to scalar conversion to prevent scalarization.
29478   if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
29479     assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
29480     MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
29481     SDLoc DL(Op);
29482     SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
29483     V = getPMOVMSKB(DL, V, DAG, Subtarget);
29484     return DAG.getZExtOrTrunc(V, DL, DstVT);
29485   }
29486 
29487   assert((SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
29488           SrcVT == MVT::i64) && "Unexpected VT!");
29489 
29490   assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
29491   if (!(DstVT == MVT::f64 && SrcVT == MVT::i64) &&
29492       !(DstVT == MVT::x86mmx && SrcVT.isVector()))
29493     // This conversion needs to be expanded.
29494     return SDValue();
29495 
29496   SDLoc dl(Op);
29497   if (SrcVT.isVector()) {
29498     // Widen the input vector in the case of MVT::v2i32.
29499     // Example: from MVT::v2i32 to MVT::v4i32.
29500     MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(),
29501                                  SrcVT.getVectorNumElements() * 2);
29502     Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src,
29503                       DAG.getUNDEF(SrcVT));
29504   } else {
29505     assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
29506            "Unexpected source type in LowerBITCAST");
29507     Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
29508   }
29509 
29510   MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64;
29511   Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src);
29512 
29513   if (DstVT == MVT::x86mmx)
29514     return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src);
29515 
29516   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src,
29517                      DAG.getIntPtrConstant(0, dl));
29518 }
29519 
29520 /// Compute the horizontal sum of bytes in V for the elements of VT.
29521 ///
29522 /// Requires V to be a byte vector and VT to be an integer vector type with
29523 /// wider elements than V's type. The width of the elements of VT determines
29524 /// how many bytes of V are summed horizontally to produce each element of the
29525 /// result.
29526 static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
29527                                       const X86Subtarget &Subtarget,
29528                                       SelectionDAG &DAG) {
29529   SDLoc DL(V);
29530   MVT ByteVecVT = V.getSimpleValueType();
29531   MVT EltVT = VT.getVectorElementType();
29532   assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
29533          "Expected value to have byte element type.");
29534   assert(EltVT != MVT::i8 &&
29535          "Horizontal byte sum only makes sense for wider elements!");
29536   unsigned VecSize = VT.getSizeInBits();
29537   assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
29538 
29539   // The PSADBW instruction horizontally adds all bytes and leaves the result in
29540   // i64 chunks, so it directly computes the pop count for v2i64 and v4i64.
29541   if (EltVT == MVT::i64) {
29542     SDValue Zeros = DAG.getConstant(0, DL, ByteVecVT);
29543     MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
29544     V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
29545     return DAG.getBitcast(VT, V);
29546   }
29547 
29548   if (EltVT == MVT::i32) {
29549     // We unpack the low half and high half into i32s interleaved with zeros so
29550     // that we can use PSADBW to horizontally sum them. The most useful part of
29551     // this is that it lines up the results of two PSADBW instructions to be
29552     // two v2i64 vectors which, when concatenated, hold the 4 population counts. We can
29553     // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
29554     SDValue Zeros = DAG.getConstant(0, DL, VT);
29555     SDValue V32 = DAG.getBitcast(VT, V);
29556     SDValue Low = getUnpackl(DAG, DL, VT, V32, Zeros);
29557     SDValue High = getUnpackh(DAG, DL, VT, V32, Zeros);
29558 
29559     // Do the horizontal sums into two v2i64s.
29560     Zeros = DAG.getConstant(0, DL, ByteVecVT);
29561     MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
29562     Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
29563                       DAG.getBitcast(ByteVecVT, Low), Zeros);
29564     High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
29565                        DAG.getBitcast(ByteVecVT, High), Zeros);
29566 
29567     // Merge them together.
29568     MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
29569     V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
29570                     DAG.getBitcast(ShortVecVT, Low),
29571                     DAG.getBitcast(ShortVecVT, High));
29572 
29573     return DAG.getBitcast(VT, V);
29574   }
29575 
29576   // The only element type left is i16.
29577   assert(EltVT == MVT::i16 && "Unknown how to handle type");
29578 
29579   // To obtain the pop count for each i16 element, starting from the pop count
29580   // for the i8 elements, shift the i16s left by 8, sum as i8s, and then shift
29581   // the i16s right by 8. It is important to shift as i16s because an i8 vector
29582   // shift isn't directly supported.
29583   SDValue ShifterV = DAG.getConstant(8, DL, VT);
29584   SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
29585   V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
29586                   DAG.getBitcast(ByteVecVT, V));
29587   return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
29588 }
29589 
29590 static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
29591                                         const X86Subtarget &Subtarget,
29592                                         SelectionDAG &DAG) {
29593   MVT VT = Op.getSimpleValueType();
29594   MVT EltVT = VT.getVectorElementType();
29595   int NumElts = VT.getVectorNumElements();
29596   (void)EltVT;
29597   assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported.");
29598 
29599   // Implement a lookup table in register by using an algorithm based on:
29600   // http://wm.ite.pl/articles/sse-popcount.html
29601   //
29602   // The general idea is that every nibble of each input byte is an index into
29603   // an in-register pre-computed pop count table. We split the input vector into
29604   // two new ones: (1) a vector with only the shifted-right higher nibbles of
29605   // each byte and (2) a vector with only the lower nibbles (higher ones masked
29606   // out) of each byte. PSHUFB is used separately with both to index the
29607   // in-register table. Finally, both are added and the result is an i8 vector
29608   // where each element contains the pop count for its input byte.
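  // For example, for the byte 0xE6 the high nibble 0xE maps to LUT[14] = 3 and
  // the low nibble 0x6 maps to LUT[6] = 2, giving the expected pop count of 5.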
29609   const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
29610                        /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
29611                        /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
29612                        /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
29613 
29614   SmallVector<SDValue, 64> LUTVec;
29615   for (int i = 0; i < NumElts; ++i)
29616     LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
29617   SDValue InRegLUT = DAG.getBuildVector(VT, DL, LUTVec);
29618   SDValue M0F = DAG.getConstant(0x0F, DL, VT);
29619 
29620   // High nibbles
29621   SDValue FourV = DAG.getConstant(4, DL, VT);
29622   SDValue HiNibbles = DAG.getNode(ISD::SRL, DL, VT, Op, FourV);
29623 
29624   // Low nibbles
29625   SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, Op, M0F);
29626 
29627   // The nibble vectors are used as shuffle masks that index elements in the
29628   // LUT. After looking up the low and high nibble counts, add the two results
29629   // to obtain the final pop count per i8 element.
29630   SDValue HiPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, HiNibbles);
29631   SDValue LoPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, LoNibbles);
29632   return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt);
29633 }
29634 
29635 // Please ensure that any codegen change from LowerVectorCTPOP is reflected in
29636 // updated cost models in X86TTIImpl::getIntrinsicInstrCost.
29637 static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
29638                                 SelectionDAG &DAG) {
29639   MVT VT = Op.getSimpleValueType();
29640   assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
29641          "Unknown CTPOP type to handle");
29642   SDLoc DL(Op.getNode());
29643   SDValue Op0 = Op.getOperand(0);
29644 
29645   // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
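  // This is only done when the zero-extended vXi32 type stays within 512 bits,
  // and a full 512-bit type is only used when the subtarget can extend to
  // 512-bit DQ operations.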
29646   if (Subtarget.hasVPOPCNTDQ()) {
29647     unsigned NumElems = VT.getVectorNumElements();
29648     assert((VT.getVectorElementType() == MVT::i8 ||
29649             VT.getVectorElementType() == MVT::i16) && "Unexpected type");
29650     if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
29651       MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
29652       Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
29653       Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
29654       return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
29655     }
29656   }
29657 
29658   // Decompose 256-bit ops into smaller 128-bit ops.
29659   if (VT.is256BitVector() && !Subtarget.hasInt256())
29660     return splitVectorIntUnary(Op, DAG);
29661 
29662   // Decompose 512-bit ops into smaller 256-bit ops.
29663   if (VT.is512BitVector() && !Subtarget.hasBWI())
29664     return splitVectorIntUnary(Op, DAG);
29665 
29666   // For element types greater than i8, do vXi8 pop counts and a bytesum.
29667   if (VT.getScalarType() != MVT::i8) {
29668     MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
29669     SDValue ByteOp = DAG.getBitcast(ByteVT, Op0);
29670     SDValue PopCnt8 = DAG.getNode(ISD::CTPOP, DL, ByteVT, ByteOp);
29671     return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG);
29672   }
29673 
29674   // We can't use the fast LUT approach, so fall back on LegalizeDAG.
29675   if (!Subtarget.hasSSSE3())
29676     return SDValue();
29677 
29678   return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
29679 }
29680 
29681 static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
29682                           SelectionDAG &DAG) {
29683   assert(Op.getSimpleValueType().isVector() &&
29684          "We only do custom lowering for vector population count.");
29685   return LowerVectorCTPOP(Op, Subtarget, DAG);
29686 }
29687 
29688 static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
29689   MVT VT = Op.getSimpleValueType();
29690   SDValue In = Op.getOperand(0);
29691   SDLoc DL(Op);
29692 
29693   // For scalars, it's still beneficial to transfer to/from the SIMD unit to
29694   // perform the BITREVERSE.
29695   if (!VT.isVector()) {
29696     MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
29697     SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
29698     Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
29699     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
29700                        DAG.getIntPtrConstant(0, DL));
29701   }
29702 
29703   int NumElts = VT.getVectorNumElements();
29704   int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
29705 
29706   // Decompose 256-bit ops into smaller 128-bit ops.
29707   if (VT.is256BitVector())
29708     return splitVectorIntUnary(Op, DAG);
29709 
29710   assert(VT.is128BitVector() &&
29711          "Only 128-bit vector bitreverse lowering supported.");
29712 
29713   // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
29714   // perform the BSWAP in the shuffle.
29715   // It's best to shuffle using the second operand as this will implicitly allow
29716   // memory folding for multiple vectors.
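  // Each control byte selects one of the 32 bytes of the two source vectors
  // with its low 5 bits, while the top 3 bits pick the per-byte operation;
  // operation 2 returns the selected byte with its bits reversed.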
29717   SmallVector<SDValue, 16> MaskElts;
29718   for (int i = 0; i != NumElts; ++i) {
29719     for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
29720       int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
29721       int PermuteByte = SourceByte | (2 << 5);
29722       MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
29723     }
29724   }
29725 
29726   SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
29727   SDValue Res = DAG.getBitcast(MVT::v16i8, In);
29728   Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
29729                     Res, Mask);
29730   return DAG.getBitcast(VT, Res);
29731 }
29732 
29733 static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
29734                                SelectionDAG &DAG) {
29735   MVT VT = Op.getSimpleValueType();
29736 
29737   if (Subtarget.hasXOP() && !VT.is512BitVector())
29738     return LowerBITREVERSE_XOP(Op, DAG);
29739 
29740   assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
29741 
29742   SDValue In = Op.getOperand(0);
29743   SDLoc DL(Op);
29744 
29745   assert(VT.getScalarType() == MVT::i8 &&
29746          "Only byte vector BITREVERSE supported");
29747 
29748   // Split v64i8 without BWI so that we can still use the PSHUFB lowering.
29749   if (VT == MVT::v64i8 && !Subtarget.hasBWI())
29750     return splitVectorIntUnary(Op, DAG);
29751 
29752   // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
29753   if (VT == MVT::v32i8 && !Subtarget.hasInt256())
29754     return splitVectorIntUnary(Op, DAG);
29755 
29756   unsigned NumElts = VT.getVectorNumElements();
29757 
29758   // If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits.
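  // The constant encodes an 8x8 bit matrix, one byte per row, whose affine
  // transform (with a zero additive constant) reverses the bits of each byte.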
29759   if (Subtarget.hasGFNI()) {
29760     MVT MatrixVT = MVT::getVectorVT(MVT::i64, NumElts / 8);
29761     SDValue Matrix = DAG.getConstant(0x8040201008040201ULL, DL, MatrixVT);
29762     Matrix = DAG.getBitcast(VT, Matrix);
29763     return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, In, Matrix,
29764                        DAG.getTargetConstant(0, DL, MVT::i8));
29765   }
29766 
29767   // Perform BITREVERSE using PSHUFB lookups. Each byte is split into its two
29768   // nibbles, and a PSHUFB lookup finds the bit reverse of each 0-15 value
29769   // (moved to the other nibble).
29770   SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
29771   SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
29772   SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
29773 
29774   const int LoLUT[16] = {
29775       /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
29776       /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
29777       /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
29778       /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
29779   const int HiLUT[16] = {
29780       /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
29781       /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
29782       /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
29783       /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
29784 
29785   SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
29786   for (unsigned i = 0; i < NumElts; ++i) {
29787     LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
29788     HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
29789   }
29790 
29791   SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
29792   SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
29793   Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
29794   Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
29795   return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
29796 }
29797 
29798 static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget,
29799                            SelectionDAG &DAG) {
29800   SDLoc DL(Op);
29801   SDValue X = Op.getOperand(0);
29802   MVT VT = Op.getSimpleValueType();
29803 
29804   // Special case. If the input fits in 8-bits we can use a single 8-bit TEST.
29805   if (VT == MVT::i8 ||
29806       DAG.MaskedValueIsZero(X, APInt::getBitsSetFrom(VT.getSizeInBits(), 8))) {
29807     X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
29808     SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, X,
29809                                 DAG.getConstant(0, DL, MVT::i8));
29810     // Copy the inverse of the parity flag into a register with setcc.
29811     SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
29812     // Extend to the original type.
29813     return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
29814   }
29815 
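  // General case: XOR-fold the value down to 8 bits, then read the parity
  // flag, which x86 sets based on the low byte of the last result. SETNP
  // yields 1 for odd parity, matching the ISD::PARITY semantics.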
29816   if (VT == MVT::i64) {
29817     // Xor the high and low 32-bits together using a 32-bit operation.
29818     SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
29819                              DAG.getNode(ISD::SRL, DL, MVT::i64, X,
29820                                          DAG.getConstant(32, DL, MVT::i8)));
29821     SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
29822     X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
29823   }
29824 
29825   if (VT != MVT::i16) {
29826     // Xor the high and low 16-bits together using a 32-bit operation.
29827     SDValue Hi16 = DAG.getNode(ISD::SRL, DL, MVT::i32, X,
29828                                DAG.getConstant(16, DL, MVT::i8));
29829     X = DAG.getNode(ISD::XOR, DL, MVT::i32, X, Hi16);
29830   } else {
29831     // If the input is 16-bits, we need to extend to use an i32 shift below.
29832     X = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, X);
29833   }
29834 
29835   // Finally xor the low 2 bytes together and use an 8-bit flag-setting xor.
29836   // This should allow an h-reg to be used to save a shift.
29837   SDValue Hi = DAG.getNode(
29838       ISD::TRUNCATE, DL, MVT::i8,
29839       DAG.getNode(ISD::SRL, DL, MVT::i32, X, DAG.getConstant(8, DL, MVT::i8)));
29840   SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
29841   SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
29842   SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);
29843 
29844   // Copy the inverse of the parity flag into a register with setcc.
29845   SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
29846   // Extend to the original type.
29847   return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
29848 }
29849 
29850 static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
29851                                         const X86Subtarget &Subtarget) {
29852   unsigned NewOpc = 0;
29853   switch (N->getOpcode()) {
29854   case ISD::ATOMIC_LOAD_ADD:
29855     NewOpc = X86ISD::LADD;
29856     break;
29857   case ISD::ATOMIC_LOAD_SUB:
29858     NewOpc = X86ISD::LSUB;
29859     break;
29860   case ISD::ATOMIC_LOAD_OR:
29861     NewOpc = X86ISD::LOR;
29862     break;
29863   case ISD::ATOMIC_LOAD_XOR:
29864     NewOpc = X86ISD::LXOR;
29865     break;
29866   case ISD::ATOMIC_LOAD_AND:
29867     NewOpc = X86ISD::LAND;
29868     break;
29869   default:
29870     llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
29871   }
29872 
29873   MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
29874 
29875   return DAG.getMemIntrinsicNode(
29876       NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
29877       {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
29878       /*MemVT=*/N->getSimpleValueType(0), MMO);
29879 }
29880 
29881 /// Lower atomic_load_ops into LOCK-prefixed operations.
29882 static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
29883                                 const X86Subtarget &Subtarget) {
29884   AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
29885   SDValue Chain = N->getOperand(0);
29886   SDValue LHS = N->getOperand(1);
29887   SDValue RHS = N->getOperand(2);
29888   unsigned Opc = N->getOpcode();
29889   MVT VT = N->getSimpleValueType(0);
29890   SDLoc DL(N);
29891 
29892   // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
29893   // can only be lowered when the result is unused.  They should have already
29894   // been transformed into a cmpxchg loop in AtomicExpand.
29895   if (N->hasAnyUseOfValue(0)) {
29896     // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
29897     // select LXADD if LOCK_SUB can't be selected.
29898     if (Opc == ISD::ATOMIC_LOAD_SUB) {
29899       RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
29900       return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
29901                            RHS, AN->getMemOperand());
29902     }
29903     assert(Opc == ISD::ATOMIC_LOAD_ADD &&
29904            "Used AtomicRMW ops other than Add should have been expanded!");
29905     return N;
29906   }
29907 
29908   // Specialized lowering for the canonical form of an idempotent atomicrmw.
29909   // The core idea here is that since the memory location isn't actually
29910   // changing, all we need is a lowering for the *ordering* impacts of the
29911   // atomicrmw.  As such, we can choose a different operation and memory
29912   // location to minimize impact on other code.
29913   if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS)) {
29914     // On X86, the only ordering which actually requires an instruction is
29915     // seq_cst that isn't SingleThread; everything else just needs to be
29916     // preserved during codegen and then dropped. Note that we expect (but don't
29917     // assume) that orderings other than seq_cst and acq_rel have been
29918     // canonicalized to a store or load.
29919     if (AN->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent &&
29920         AN->getSyncScopeID() == SyncScope::System) {
29921       // Prefer a locked operation against a stack location to minimize cache
29922       // traffic.  This assumes that stack locations are very likely to be
29923       // accessed only by the owning thread.
29924       SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
29925       assert(!N->hasAnyUseOfValue(0));
29926       // NOTE: The getUNDEF is needed to give something for the unused result 0.
29927       return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
29928                          DAG.getUNDEF(VT), NewChain);
29929     }
29930     // MEMBARRIER is a compiler barrier; it codegens to a no-op.
29931     SDValue NewChain = DAG.getNode(X86ISD::MEMBARRIER, DL, MVT::Other, Chain);
29932     assert(!N->hasAnyUseOfValue(0));
29933     // NOTE: The getUNDEF is needed to give something for the unused result 0.
29934     return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
29935                        DAG.getUNDEF(VT), NewChain);
29936   }
29937 
29938   SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
29939   // RAUW the chain, but don't worry about the result, as it's unused.
29940   assert(!N->hasAnyUseOfValue(0));
29941   // NOTE: The getUNDEF is needed to give something for the unused result 0.
29942   return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
29943                      DAG.getUNDEF(VT), LockOp.getValue(1));
29944 }
29945 
29946 static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
29947                                  const X86Subtarget &Subtarget) {
29948   auto *Node = cast<AtomicSDNode>(Op.getNode());
29949   SDLoc dl(Node);
29950   EVT VT = Node->getMemoryVT();
29951 
29952   bool IsSeqCst =
29953       Node->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent;
29954   bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT);
29955 
29956   // If this store is not sequentially consistent and the type is legal
29957   // we can just keep it.
29958   if (!IsSeqCst && IsTypeLegal)
29959     return Op;
29960 
29961   if (VT == MVT::i64 && !IsTypeLegal) {
29962     // For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
29963     // is enabled.
29964     bool NoImplicitFloatOps =
29965         DAG.getMachineFunction().getFunction().hasFnAttribute(
29966             Attribute::NoImplicitFloat);
29967     if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
29968       SDValue Chain;
29969       if (Subtarget.hasSSE1()) {
29970         SDValue SclToVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
29971                                        Node->getOperand(2));
29972         MVT StVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
29973         SclToVec = DAG.getBitcast(StVT, SclToVec);
29974         SDVTList Tys = DAG.getVTList(MVT::Other);
29975         SDValue Ops[] = {Node->getChain(), SclToVec, Node->getBasePtr()};
29976         Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops,
29977                                         MVT::i64, Node->getMemOperand());
29978       } else if (Subtarget.hasX87()) {
29979         // First load this into an 80-bit X87 register using a stack temporary.
29980         // This will put the whole integer into the significand.
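        // FILD/FIST move the full 64 bits with a single memory access, which
        // is what allows this path to implement the atomic store without SSE.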
29981         SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
29982         int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
29983         MachinePointerInfo MPI =
29984             MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
29985         Chain =
29986             DAG.getStore(Node->getChain(), dl, Node->getOperand(2), StackPtr,
29987                          MPI, MaybeAlign(), MachineMemOperand::MOStore);
29988         SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
29989         SDValue LdOps[] = {Chain, StackPtr};
29990         SDValue Value =
29991             DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, LdOps, MVT::i64, MPI,
29992                                     /*Align*/ None, MachineMemOperand::MOLoad);
29993         Chain = Value.getValue(1);
29994 
29995         // Now use an FIST to do the atomic store.
29996         SDValue StoreOps[] = {Chain, Value, Node->getBasePtr()};
29997         Chain =
29998             DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other),
29999                                     StoreOps, MVT::i64, Node->getMemOperand());
30000       }
30001 
30002       if (Chain) {
30003         // If this is a sequentially consistent store, also emit an appropriate
30004         // barrier.
30005         if (IsSeqCst)
30006           Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
30007 
30008         return Chain;
30009       }
30010     }
30011   }
30012 
30013   // Convert seq_cst store -> xchg
30014   // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
30015   // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
30016   SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
30017                                Node->getMemoryVT(),
30018                                Node->getOperand(0),
30019                                Node->getOperand(1), Node->getOperand(2),
30020                                Node->getMemOperand());
30021   return Swap.getValue(1);
30022 }
30023 
30024 static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
30025   SDNode *N = Op.getNode();
30026   MVT VT = N->getSimpleValueType(0);
30027   unsigned Opc = Op.getOpcode();
30028 
30029   // Let legalize expand this if it isn't a legal type yet.
30030   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
30031     return SDValue();
30032 
30033   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
30034   SDLoc DL(N);
30035 
30036   // Set the carry flag.
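  // Adding all-ones to the incoming carry value sets CF exactly when the
  // carry operand is nonzero, so the ADC/SBB below consumes the right flag.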
30037   SDValue Carry = Op.getOperand(2);
30038   EVT CarryVT = Carry.getValueType();
30039   Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
30040                       Carry, DAG.getAllOnesConstant(DL, CarryVT));
30041 
30042   bool IsAdd = Opc == ISD::ADDCARRY || Opc == ISD::SADDO_CARRY;
30043   SDValue Sum = DAG.getNode(IsAdd ? X86ISD::ADC : X86ISD::SBB, DL, VTs,
30044                             Op.getOperand(0), Op.getOperand(1),
30045                             Carry.getValue(1));
30046 
30047   bool IsSigned = Opc == ISD::SADDO_CARRY || Opc == ISD::SSUBO_CARRY;
30048   SDValue SetCC = getSETCC(IsSigned ? X86::COND_O : X86::COND_B,
30049                            Sum.getValue(1), DL, DAG);
30050   if (N->getValueType(1) == MVT::i1)
30051     SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
30052 
30053   return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
30054 }
30055 
30056 static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
30057                             SelectionDAG &DAG) {
30058   assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
30059 
30060   // For MacOSX, we want to call an alternative entry point: __sincos_stret,
30061   // which returns the values as { float, float } (in XMM0) or
30062   // { double, double } (which is returned in XMM0, XMM1).
30063   SDLoc dl(Op);
30064   SDValue Arg = Op.getOperand(0);
30065   EVT ArgVT = Arg.getValueType();
30066   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
30067 
30068   TargetLowering::ArgListTy Args;
30069   TargetLowering::ArgListEntry Entry;
30070 
30071   Entry.Node = Arg;
30072   Entry.Ty = ArgTy;
30073   Entry.IsSExt = false;
30074   Entry.IsZExt = false;
30075   Args.push_back(Entry);
30076 
30077   bool isF64 = ArgVT == MVT::f64;
30078   // Only optimize x86_64 for now. i386 is a bit messy. For f32,
30079   // the small struct {f32, f32} is returned in (eax, edx). For f64,
30080   // the results are returned via SRet in memory.
30081   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30082   RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
30083   const char *LibcallName = TLI.getLibcallName(LC);
30084   SDValue Callee =
30085       DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
30086 
30087   Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
30088                       : (Type *)FixedVectorType::get(ArgTy, 4);
30089 
30090   TargetLowering::CallLoweringInfo CLI(DAG);
30091   CLI.setDebugLoc(dl)
30092       .setChain(DAG.getEntryNode())
30093       .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
30094 
30095   std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
30096 
30097   if (isF64)
30098     // Returned in xmm0 and xmm1.
30099     return CallResult.first;
30100 
30101   // Returned in bits 0:31 and 32:63 of xmm0.
30102   SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
30103                                CallResult.first, DAG.getIntPtrConstant(0, dl));
30104   SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
30105                                CallResult.first, DAG.getIntPtrConstant(1, dl));
30106   SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
30107   return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
30108 }
30109 
30110 /// Widen a vector input to a vector of NVT.  The
30111 /// input vector must have the same element type as NVT.
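/// If FillWithZeroes is set, the widened elements are zero; otherwise they are
/// left undefined.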
30112 static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
30113                             bool FillWithZeroes = false) {
30114   // Check if InOp already has the right width.
30115   MVT InVT = InOp.getSimpleValueType();
30116   if (InVT == NVT)
30117     return InOp;
30118 
30119   if (InOp.isUndef())
30120     return DAG.getUNDEF(NVT);
30121 
30122   assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
30123          "input and widen element type must match");
30124 
30125   unsigned InNumElts = InVT.getVectorNumElements();
30126   unsigned WidenNumElts = NVT.getVectorNumElements();
30127   assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
30128          "Unexpected request for vector widening");
30129 
30130   SDLoc dl(InOp);
30131   if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
30132       InOp.getNumOperands() == 2) {
30133     SDValue N1 = InOp.getOperand(1);
30134     if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
30135         N1.isUndef()) {
30136       InOp = InOp.getOperand(0);
30137       InVT = InOp.getSimpleValueType();
30138       InNumElts = InVT.getVectorNumElements();
30139     }
30140   }
30141   if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
30142       ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
30143     SmallVector<SDValue, 16> Ops;
30144     for (unsigned i = 0; i < InNumElts; ++i)
30145       Ops.push_back(InOp.getOperand(i));
30146 
30147     EVT EltVT = InOp.getOperand(0).getValueType();
30148 
30149     SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
30150       DAG.getUNDEF(EltVT);
30151     for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
30152       Ops.push_back(FillVal);
30153     return DAG.getBuildVector(NVT, dl, Ops);
30154   }
30155   SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
30156     DAG.getUNDEF(NVT);
30157   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
30158                      InOp, DAG.getIntPtrConstant(0, dl));
30159 }
30160 
30161 static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
30162                              SelectionDAG &DAG) {
30163   assert(Subtarget.hasAVX512() &&
30164          "MGATHER/MSCATTER are supported on AVX-512 arch only");
30165 
30166   MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
30167   SDValue Src = N->getValue();
30168   MVT VT = Src.getSimpleValueType();
30169   assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
30170   SDLoc dl(Op);
30171 
30172   SDValue Scale = N->getScale();
30173   SDValue Index = N->getIndex();
30174   SDValue Mask = N->getMask();
30175   SDValue Chain = N->getChain();
30176   SDValue BasePtr = N->getBasePtr();
30177 
30178   if (VT == MVT::v2f32 || VT == MVT::v2i32) {
30179     assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
30180     // If the index is v2i64 and we have VLX we can use xmm for data and index.
30181     if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
30182       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30183       EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
30184       Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT));
30185       SDVTList VTs = DAG.getVTList(MVT::Other);
30186       SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
30187       return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
30188                                      N->getMemoryVT(), N->getMemOperand());
30189     }
30190     return SDValue();
30191   }
30192 
30193   MVT IndexVT = Index.getSimpleValueType();
30194 
30195   // If the index is v2i32, we're being called by type legalization and we
30196   // should just let the default handling take care of it.
30197   if (IndexVT == MVT::v2i32)
30198     return SDValue();
30199 
30200   // If we don't have VLX and neither the passthru nor the index is 512 bits,
30201   // we need to widen until one is.
30202   if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
30203       !Index.getSimpleValueType().is512BitVector()) {
30204     // Determine how much we need to widen by to get a 512-bit type.
30205     unsigned Factor = std::min(512/VT.getSizeInBits(),
30206                                512/IndexVT.getSizeInBits());
30207     unsigned NumElts = VT.getVectorNumElements() * Factor;
30208 
30209     VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
30210     IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
30211     MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
30212 
30213     Src = ExtendToType(Src, VT, DAG);
30214     Index = ExtendToType(Index, IndexVT, DAG);
30215     Mask = ExtendToType(Mask, MaskVT, DAG, true);
30216   }
30217 
30218   SDVTList VTs = DAG.getVTList(MVT::Other);
30219   SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
30220   return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
30221                                  N->getMemoryVT(), N->getMemOperand());
30222 }
30223 
30224 static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
30225                           SelectionDAG &DAG) {
30226 
30227   MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
30228   MVT VT = Op.getSimpleValueType();
30229   MVT ScalarVT = VT.getScalarType();
30230   SDValue Mask = N->getMask();
30231   MVT MaskVT = Mask.getSimpleValueType();
30232   SDValue PassThru = N->getPassThru();
30233   SDLoc dl(Op);
30234 
30235   // Handle AVX masked loads which don't support passthru other than 0.
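  // The AVX VMASKMOV forms zero the masked-off lanes, so load with a zero
  // passthru and then blend the real passthru back in with a VSELECT.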
30236   if (MaskVT.getVectorElementType() != MVT::i1) {
30237     // We also allow undef in the isel pattern.
30238     if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
30239       return Op;
30240 
30241     SDValue NewLoad = DAG.getMaskedLoad(
30242         VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
30243         getZeroVector(VT, Subtarget, DAG, dl), N->getMemoryVT(),
30244         N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(),
30245         N->isExpandingLoad());
30246     // Emit a blend.
30247     SDValue Select = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
30248     return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl);
30249   }
30250 
30251   assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
30252          "Expanding masked load is supported on AVX-512 target only!");
30253 
30254   assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
30255          "Expanding masked load is supported for 32 and 64-bit types only!");
30256 
30257   assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
30258          "Cannot lower masked load op.");
30259 
30260   assert((ScalarVT.getSizeInBits() >= 32 ||
30261           (Subtarget.hasBWI() &&
30262               (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
30263          "Unsupported masked load op.");
30264 
30265   // This operation is legal for targets with VLX, but without
30266   // VLX the vector should be widened to 512 bits.
30267   unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
30268   MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
30269   PassThru = ExtendToType(PassThru, WideDataVT, DAG);
30270 
30271   // Mask element has to be i1.
30272   assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
30273          "Unexpected mask type");
30274 
30275   MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
30276 
30277   Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
30278   SDValue NewLoad = DAG.getMaskedLoad(
30279       WideDataVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
30280       PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
30281       N->getExtensionType(), N->isExpandingLoad());
30282 
30283   SDValue Extract =
30284       DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewLoad.getValue(0),
30285                   DAG.getIntPtrConstant(0, dl));
30286   SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
30287   return DAG.getMergeValues(RetOps, dl);
30288 }
30289 
30290 static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
30291                            SelectionDAG &DAG) {
30292   MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
30293   SDValue DataToStore = N->getValue();
30294   MVT VT = DataToStore.getSimpleValueType();
30295   MVT ScalarVT = VT.getScalarType();
30296   SDValue Mask = N->getMask();
30297   SDLoc dl(Op);
30298 
30299   assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
30300          "Expanding masked load is supported on AVX-512 target only!");
30301 
30302   assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
30303          "Expanding masked load is supported for 32 and 64-bit types only!");
30304 
30305   assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
30306          "Cannot lower masked store op.");
30307 
30308   assert((ScalarVT.getSizeInBits() >= 32 ||
30309           (Subtarget.hasBWI() &&
30310               (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
30311           "Unsupported masked store op.");
30312 
30313   // This operation is legal for targets with VLX, but without
30314   // VLX the vector should be widened to 512 bits.
30315   unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
30316   MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
30317 
30318   // Mask element has to be i1.
30319   assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
30320          "Unexpected mask type");
30321 
30322   MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
30323 
30324   DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
30325   Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
30326   return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
30327                             N->getOffset(), Mask, N->getMemoryVT(),
30328                             N->getMemOperand(), N->getAddressingMode(),
30329                             N->isTruncatingStore(), N->isCompressingStore());
30330 }
30331 
30332 static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
30333                             SelectionDAG &DAG) {
30334   assert(Subtarget.hasAVX2() &&
30335          "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
30336 
30337   MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
30338   SDLoc dl(Op);
30339   MVT VT = Op.getSimpleValueType();
30340   SDValue Index = N->getIndex();
30341   SDValue Mask = N->getMask();
30342   SDValue PassThru = N->getPassThru();
30343   MVT IndexVT = Index.getSimpleValueType();
30344 
30345   assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
30346 
30347   // If the index is v2i32, we're being called by type legalization.
30348   if (IndexVT == MVT::v2i32)
30349     return SDValue();
30350 
30351   // If we don't have VLX and neither the passthru nor the index is 512 bits,
30352   // we need to widen until one is.
30353   MVT OrigVT = VT;
30354   if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
30355       !IndexVT.is512BitVector()) {
30356     // Determine how much we need to widen by to get a 512-bit type.
30357     unsigned Factor = std::min(512/VT.getSizeInBits(),
30358                                512/IndexVT.getSizeInBits());
30359 
30360     unsigned NumElts = VT.getVectorNumElements() * Factor;
30361 
30362     VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
30363     IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
30364     MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
30365 
30366     PassThru = ExtendToType(PassThru, VT, DAG);
30367     Index = ExtendToType(Index, IndexVT, DAG);
30368     Mask = ExtendToType(Mask, MaskVT, DAG, true);
30369   }
30370 
30371   SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
30372                     N->getScale() };
30373   SDValue NewGather = DAG.getMemIntrinsicNode(
30374       X86ISD::MGATHER, dl, DAG.getVTList(VT, MVT::Other), Ops, N->getMemoryVT(),
30375       N->getMemOperand());
30376   SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT,
30377                                 NewGather, DAG.getIntPtrConstant(0, dl));
30378   return DAG.getMergeValues({Extract, NewGather.getValue(1)}, dl);
30379 }
30380 
30381 static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
30382   SDLoc dl(Op);
30383   SDValue Src = Op.getOperand(0);
30384   MVT DstVT = Op.getSimpleValueType();
30385 
30386   AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode());
30387   unsigned SrcAS = N->getSrcAddressSpace();
30388 
30389   assert(SrcAS != N->getDestAddressSpace() &&
30390          "addrspacecast must be between different address spaces");
30391 
30392   if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64) {
30393     Op = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src);
30394   } else if (DstVT == MVT::i64) {
30395     Op = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src);
30396   } else if (DstVT == MVT::i32) {
30397     Op = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src);
30398   } else {
30399     report_fatal_error("Bad address space in addrspacecast");
30400   }
30401   return Op;
30402 }
30403 
30404 SDValue X86TargetLowering::LowerGC_TRANSITION(SDValue Op,
30405                                               SelectionDAG &DAG) const {
30406   // TODO: Eventually, the lowering of these nodes should be informed by or
30407   // deferred to the GC strategy for the function in which they appear. For
30408   // now, however, they must be lowered to something. Since they are logically
30409   // no-ops in the case of a null GC strategy (or a GC strategy which does not
30410   // require special handling for these nodes), lower them as literal NOOPs for
30411   // the time being.
30412   SmallVector<SDValue, 2> Ops;
30413 
30414   Ops.push_back(Op.getOperand(0));
30415   if (Op->getGluedNode())
30416     Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
30417 
30418   SDLoc OpDL(Op);
30419   SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
30420   SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
30421 
30422   return NOOP;
30423 }
30424 
30425 // Custom split CVTPS2PH with wide types.
30426 static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG) {
30427   SDLoc dl(Op);
30428   EVT VT = Op.getValueType();
30429   SDValue Lo, Hi;
30430   std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
30431   EVT LoVT, HiVT;
30432   std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
30433   SDValue RC = Op.getOperand(1);
30434   Lo = DAG.getNode(X86ISD::CVTPS2PH, dl, LoVT, Lo, RC);
30435   Hi = DAG.getNode(X86ISD::CVTPS2PH, dl, HiVT, Hi, RC);
30436   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
30437 }
30438 
30439 /// Provide custom lowering hooks for some operations.
30440 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
30441   switch (Op.getOpcode()) {
30442   default: llvm_unreachable("Should not custom lower this!");
30443   case ISD::ATOMIC_FENCE:       return LowerATOMIC_FENCE(Op, Subtarget, DAG);
30444   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
30445     return LowerCMP_SWAP(Op, Subtarget, DAG);
30446   case ISD::CTPOP:              return LowerCTPOP(Op, Subtarget, DAG);
30447   case ISD::ATOMIC_LOAD_ADD:
30448   case ISD::ATOMIC_LOAD_SUB:
30449   case ISD::ATOMIC_LOAD_OR:
30450   case ISD::ATOMIC_LOAD_XOR:
30451   case ISD::ATOMIC_LOAD_AND:    return lowerAtomicArith(Op, DAG, Subtarget);
30452   case ISD::ATOMIC_STORE:       return LowerATOMIC_STORE(Op, DAG, Subtarget);
30453   case ISD::BITREVERSE:         return LowerBITREVERSE(Op, Subtarget, DAG);
30454   case ISD::PARITY:             return LowerPARITY(Op, Subtarget, DAG);
30455   case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
30456   case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
30457   case ISD::VECTOR_SHUFFLE:     return lowerVECTOR_SHUFFLE(Op, Subtarget, DAG);
30458   case ISD::VSELECT:            return LowerVSELECT(Op, DAG);
30459   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
30460   case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
30461   case ISD::INSERT_SUBVECTOR:   return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
30462   case ISD::EXTRACT_SUBVECTOR:  return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
30463   case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
30464   case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
30465   case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
30466   case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
30467   case ISD::ExternalSymbol:     return LowerExternalSymbol(Op, DAG);
30468   case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
30469   case ISD::SHL_PARTS:
30470   case ISD::SRA_PARTS:
30471   case ISD::SRL_PARTS:          return LowerShiftParts(Op, DAG);
30472   case ISD::FSHL:
30473   case ISD::FSHR:               return LowerFunnelShift(Op, Subtarget, DAG);
30474   case ISD::STRICT_SINT_TO_FP:
30475   case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
30476   case ISD::STRICT_UINT_TO_FP:
30477   case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
30478   case ISD::TRUNCATE:           return LowerTRUNCATE(Op, DAG);
30479   case ISD::ZERO_EXTEND:        return LowerZERO_EXTEND(Op, Subtarget, DAG);
30480   case ISD::SIGN_EXTEND:        return LowerSIGN_EXTEND(Op, Subtarget, DAG);
30481   case ISD::ANY_EXTEND:         return LowerANY_EXTEND(Op, Subtarget, DAG);
30482   case ISD::ZERO_EXTEND_VECTOR_INREG:
30483   case ISD::SIGN_EXTEND_VECTOR_INREG:
30484     return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
30485   case ISD::FP_TO_SINT:
30486   case ISD::STRICT_FP_TO_SINT:
30487   case ISD::FP_TO_UINT:
30488   case ISD::STRICT_FP_TO_UINT:  return LowerFP_TO_INT(Op, DAG);
30489   case ISD::FP_TO_SINT_SAT:
30490   case ISD::FP_TO_UINT_SAT:     return LowerFP_TO_INT_SAT(Op, DAG);
30491   case ISD::FP_EXTEND:
30492   case ISD::STRICT_FP_EXTEND:   return LowerFP_EXTEND(Op, DAG);
30493   case ISD::FP_ROUND:
30494   case ISD::STRICT_FP_ROUND:    return LowerFP_ROUND(Op, DAG);
30495   case ISD::FP16_TO_FP:
30496   case ISD::STRICT_FP16_TO_FP:  return LowerFP16_TO_FP(Op, DAG);
30497   case ISD::FP_TO_FP16:
30498   case ISD::STRICT_FP_TO_FP16:  return LowerFP_TO_FP16(Op, DAG);
30499   case ISD::LOAD:               return LowerLoad(Op, Subtarget, DAG);
30500   case ISD::STORE:              return LowerStore(Op, Subtarget, DAG);
30501   case ISD::FADD:
30502   case ISD::FSUB:               return lowerFaddFsub(Op, DAG);
30503   case ISD::FROUND:             return LowerFROUND(Op, DAG);
30504   case ISD::FABS:
30505   case ISD::FNEG:               return LowerFABSorFNEG(Op, DAG);
30506   case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
30507   case ISD::FGETSIGN:           return LowerFGETSIGN(Op, DAG);
30508   case ISD::LRINT:
30509   case ISD::LLRINT:             return LowerLRINT_LLRINT(Op, DAG);
30510   case ISD::SETCC:
30511   case ISD::STRICT_FSETCC:
30512   case ISD::STRICT_FSETCCS:     return LowerSETCC(Op, DAG);
30513   case ISD::SETCCCARRY:         return LowerSETCCCARRY(Op, DAG);
30514   case ISD::SELECT:             return LowerSELECT(Op, DAG);
30515   case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
30516   case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
30517   case ISD::VASTART:            return LowerVASTART(Op, DAG);
30518   case ISD::VAARG:              return LowerVAARG(Op, DAG);
30519   case ISD::VACOPY:             return LowerVACOPY(Op, Subtarget, DAG);
30520   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
30521   case ISD::INTRINSIC_VOID:
30522   case ISD::INTRINSIC_W_CHAIN:  return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
30523   case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
30524   case ISD::ADDROFRETURNADDR:   return LowerADDROFRETURNADDR(Op, DAG);
30525   case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
30526   case ISD::FRAME_TO_ARGS_OFFSET:
30527                                 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
30528   case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
30529   case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
30530   case ISD::EH_SJLJ_SETJMP:     return lowerEH_SJLJ_SETJMP(Op, DAG);
30531   case ISD::EH_SJLJ_LONGJMP:    return lowerEH_SJLJ_LONGJMP(Op, DAG);
30532   case ISD::EH_SJLJ_SETUP_DISPATCH:
30533     return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
30534   case ISD::INIT_TRAMPOLINE:    return LowerINIT_TRAMPOLINE(Op, DAG);
30535   case ISD::ADJUST_TRAMPOLINE:  return LowerADJUST_TRAMPOLINE(Op, DAG);
30536   case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
30537   case ISD::SET_ROUNDING:       return LowerSET_ROUNDING(Op, DAG);
30538   case ISD::CTLZ:
30539   case ISD::CTLZ_ZERO_UNDEF:    return LowerCTLZ(Op, Subtarget, DAG);
30540   case ISD::CTTZ:
30541   case ISD::CTTZ_ZERO_UNDEF:    return LowerCTTZ(Op, Subtarget, DAG);
30542   case ISD::MUL:                return LowerMUL(Op, Subtarget, DAG);
30543   case ISD::MULHS:
30544   case ISD::MULHU:              return LowerMULH(Op, Subtarget, DAG);
30545   case ISD::ROTL:
30546   case ISD::ROTR:               return LowerRotate(Op, Subtarget, DAG);
30547   case ISD::SRA:
30548   case ISD::SRL:
30549   case ISD::SHL:                return LowerShift(Op, Subtarget, DAG);
30550   case ISD::SADDO:
30551   case ISD::UADDO:
30552   case ISD::SSUBO:
30553   case ISD::USUBO:              return LowerXALUO(Op, DAG);
30554   case ISD::SMULO:
30555   case ISD::UMULO:              return LowerMULO(Op, Subtarget, DAG);
30556   case ISD::READCYCLECOUNTER:   return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
30557   case ISD::BITCAST:            return LowerBITCAST(Op, Subtarget, DAG);
30558   case ISD::SADDO_CARRY:
30559   case ISD::SSUBO_CARRY:
30560   case ISD::ADDCARRY:
30561   case ISD::SUBCARRY:           return LowerADDSUBCARRY(Op, DAG);
30562   case ISD::ADD:
30563   case ISD::SUB:                return lowerAddSub(Op, DAG, Subtarget);
30564   case ISD::UADDSAT:
30565   case ISD::SADDSAT:
30566   case ISD::USUBSAT:
30567   case ISD::SSUBSAT:            return LowerADDSAT_SUBSAT(Op, DAG, Subtarget);
30568   case ISD::SMAX:
30569   case ISD::SMIN:
30570   case ISD::UMAX:
30571   case ISD::UMIN:               return LowerMINMAX(Op, DAG);
30572   case ISD::ABS:                return LowerABS(Op, Subtarget, DAG);
30573   case ISD::FSINCOS:            return LowerFSINCOS(Op, Subtarget, DAG);
30574   case ISD::MLOAD:              return LowerMLOAD(Op, Subtarget, DAG);
30575   case ISD::MSTORE:             return LowerMSTORE(Op, Subtarget, DAG);
30576   case ISD::MGATHER:            return LowerMGATHER(Op, Subtarget, DAG);
30577   case ISD::MSCATTER:           return LowerMSCATTER(Op, Subtarget, DAG);
30578   case ISD::GC_TRANSITION_START:
30579   case ISD::GC_TRANSITION_END:  return LowerGC_TRANSITION(Op, DAG);
30580   case ISD::ADDRSPACECAST:      return LowerADDRSPACECAST(Op, DAG);
30581   case X86ISD::CVTPS2PH:        return LowerCVTPS2PH(Op, DAG);
30582   }
30583 }
30584 
30585 /// Replace a node with an illegal result type with a new node built out of
30586 /// custom code.
30587 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
30588                                            SmallVectorImpl<SDValue>&Results,
30589                                            SelectionDAG &DAG) const {
30590   SDLoc dl(N);
30591   switch (N->getOpcode()) {
30592   default:
30593 #ifndef NDEBUG
30594     dbgs() << "ReplaceNodeResults: ";
30595     N->dump(&DAG);
30596 #endif
30597     llvm_unreachable("Do not know how to custom type legalize this operation!");
30598   case X86ISD::CVTPH2PS: {
30599     EVT VT = N->getValueType(0);
30600     SDValue Lo, Hi;
30601     std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
30602     EVT LoVT, HiVT;
30603     std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
30604     Lo = DAG.getNode(X86ISD::CVTPH2PS, dl, LoVT, Lo);
30605     Hi = DAG.getNode(X86ISD::CVTPH2PS, dl, HiVT, Hi);
30606     SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
30607     Results.push_back(Res);
30608     return;
30609   }
30610   case X86ISD::STRICT_CVTPH2PS: {
30611     EVT VT = N->getValueType(0);
30612     SDValue Lo, Hi;
30613     std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 1);
30614     EVT LoVT, HiVT;
30615     std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
30616     Lo = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {LoVT, MVT::Other},
30617                      {N->getOperand(0), Lo});
30618     Hi = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {HiVT, MVT::Other},
30619                      {N->getOperand(0), Hi});
30620     SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
30621                                 Lo.getValue(1), Hi.getValue(1));
30622     SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
30623     Results.push_back(Res);
30624     Results.push_back(Chain);
30625     return;
30626   }
30627   case X86ISD::CVTPS2PH:
30628     Results.push_back(LowerCVTPS2PH(SDValue(N, 0), DAG));
30629     return;
30630   case ISD::CTPOP: {
30631     assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
30632     // Use a v2i64 if possible.
30633     bool NoImplicitFloatOps =
30634         DAG.getMachineFunction().getFunction().hasFnAttribute(
30635             Attribute::NoImplicitFloat);
30636     if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) {
30637       SDValue Wide =
30638           DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));
30639       Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide);
30640       // The bit count should fit in 32 bits; extract it as an i32 and then
30641       // zero-extend to i64. Otherwise we end up extracting bits 63:32 separately.
30642       Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide);
30643       Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide,
30644                          DAG.getIntPtrConstant(0, dl));
30645       Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide);
30646       Results.push_back(Wide);
30647     }
30648     return;
30649   }
30650   case ISD::MUL: {
30651     EVT VT = N->getValueType(0);
30652     assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
30653            VT.getVectorElementType() == MVT::i8 && "Unexpected VT!");
30654     // Pre-promote these to vXi16 to avoid op legalization thinking all 16
30655     // elements are needed.
30656     MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
30657     SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));
30658     SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));
30659     SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1);
30660     Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
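    // Widen the truncated result back out to v16i8 with undef upper elements,
    // which is the type the legalizer expects here.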
30661     unsigned NumConcats = 16 / VT.getVectorNumElements();
30662     SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
30663     ConcatOps[0] = Res;
30664     Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps);
30665     Results.push_back(Res);
30666     return;
30667   }
30668   case X86ISD::VPMADDWD:
30669   case X86ISD::AVG: {
30670     // Legalize types for X86ISD::AVG/VPMADDWD by widening.
30671     assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
30672 
30673     EVT VT = N->getValueType(0);
30674     EVT InVT = N->getOperand(0).getValueType();
30675     assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 &&
30676            "Expected a VT that divides into 128 bits.");
30677     assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
30678            "Unexpected type action!");
30679     unsigned NumConcat = 128 / InVT.getSizeInBits();
30680 
30681     EVT InWideVT = EVT::getVectorVT(*DAG.getContext(),
30682                                     InVT.getVectorElementType(),
30683                                     NumConcat * InVT.getVectorNumElements());
30684     EVT WideVT = EVT::getVectorVT(*DAG.getContext(),
30685                                   VT.getVectorElementType(),
30686                                   NumConcat * VT.getVectorNumElements());
30687 
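    // Pad each input out to 128 bits with undef elements; only the low part
    // of the wide result is actually used.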
30688     SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
30689     Ops[0] = N->getOperand(0);
30690     SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
30691     Ops[0] = N->getOperand(1);
30692     SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
30693 
30694     SDValue Res = DAG.getNode(N->getOpcode(), dl, WideVT, InVec0, InVec1);
30695     Results.push_back(Res);
30696     return;
30697   }
30698   // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
30699   case X86ISD::FMINC:
30700   case X86ISD::FMIN:
30701   case X86ISD::FMAXC:
30702   case X86ISD::FMAX: {
30703     EVT VT = N->getValueType(0);
30704     assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
30705     SDValue UNDEF = DAG.getUNDEF(VT);
30706     SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
30707                               N->getOperand(0), UNDEF);
30708     SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
30709                               N->getOperand(1), UNDEF);
30710     Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
30711     return;
30712   }
30713   case ISD::SDIV:
30714   case ISD::UDIV:
30715   case ISD::SREM:
30716   case ISD::UREM: {
30717     EVT VT = N->getValueType(0);
30718     if (VT.isVector()) {
30719       assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
30720              "Unexpected type action!");
30721       // If this RHS is a constant splat vector we can widen this and let
30722       // division/remainder by constant optimize it.
30723       // TODO: Can we do something for non-splat?
30724       APInt SplatVal;
30725       if (ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal)) {
30726         unsigned NumConcats = 128 / VT.getSizeInBits();
30727         SmallVector<SDValue, 8> Ops0(NumConcats, DAG.getUNDEF(VT));
30728         Ops0[0] = N->getOperand(0);
30729         EVT ResVT = getTypeToTransformTo(*DAG.getContext(), VT);
30730         SDValue N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Ops0);
30731         SDValue N1 = DAG.getConstant(SplatVal, dl, ResVT);
30732         SDValue Res = DAG.getNode(N->getOpcode(), dl, ResVT, N0, N1);
30733         Results.push_back(Res);
30734       }
30735       return;
30736     }
30737 
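    // Scalar case: this is expected to be an i128 divide/remainder on Win64,
    // which is lowered to a libcall.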
30738     SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
30739     Results.push_back(V);
30740     return;
30741   }
30742   case ISD::TRUNCATE: {
30743     MVT VT = N->getSimpleValueType(0);
30744     if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
30745       return;
30746 
30747     // The generic legalizer will try to widen the input type to the same
30748     // number of elements as the widened result type. But this isn't always
30749     // the best thing so do some custom legalization to avoid some cases.
30750     MVT WidenVT = getTypeToTransformTo(*DAG.getContext(), VT).getSimpleVT();
30751     SDValue In = N->getOperand(0);
30752     EVT InVT = In.getValueType();
30753 
30754     unsigned InBits = InVT.getSizeInBits();
30755     if (128 % InBits == 0) {
30756       // 128-bit and smaller inputs should avoid the truncate altogether and
30757       // just use a build_vector that will become a shuffle.
30758       // TODO: Widen and use a shuffle directly?
30759       MVT InEltVT = InVT.getSimpleVT().getVectorElementType();
30760       EVT EltVT = VT.getVectorElementType();
30761       unsigned WidenNumElts = WidenVT.getVectorNumElements();
30762       SmallVector<SDValue, 16> Ops(WidenNumElts, DAG.getUNDEF(EltVT));
30763       // Use the original element count so we don't do more scalar opts than
30764       // necessary.
30765       unsigned MinElts = VT.getVectorNumElements();
30766       for (unsigned i=0; i < MinElts; ++i) {
30767         SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, In,
30768                                   DAG.getIntPtrConstant(i, dl));
30769         Ops[i] = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Val);
30770       }
30771       Results.push_back(DAG.getBuildVector(WidenVT, dl, Ops));
30772       return;
30773     }
30774     // With AVX512 there are some cases that can use a target specific
30775     // truncate node to go from 256/512 to less than 128 with zeros in the
30776     // upper elements of the 128 bit result.
30777     if (Subtarget.hasAVX512() && isTypeLegal(InVT)) {
30778       // We can use VTRUNC directly for 256-bit inputs with VLX, or for any 512-bit input.
30779       if ((InBits == 256 && Subtarget.hasVLX()) || InBits == 512) {
30780         Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
30781         return;
30782       }
30783       // There's one case we can widen to 512 bits and use VTRUNC.
30784       if (InVT == MVT::v4i64 && VT == MVT::v4i8 && isTypeLegal(MVT::v8i64)) {
30785         In = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i64, In,
30786                          DAG.getUNDEF(MVT::v4i64));
30787         Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
30788         return;
30789       }
30790     }
30791     if (Subtarget.hasVLX() && InVT == MVT::v8i64 && VT == MVT::v8i8 &&
30792         getTypeAction(*DAG.getContext(), InVT) == TypeSplitVector &&
30793         isTypeLegal(MVT::v4i64)) {
30794       // The input needs to be split and the output needs to be widened. Use two
30795       // VTRUNCs, and shuffle their results together into the wider type.
30796       SDValue Lo, Hi;
30797       std::tie(Lo, Hi) = DAG.SplitVector(In, dl);
30798 
30799       Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo);
30800       Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi);
30801       SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi,
30802                                          { 0,  1,  2,  3, 16, 17, 18, 19,
30803                                           -1, -1, -1, -1, -1, -1, -1, -1 });
30804       Results.push_back(Res);
30805       return;
30806     }
30807 
30808     return;
30809   }
30810   case ISD::ANY_EXTEND:
30811     // Right now, only MVT::v8i8 has Custom action for an illegal type.
30812     // The custom lowering is intended to handle the input type.
30813     assert(N->getValueType(0) == MVT::v8i8 &&
30814            "Do not know how to legalize this Node");
30815     return;
30816   case ISD::SIGN_EXTEND:
30817   case ISD::ZERO_EXTEND: {
30818     EVT VT = N->getValueType(0);
30819     SDValue In = N->getOperand(0);
30820     EVT InVT = In.getValueType();
30821     if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
30822         (InVT == MVT::v4i16 || InVT == MVT::v4i8)){
30823       assert(getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector &&
30824              "Unexpected type action!");
30825       assert(N->getOpcode() == ISD::SIGN_EXTEND && "Unexpected opcode");
30826       // Custom split this so we can extend i8/i16->i32 invec. This is better
30827       // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using
30828       // sra, followed by an extend from i32 to i64 using pcmpgt. By custom
30829       // splitting we allow the sra from the extend to i32 to be shared by the split.
30830       In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, In);
30831 
30832       // Fill a vector with sign bits for each element.
30833       SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
30834       SDValue SignBits = DAG.getSetCC(dl, MVT::v4i32, Zero, In, ISD::SETGT);
30835 
30836       // Create an unpackl and unpackh to interleave the sign bits then bitcast
30837       // to v2i64.
30838       SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
30839                                         {0, 4, 1, 5});
30840       Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo);
30841       SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
30842                                         {2, 6, 3, 7});
30843       Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi);
30844 
30845       SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
30846       Results.push_back(Res);
30847       return;
30848     }
30849 
30850     if (VT == MVT::v16i32 || VT == MVT::v8i64) {
30851       if (!InVT.is128BitVector()) {
30852         // Not a 128 bit vector, but maybe type legalization will promote
30853         // it to 128 bits.
30854         if (getTypeAction(*DAG.getContext(), InVT) != TypePromoteInteger)
30855           return;
30856         InVT = getTypeToTransformTo(*DAG.getContext(), InVT);
30857         if (!InVT.is128BitVector())
30858           return;
30859 
30860         // Promote the input to 128 bits. Type legalization will turn this into
30861         // zext_inreg/sext_inreg.
30862         In = DAG.getNode(N->getOpcode(), dl, InVT, In);
30863       }
30864 
30865       // Perform custom splitting instead of the two stage extend we would get
30866       // by default.
30867       EVT LoVT, HiVT;
30868       std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
30869       assert(isTypeLegal(LoVT) && "Split VT not legal?");
30870 
30871       SDValue Lo = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, LoVT, In, DAG);
30872 
30873       // We need to shift the input over by half the number of elements.
30874       unsigned NumElts = InVT.getVectorNumElements();
30875       unsigned HalfNumElts = NumElts / 2;
30876       SmallVector<int, 16> ShufMask(NumElts, SM_SentinelUndef);
30877       for (unsigned i = 0; i != HalfNumElts; ++i)
30878         ShufMask[i] = i + HalfNumElts;
30879 
30880       SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
30881       Hi = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, HiVT, Hi, DAG);
30882 
30883       SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
30884       Results.push_back(Res);
30885     }
30886     return;
30887   }
30888   case ISD::FP_TO_SINT:
30889   case ISD::STRICT_FP_TO_SINT:
30890   case ISD::FP_TO_UINT:
30891   case ISD::STRICT_FP_TO_UINT: {
30892     bool IsStrict = N->isStrictFPOpcode();
30893     bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT ||
30894                     N->getOpcode() == ISD::STRICT_FP_TO_SINT;
30895     EVT VT = N->getValueType(0);
30896     SDValue Src = N->getOperand(IsStrict ? 1 : 0);
30897     EVT SrcVT = Src.getValueType();
30898 
30899     if (VT.isVector() && VT.getScalarSizeInBits() < 32) {
30900       assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
30901              "Unexpected type action!");
30902 
30903       // Try to create a 128 bit vector, but don't exceed a 32 bit element.
30904       unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);
30905       MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth),
30906                                        VT.getVectorNumElements());
30907       SDValue Res;
30908       SDValue Chain;
30909       if (IsStrict) {
30910         Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {PromoteVT, MVT::Other},
30911                           {N->getOperand(0), Src});
30912         Chain = Res.getValue(1);
30913       } else
30914         Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);
30915 
30916       // Preserve what we know about the size of the original result. If the
30917       // result is v2i32, we have to manually widen the assert.
30918       if (PromoteVT == MVT::v2i32)
30919         Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
30920                           DAG.getUNDEF(MVT::v2i32));
30921 
30922       Res = DAG.getNode(!IsSigned ? ISD::AssertZext : ISD::AssertSext, dl,
30923                         Res.getValueType(), Res,
30924                         DAG.getValueType(VT.getVectorElementType()));
30925 
30926       if (PromoteVT == MVT::v2i32)
30927         Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
30928                           DAG.getIntPtrConstant(0, dl));
30929 
30930       // Truncate back to the original width.
30931       Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
30932 
30933       // Now widen to 128 bits.
30934       unsigned NumConcats = 128 / VT.getSizeInBits();
30935       MVT ConcatVT = MVT::getVectorVT(VT.getSimpleVT().getVectorElementType(),
30936                                       VT.getVectorNumElements() * NumConcats);
30937       SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
30938       ConcatOps[0] = Res;
30939       Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
30940       Results.push_back(Res);
30941       if (IsStrict)
30942         Results.push_back(Chain);
30943       return;
30944     }
30945 
30946 
30947     if (VT == MVT::v2i32) {
30948       assert((!IsStrict || IsSigned || Subtarget.hasAVX512()) &&
30949              "Strict unsigned conversion requires AVX512");
30950       assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
30951       assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
30952              "Unexpected type action!");
30953       if (Src.getValueType() == MVT::v2f64) {
30954         if (!IsSigned && !Subtarget.hasAVX512()) {
30955           SDValue Res =
30956               expandFP_TO_UINT_SSE(MVT::v4i32, Src, dl, DAG, Subtarget);
30957           Results.push_back(Res);
30958           return;
30959         }
30960 
30961         unsigned Opc;
30962         if (IsStrict)
30963           Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
30964         else
30965           Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
30966 
30967         // If we have VLX we can emit a target-specific FP_TO_UINT node.
30968         if (!IsSigned && !Subtarget.hasVLX()) {
30969           // Otherwise we can defer to the generic legalizer which will widen
30970           // the input as well. This will be further widened during op
30971           // legalization to v8i32<-v8f64.
30972           // For strict nodes we'll need to widen ourselves.
30973           // FIXME: Fix the type legalizer to safely widen strict nodes?
30974           if (!IsStrict)
30975             return;
30976           Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, Src,
30977                             DAG.getConstantFP(0.0, dl, MVT::v2f64));
30978           Opc = N->getOpcode();
30979         }
30980         SDValue Res;
30981         SDValue Chain;
30982         if (IsStrict) {
30983           Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
30984                             {N->getOperand(0), Src});
30985           Chain = Res.getValue(1);
30986         } else {
30987           Res = DAG.getNode(Opc, dl, MVT::v4i32, Src);
30988         }
30989         Results.push_back(Res);
30990         if (IsStrict)
30991           Results.push_back(Chain);
30992         return;
30993       }
30994 
30995       // Custom widen strict v2f32->v2i32 by padding with zeros.
30996       // FIXME: Should generic type legalizer do this?
30997       if (Src.getValueType() == MVT::v2f32 && IsStrict) {
30998         Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
30999                           DAG.getConstantFP(0.0, dl, MVT::v2f32));
31000         SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4i32, MVT::Other},
31001                                   {N->getOperand(0), Src});
31002         Results.push_back(Res);
31003         Results.push_back(Res.getValue(1));
31004         return;
31005       }
31006 
31007       // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
31008       // so early out here.
31009       return;
31010     }
31011 
31012     assert(!VT.isVector() && "Vectors should have been handled above!");
31013 
31014     if (Subtarget.hasDQI() && VT == MVT::i64 &&
31015         (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
31016       assert(!Subtarget.is64Bit() && "i64 should be legal");
31017       unsigned NumElts = Subtarget.hasVLX() ? 2 : 8;
31018       // If we use a 128-bit result we might need to use a target specific node.
31019       unsigned SrcElts =
31020           std::max(NumElts, 128U / (unsigned)SrcVT.getSizeInBits());
31021       MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
31022       MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), SrcElts);
31023       unsigned Opc = N->getOpcode();
31024       if (NumElts != SrcElts) {
31025         if (IsStrict)
31026           Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
31027         else
31028           Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
31029       }
31030 
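      // Round-trip through a vector register: insert the scalar source into
      // lane 0 of a zero vector, convert the whole vector, then extract lane 0
      // as the i64 result.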
31031       SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
31032       SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
31033                                 DAG.getConstantFP(0.0, dl, VecInVT), Src,
31034                                 ZeroIdx);
31035       SDValue Chain;
31036       if (IsStrict) {
31037         SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
31038         Res = DAG.getNode(Opc, SDLoc(N), Tys, N->getOperand(0), Res);
31039         Chain = Res.getValue(1);
31040       } else
31041         Res = DAG.getNode(Opc, SDLoc(N), VecVT, Res);
31042       Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
31043       Results.push_back(Res);
31044       if (IsStrict)
31045         Results.push_back(Chain);
31046       return;
31047     }
31048 
31049     SDValue Chain;
31050     if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, Chain)) {
31051       Results.push_back(V);
31052       if (IsStrict)
31053         Results.push_back(Chain);
31054     }
31055     return;
31056   }
31057   case ISD::LRINT:
31058   case ISD::LLRINT: {
31059     if (SDValue V = LRINT_LLRINTHelper(N, DAG))
31060       Results.push_back(V);
31061     return;
31062   }
31063 
31064   case ISD::SINT_TO_FP:
31065   case ISD::STRICT_SINT_TO_FP:
31066   case ISD::UINT_TO_FP:
31067   case ISD::STRICT_UINT_TO_FP: {
31068     bool IsStrict = N->isStrictFPOpcode();
31069     bool IsSigned = N->getOpcode() == ISD::SINT_TO_FP ||
31070                     N->getOpcode() == ISD::STRICT_SINT_TO_FP;
31071     EVT VT = N->getValueType(0);
31072     if (VT != MVT::v2f32)
31073       return;
31074     SDValue Src = N->getOperand(IsStrict ? 1 : 0);
31075     EVT SrcVT = Src.getValueType();
31076     if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
31077       if (IsStrict) {
31078         unsigned Opc = IsSigned ? X86ISD::STRICT_CVTSI2P
31079                                 : X86ISD::STRICT_CVTUI2P;
31080         SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
31081                                   {N->getOperand(0), Src});
31082         Results.push_back(Res);
31083         Results.push_back(Res.getValue(1));
31084       } else {
31085         unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
31086         Results.push_back(DAG.getNode(Opc, dl, MVT::v4f32, Src));
31087       }
31088       return;
31089     }
31090     if (SrcVT == MVT::v2i64 && !IsSigned && Subtarget.is64Bit() &&
31091         Subtarget.hasSSE41() && !Subtarget.hasAVX512()) {
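      // Unsigned v2i64 -> v2f32 without AVX512: for lanes with the sign bit
      // set, halve the value while ORing the low bit back in (round-to-odd, so
      // the final rounding is still correct), convert as signed, and double the
      // result (e.g. x = 2^63 goes through 2^62 and doubles back to 2^63);
      // other lanes use the plain signed conversion.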
31092       SDValue Zero = DAG.getConstant(0, dl, SrcVT);
31093       SDValue One  = DAG.getConstant(1, dl, SrcVT);
31094       SDValue Sign = DAG.getNode(ISD::OR, dl, SrcVT,
31095                                  DAG.getNode(ISD::SRL, dl, SrcVT, Src, One),
31096                                  DAG.getNode(ISD::AND, dl, SrcVT, Src, One));
31097       SDValue IsNeg = DAG.getSetCC(dl, MVT::v2i64, Src, Zero, ISD::SETLT);
31098       SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src);
31099       SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32));
31100       for (int i = 0; i != 2; ++i) {
31101         SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
31102                                   SignSrc, DAG.getIntPtrConstant(i, dl));
31103         if (IsStrict)
31104           SignCvts[i] =
31105               DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other},
31106                           {N->getOperand(0), Elt});
31107         else
31108           SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Elt);
31109       }
31110       SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts);
31111       SDValue Slow, Chain;
31112       if (IsStrict) {
31113         Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
31114                             SignCvts[0].getValue(1), SignCvts[1].getValue(1));
31115         Slow = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v4f32, MVT::Other},
31116                            {Chain, SignCvt, SignCvt});
31117         Chain = Slow.getValue(1);
31118       } else {
31119         Slow = DAG.getNode(ISD::FADD, dl, MVT::v4f32, SignCvt, SignCvt);
31120       }
31121       IsNeg = DAG.getBitcast(MVT::v4i32, IsNeg);
31122       IsNeg =
31123           DAG.getVectorShuffle(MVT::v4i32, dl, IsNeg, IsNeg, {1, 3, -1, -1});
31124       SDValue Cvt = DAG.getSelect(dl, MVT::v4f32, IsNeg, Slow, SignCvt);
31125       Results.push_back(Cvt);
31126       if (IsStrict)
31127         Results.push_back(Chain);
31128       return;
31129     }
31130 
31131     if (SrcVT != MVT::v2i32)
31132       return;
31133 
31134     if (IsSigned || Subtarget.hasAVX512()) {
31135       if (!IsStrict)
31136         return;
31137 
31138       // Custom widen strict v2i32->v2f32 to avoid scalarization.
31139       // FIXME: Should generic type legalizer do this?
31140       Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
31141                         DAG.getConstant(0, dl, MVT::v2i32));
31142       SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
31143                                 {N->getOperand(0), Src});
31144       Results.push_back(Res);
31145       Results.push_back(Res.getValue(1));
31146       return;
31147     }
31148 
31149     assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
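    // Standard u32->f64 trick: zero-extend to i64 and OR in the exponent bits
    // of 2^52 (0x4330000000000000) so each integer lands in a double's mantissa,
    // then subtract 2^52 to recover the exact value (e.g. for x = 7 the pattern
    // is the double 2^52 + 7, and (2^52 + 7) - 2^52 == 7.0). The v2f64 result is
    // then rounded down to v2f32.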
31150     SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
31151     SDValue VBias =
31152         DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
31153     SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
31154                              DAG.getBitcast(MVT::v2i64, VBias));
31155     Or = DAG.getBitcast(MVT::v2f64, Or);
31156     if (IsStrict) {
31157       SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
31158                                 {N->getOperand(0), Or, VBias});
31159       SDValue Res = DAG.getNode(X86ISD::STRICT_VFPROUND, dl,
31160                                 {MVT::v4f32, MVT::Other},
31161                                 {Sub.getValue(1), Sub});
31162       Results.push_back(Res);
31163       Results.push_back(Res.getValue(1));
31164     } else {
31165       // TODO: Are there any fast-math-flags to propagate here?
31166       SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
31167       Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
31168     }
31169     return;
31170   }
31171   case ISD::STRICT_FP_ROUND:
31172   case ISD::FP_ROUND: {
31173     bool IsStrict = N->isStrictFPOpcode();
31174     SDValue Src = N->getOperand(IsStrict ? 1 : 0);
31175     if (!isTypeLegal(Src.getValueType()))
31176       return;
31177     SDValue V;
31178     if (IsStrict)
31179       V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {MVT::v4f32, MVT::Other},
31180                       {N->getOperand(0), N->getOperand(1)});
31181     else
31182       V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
31183     Results.push_back(V);
31184     if (IsStrict)
31185       Results.push_back(V.getValue(1));
31186     return;
31187   }
31188   case ISD::FP_EXTEND:
31189   case ISD::STRICT_FP_EXTEND: {
31190     // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
31191     // No other ValueType for FP_EXTEND should reach this point.
31192     assert(N->getValueType(0) == MVT::v2f32 &&
31193            "Do not know how to legalize this Node");
31194     return;
31195   }
31196   case ISD::INTRINSIC_W_CHAIN: {
31197     unsigned IntNo = N->getConstantOperandVal(1);
31198     switch (IntNo) {
31199     default : llvm_unreachable("Do not know how to custom type "
31200                                "legalize this intrinsic operation!");
31201     case Intrinsic::x86_rdtsc:
31202       return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget,
31203                                      Results);
31204     case Intrinsic::x86_rdtscp:
31205       return getReadTimeStampCounter(N, dl, X86::RDTSCP, DAG, Subtarget,
31206                                      Results);
31207     case Intrinsic::x86_rdpmc:
31208       expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget,
31209                                   Results);
31210       return;
31211     case Intrinsic::x86_xgetbv:
31212       expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget,
31213                                   Results);
31214       return;
31215     }
31216   }
31217   case ISD::READCYCLECOUNTER: {
31218     return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results);
31219   }
31220   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
31221     EVT T = N->getValueType(0);
31222     assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
31223     bool Regs64bit = T == MVT::i128;
31224     assert((!Regs64bit || Subtarget.hasCmpxchg16b()) &&
31225            "64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B");
31226     MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
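    // CMPXCHG8B/CMPXCHG16B take the expected value in EDX:EAX (RDX:RAX) and
    // the replacement in ECX:EBX (RCX:RBX); the old value comes back in
    // EDX:EAX (RDX:RAX) and ZF reports success.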
31227     SDValue cpInL, cpInH;
31228     cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
31229                         DAG.getConstant(0, dl, HalfT));
31230     cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
31231                         DAG.getConstant(1, dl, HalfT));
31232     cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
31233                              Regs64bit ? X86::RAX : X86::EAX,
31234                              cpInL, SDValue());
31235     cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
31236                              Regs64bit ? X86::RDX : X86::EDX,
31237                              cpInH, cpInL.getValue(1));
31238     SDValue swapInL, swapInH;
31239     swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
31240                           DAG.getConstant(0, dl, HalfT));
31241     swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
31242                           DAG.getConstant(1, dl, HalfT));
31243     swapInH =
31244         DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
31245                          swapInH, cpInH.getValue(1));
31246 
31247     // In 64-bit mode we might need the base pointer in RBX, but we can't know
31248     // until later. So we keep the RBX input in a vreg and use a custom
31249     // inserter.
31250     // Since RBX will be a reserved register, the register allocator will not
31251     // ensure that its value is properly saved and restored around this
31252     // live-range.
31253     SDValue Result;
31254     SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
31255     MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
31256     if (Regs64bit) {
31257       SDValue Ops[] = {swapInH.getValue(0), N->getOperand(1), swapInL,
31258                        swapInH.getValue(1)};
31259       Result =
31260           DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG16_DAG, dl, Tys, Ops, T, MMO);
31261     } else {
31262       swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl, X86::EBX, swapInL,
31263                                  swapInH.getValue(1));
31264       SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
31265                        swapInL.getValue(1)};
31266       Result =
31267           DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, T, MMO);
31268     }
31269 
31270     SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
31271                                         Regs64bit ? X86::RAX : X86::EAX,
31272                                         HalfT, Result.getValue(1));
31273     SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
31274                                         Regs64bit ? X86::RDX : X86::EDX,
31275                                         HalfT, cpOutL.getValue(2));
31276     SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
31277 
31278     SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
31279                                         MVT::i32, cpOutH.getValue(2));
31280     SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
31281     Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
31282 
31283     Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
31284     Results.push_back(Success);
31285     Results.push_back(EFLAGS.getValue(1));
31286     return;
31287   }
31288   case ISD::ATOMIC_LOAD: {
31289     assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
31290     bool NoImplicitFloatOps =
31291         DAG.getMachineFunction().getFunction().hasFnAttribute(
31292             Attribute::NoImplicitFloat);
31293     if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
31294       auto *Node = cast<AtomicSDNode>(N);
31295       if (Subtarget.hasSSE1()) {
31296         // Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS.
31297         // Then extract the lower 64-bits.
31298         MVT LdVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
31299         SDVTList Tys = DAG.getVTList(LdVT, MVT::Other);
31300         SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
31301         SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
31302                                              MVT::i64, Node->getMemOperand());
31303         if (Subtarget.hasSSE2()) {
31304           SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
31305                                     DAG.getIntPtrConstant(0, dl));
31306           Results.push_back(Res);
31307           Results.push_back(Ld.getValue(1));
31308           return;
31309         }
31310         // We use an alternative sequence for SSE1 that extracts as v2f32 and
31311         // then casts to i64. This avoids a 128-bit stack temporary being
31312         // created by type legalization if we were to cast v4f32->v2i64.
31313         SDValue Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Ld,
31314                                   DAG.getIntPtrConstant(0, dl));
31315         Res = DAG.getBitcast(MVT::i64, Res);
31316         Results.push_back(Res);
31317         Results.push_back(Ld.getValue(1));
31318         return;
31319       }
31320       if (Subtarget.hasX87()) {
31321         // First load this into an 80-bit X87 register. This will put the whole
31322         // integer into the significand.
31323         SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
31324         SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
31325         SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD,
31326                                                  dl, Tys, Ops, MVT::i64,
31327                                                  Node->getMemOperand());
31328         SDValue Chain = Result.getValue(1);
31329 
31330         // Now store the X87 register to a stack temporary and convert to i64.
31331         // This store is not atomic and doesn't need to be.
31332         // FIXME: We don't need a stack temporary if the result of the load
31333         // is already being stored. We could just directly store there.
31334         SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
31335         int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
31336         MachinePointerInfo MPI =
31337             MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
31338         SDValue StoreOps[] = { Chain, Result, StackPtr };
31339         Chain = DAG.getMemIntrinsicNode(
31340             X86ISD::FIST, dl, DAG.getVTList(MVT::Other), StoreOps, MVT::i64,
31341             MPI, None /*Align*/, MachineMemOperand::MOStore);
31342 
31343         // Finally load the value back from the stack temporary and return it.
31344         // This load is not atomic and doesn't need to be.
31345         // This load will be further type legalized.
31346         Result = DAG.getLoad(MVT::i64, dl, Chain, StackPtr, MPI);
31347         Results.push_back(Result);
31348         Results.push_back(Result.getValue(1));
31349         return;
31350       }
31351     }
31352     // TODO: Use MOVLPS when SSE1 is available?
31353     // Delegate to generic TypeLegalization. Situations we can really handle
31354     // should have already been dealt with by AtomicExpandPass.cpp.
31355     break;
31356   }
31357   case ISD::ATOMIC_SWAP:
31358   case ISD::ATOMIC_LOAD_ADD:
31359   case ISD::ATOMIC_LOAD_SUB:
31360   case ISD::ATOMIC_LOAD_AND:
31361   case ISD::ATOMIC_LOAD_OR:
31362   case ISD::ATOMIC_LOAD_XOR:
31363   case ISD::ATOMIC_LOAD_NAND:
31364   case ISD::ATOMIC_LOAD_MIN:
31365   case ISD::ATOMIC_LOAD_MAX:
31366   case ISD::ATOMIC_LOAD_UMIN:
31367   case ISD::ATOMIC_LOAD_UMAX:
31368     // Delegate to generic TypeLegalization. Situations we can really handle
31369     // should have already been dealt with by AtomicExpandPass.cpp.
31370     break;
31371 
31372   case ISD::BITCAST: {
31373     assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
31374     EVT DstVT = N->getValueType(0);
31375     EVT SrcVT = N->getOperand(0).getValueType();
31376 
31377     // If this is a bitcast from a v64i1 k-register to a i64 on a 32-bit target
31378     // we can split using the k-register rather than memory.
31379     if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
31380       assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
31381       SDValue Lo, Hi;
31382       std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
31383       Lo = DAG.getBitcast(MVT::i32, Lo);
31384       Hi = DAG.getBitcast(MVT::i32, Hi);
31385       SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
31386       Results.push_back(Res);
31387       return;
31388     }
31389 
31390     if (DstVT.isVector() && SrcVT == MVT::x86mmx) {
31391       // FIXME: Use v4f32 for SSE1?
31392       assert(Subtarget.hasSSE2() && "Requires SSE2");
31393       assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&
31394              "Unexpected type action!");
31395       EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT);
31396       SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64,
31397                                 N->getOperand(0));
31398       Res = DAG.getBitcast(WideVT, Res);
31399       Results.push_back(Res);
31400       return;
31401     }
31402 
31403     return;
31404   }
31405   case ISD::MGATHER: {
31406     EVT VT = N->getValueType(0);
31407     if ((VT == MVT::v2f32 || VT == MVT::v2i32) &&
31408         (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
31409       auto *Gather = cast<MaskedGatherSDNode>(N);
31410       SDValue Index = Gather->getIndex();
31411       if (Index.getValueType() != MVT::v2i64)
31412         return;
31413       assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
31414              "Unexpected type action!");
31415       EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
31416       SDValue Mask = Gather->getMask();
31417       assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
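      // Widen the passthru to the legal result width; the memory VT stays two
      // elements wide, so the extra lanes are never loaded.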
31418       SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT,
31419                                      Gather->getPassThru(),
31420                                      DAG.getUNDEF(VT));
31421       if (!Subtarget.hasVLX()) {
31422         // We need to widen the mask, but the instruction will only use 2
31423         // of its elements. So we can use undef.
31424         Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
31425                            DAG.getUNDEF(MVT::v2i1));
31426         Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
31427       }
31428       SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
31429                         Gather->getBasePtr(), Index, Gather->getScale() };
31430       SDValue Res = DAG.getMemIntrinsicNode(
31431           X86ISD::MGATHER, dl, DAG.getVTList(WideVT, MVT::Other), Ops,
31432           Gather->getMemoryVT(), Gather->getMemOperand());
31433       Results.push_back(Res);
31434       Results.push_back(Res.getValue(1));
31435       return;
31436     }
31437     return;
31438   }
31439   case ISD::LOAD: {
31440     // Use an f64/i64 load and a scalar_to_vector for v2f32/v2i32 loads. This
31441     // avoids scalarizing in 32-bit mode. In 64-bit mode this avoids an int->fp
31442     // cast since type legalization will try to use an i64 load.
31443     MVT VT = N->getSimpleValueType(0);
31444     assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT");
31445     assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
31446            "Unexpected type action!");
31447     if (!ISD::isNON_EXTLoad(N))
31448       return;
31449     auto *Ld = cast<LoadSDNode>(N);
31450     if (Subtarget.hasSSE2()) {
31451       MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
31452       SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
31453                                 Ld->getPointerInfo(), Ld->getOriginalAlign(),
31454                                 Ld->getMemOperand()->getFlags());
31455       SDValue Chain = Res.getValue(1);
31456       MVT VecVT = MVT::getVectorVT(LdVT, 2);
31457       Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Res);
31458       EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
31459       Res = DAG.getBitcast(WideVT, Res);
31460       Results.push_back(Res);
31461       Results.push_back(Chain);
31462       return;
31463     }
31464     assert(Subtarget.hasSSE1() && "Expected SSE");
31465     SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);
31466     SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
31467     SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
31468                                           MVT::i64, Ld->getMemOperand());
31469     Results.push_back(Res);
31470     Results.push_back(Res.getValue(1));
31471     return;
31472   }
31473   case ISD::ADDRSPACECAST: {
31474     SDValue V = LowerADDRSPACECAST(SDValue(N,0), DAG);
31475     Results.push_back(V);
31476     return;
31477   }
31478   case ISD::BITREVERSE:
31479     assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
31480     assert(Subtarget.hasXOP() && "Expected XOP");
31481     // We can use VPPERM by copying to a vector register and back. We'll need
31482     // to move the scalar in two i32 pieces.
31483     Results.push_back(LowerBITREVERSE(SDValue(N, 0), Subtarget, DAG));
31484     return;
31485   }
31486 }
31487 
31488 const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
31489   switch ((X86ISD::NodeType)Opcode) {
31490   case X86ISD::FIRST_NUMBER:       break;
31491 #define NODE_NAME_CASE(NODE) case X86ISD::NODE: return "X86ISD::" #NODE;
31492   NODE_NAME_CASE(BSF)
31493   NODE_NAME_CASE(BSR)
31494   NODE_NAME_CASE(FSHL)
31495   NODE_NAME_CASE(FSHR)
31496   NODE_NAME_CASE(FAND)
31497   NODE_NAME_CASE(FANDN)
31498   NODE_NAME_CASE(FOR)
31499   NODE_NAME_CASE(FXOR)
31500   NODE_NAME_CASE(FILD)
31501   NODE_NAME_CASE(FIST)
31502   NODE_NAME_CASE(FP_TO_INT_IN_MEM)
31503   NODE_NAME_CASE(FLD)
31504   NODE_NAME_CASE(FST)
31505   NODE_NAME_CASE(CALL)
31506   NODE_NAME_CASE(CALL_RVMARKER)
31507   NODE_NAME_CASE(BT)
31508   NODE_NAME_CASE(CMP)
31509   NODE_NAME_CASE(FCMP)
31510   NODE_NAME_CASE(STRICT_FCMP)
31511   NODE_NAME_CASE(STRICT_FCMPS)
31512   NODE_NAME_CASE(COMI)
31513   NODE_NAME_CASE(UCOMI)
31514   NODE_NAME_CASE(CMPM)
31515   NODE_NAME_CASE(CMPMM)
31516   NODE_NAME_CASE(STRICT_CMPM)
31517   NODE_NAME_CASE(CMPMM_SAE)
31518   NODE_NAME_CASE(SETCC)
31519   NODE_NAME_CASE(SETCC_CARRY)
31520   NODE_NAME_CASE(FSETCC)
31521   NODE_NAME_CASE(FSETCCM)
31522   NODE_NAME_CASE(FSETCCM_SAE)
31523   NODE_NAME_CASE(CMOV)
31524   NODE_NAME_CASE(BRCOND)
31525   NODE_NAME_CASE(RET_FLAG)
31526   NODE_NAME_CASE(IRET)
31527   NODE_NAME_CASE(REP_STOS)
31528   NODE_NAME_CASE(REP_MOVS)
31529   NODE_NAME_CASE(GlobalBaseReg)
31530   NODE_NAME_CASE(Wrapper)
31531   NODE_NAME_CASE(WrapperRIP)
31532   NODE_NAME_CASE(MOVQ2DQ)
31533   NODE_NAME_CASE(MOVDQ2Q)
31534   NODE_NAME_CASE(MMX_MOVD2W)
31535   NODE_NAME_CASE(MMX_MOVW2D)
31536   NODE_NAME_CASE(PEXTRB)
31537   NODE_NAME_CASE(PEXTRW)
31538   NODE_NAME_CASE(INSERTPS)
31539   NODE_NAME_CASE(PINSRB)
31540   NODE_NAME_CASE(PINSRW)
31541   NODE_NAME_CASE(PSHUFB)
31542   NODE_NAME_CASE(ANDNP)
31543   NODE_NAME_CASE(BLENDI)
31544   NODE_NAME_CASE(BLENDV)
31545   NODE_NAME_CASE(HADD)
31546   NODE_NAME_CASE(HSUB)
31547   NODE_NAME_CASE(FHADD)
31548   NODE_NAME_CASE(FHSUB)
31549   NODE_NAME_CASE(CONFLICT)
31550   NODE_NAME_CASE(FMAX)
31551   NODE_NAME_CASE(FMAXS)
31552   NODE_NAME_CASE(FMAX_SAE)
31553   NODE_NAME_CASE(FMAXS_SAE)
31554   NODE_NAME_CASE(FMIN)
31555   NODE_NAME_CASE(FMINS)
31556   NODE_NAME_CASE(FMIN_SAE)
31557   NODE_NAME_CASE(FMINS_SAE)
31558   NODE_NAME_CASE(FMAXC)
31559   NODE_NAME_CASE(FMINC)
31560   NODE_NAME_CASE(FRSQRT)
31561   NODE_NAME_CASE(FRCP)
31562   NODE_NAME_CASE(EXTRQI)
31563   NODE_NAME_CASE(INSERTQI)
31564   NODE_NAME_CASE(TLSADDR)
31565   NODE_NAME_CASE(TLSBASEADDR)
31566   NODE_NAME_CASE(TLSCALL)
31567   NODE_NAME_CASE(EH_SJLJ_SETJMP)
31568   NODE_NAME_CASE(EH_SJLJ_LONGJMP)
31569   NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH)
31570   NODE_NAME_CASE(EH_RETURN)
31571   NODE_NAME_CASE(TC_RETURN)
31572   NODE_NAME_CASE(FNSTCW16m)
31573   NODE_NAME_CASE(FLDCW16m)
31574   NODE_NAME_CASE(LCMPXCHG_DAG)
31575   NODE_NAME_CASE(LCMPXCHG8_DAG)
31576   NODE_NAME_CASE(LCMPXCHG16_DAG)
31577   NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG)
31578   NODE_NAME_CASE(LADD)
31579   NODE_NAME_CASE(LSUB)
31580   NODE_NAME_CASE(LOR)
31581   NODE_NAME_CASE(LXOR)
31582   NODE_NAME_CASE(LAND)
31583   NODE_NAME_CASE(VZEXT_MOVL)
31584   NODE_NAME_CASE(VZEXT_LOAD)
31585   NODE_NAME_CASE(VEXTRACT_STORE)
31586   NODE_NAME_CASE(VTRUNC)
31587   NODE_NAME_CASE(VTRUNCS)
31588   NODE_NAME_CASE(VTRUNCUS)
31589   NODE_NAME_CASE(VMTRUNC)
31590   NODE_NAME_CASE(VMTRUNCS)
31591   NODE_NAME_CASE(VMTRUNCUS)
31592   NODE_NAME_CASE(VTRUNCSTORES)
31593   NODE_NAME_CASE(VTRUNCSTOREUS)
31594   NODE_NAME_CASE(VMTRUNCSTORES)
31595   NODE_NAME_CASE(VMTRUNCSTOREUS)
31596   NODE_NAME_CASE(VFPEXT)
31597   NODE_NAME_CASE(STRICT_VFPEXT)
31598   NODE_NAME_CASE(VFPEXT_SAE)
31599   NODE_NAME_CASE(VFPEXTS)
31600   NODE_NAME_CASE(VFPEXTS_SAE)
31601   NODE_NAME_CASE(VFPROUND)
31602   NODE_NAME_CASE(STRICT_VFPROUND)
31603   NODE_NAME_CASE(VMFPROUND)
31604   NODE_NAME_CASE(VFPROUND_RND)
31605   NODE_NAME_CASE(VFPROUNDS)
31606   NODE_NAME_CASE(VFPROUNDS_RND)
31607   NODE_NAME_CASE(VSHLDQ)
31608   NODE_NAME_CASE(VSRLDQ)
31609   NODE_NAME_CASE(VSHL)
31610   NODE_NAME_CASE(VSRL)
31611   NODE_NAME_CASE(VSRA)
31612   NODE_NAME_CASE(VSHLI)
31613   NODE_NAME_CASE(VSRLI)
31614   NODE_NAME_CASE(VSRAI)
31615   NODE_NAME_CASE(VSHLV)
31616   NODE_NAME_CASE(VSRLV)
31617   NODE_NAME_CASE(VSRAV)
31618   NODE_NAME_CASE(VROTLI)
31619   NODE_NAME_CASE(VROTRI)
31620   NODE_NAME_CASE(VPPERM)
31621   NODE_NAME_CASE(CMPP)
31622   NODE_NAME_CASE(STRICT_CMPP)
31623   NODE_NAME_CASE(PCMPEQ)
31624   NODE_NAME_CASE(PCMPGT)
31625   NODE_NAME_CASE(PHMINPOS)
31626   NODE_NAME_CASE(ADD)
31627   NODE_NAME_CASE(SUB)
31628   NODE_NAME_CASE(ADC)
31629   NODE_NAME_CASE(SBB)
31630   NODE_NAME_CASE(SMUL)
31631   NODE_NAME_CASE(UMUL)
31632   NODE_NAME_CASE(OR)
31633   NODE_NAME_CASE(XOR)
31634   NODE_NAME_CASE(AND)
31635   NODE_NAME_CASE(BEXTR)
31636   NODE_NAME_CASE(BEXTRI)
31637   NODE_NAME_CASE(BZHI)
31638   NODE_NAME_CASE(PDEP)
31639   NODE_NAME_CASE(PEXT)
31640   NODE_NAME_CASE(MUL_IMM)
31641   NODE_NAME_CASE(MOVMSK)
31642   NODE_NAME_CASE(PTEST)
31643   NODE_NAME_CASE(TESTP)
31644   NODE_NAME_CASE(KORTEST)
31645   NODE_NAME_CASE(KTEST)
31646   NODE_NAME_CASE(KADD)
31647   NODE_NAME_CASE(KSHIFTL)
31648   NODE_NAME_CASE(KSHIFTR)
31649   NODE_NAME_CASE(PACKSS)
31650   NODE_NAME_CASE(PACKUS)
31651   NODE_NAME_CASE(PALIGNR)
31652   NODE_NAME_CASE(VALIGN)
31653   NODE_NAME_CASE(VSHLD)
31654   NODE_NAME_CASE(VSHRD)
31655   NODE_NAME_CASE(VSHLDV)
31656   NODE_NAME_CASE(VSHRDV)
31657   NODE_NAME_CASE(PSHUFD)
31658   NODE_NAME_CASE(PSHUFHW)
31659   NODE_NAME_CASE(PSHUFLW)
31660   NODE_NAME_CASE(SHUFP)
31661   NODE_NAME_CASE(SHUF128)
31662   NODE_NAME_CASE(MOVLHPS)
31663   NODE_NAME_CASE(MOVHLPS)
31664   NODE_NAME_CASE(MOVDDUP)
31665   NODE_NAME_CASE(MOVSHDUP)
31666   NODE_NAME_CASE(MOVSLDUP)
31667   NODE_NAME_CASE(MOVSD)
31668   NODE_NAME_CASE(MOVSS)
31669   NODE_NAME_CASE(UNPCKL)
31670   NODE_NAME_CASE(UNPCKH)
31671   NODE_NAME_CASE(VBROADCAST)
31672   NODE_NAME_CASE(VBROADCAST_LOAD)
31673   NODE_NAME_CASE(VBROADCASTM)
31674   NODE_NAME_CASE(SUBV_BROADCAST_LOAD)
31675   NODE_NAME_CASE(VPERMILPV)
31676   NODE_NAME_CASE(VPERMILPI)
31677   NODE_NAME_CASE(VPERM2X128)
31678   NODE_NAME_CASE(VPERMV)
31679   NODE_NAME_CASE(VPERMV3)
31680   NODE_NAME_CASE(VPERMI)
31681   NODE_NAME_CASE(VPTERNLOG)
31682   NODE_NAME_CASE(VFIXUPIMM)
31683   NODE_NAME_CASE(VFIXUPIMM_SAE)
31684   NODE_NAME_CASE(VFIXUPIMMS)
31685   NODE_NAME_CASE(VFIXUPIMMS_SAE)
31686   NODE_NAME_CASE(VRANGE)
31687   NODE_NAME_CASE(VRANGE_SAE)
31688   NODE_NAME_CASE(VRANGES)
31689   NODE_NAME_CASE(VRANGES_SAE)
31690   NODE_NAME_CASE(PMULUDQ)
31691   NODE_NAME_CASE(PMULDQ)
31692   NODE_NAME_CASE(PSADBW)
31693   NODE_NAME_CASE(DBPSADBW)
31694   NODE_NAME_CASE(VASTART_SAVE_XMM_REGS)
31695   NODE_NAME_CASE(VAARG_64)
31696   NODE_NAME_CASE(VAARG_X32)
31697   NODE_NAME_CASE(WIN_ALLOCA)
31698   NODE_NAME_CASE(MEMBARRIER)
31699   NODE_NAME_CASE(MFENCE)
31700   NODE_NAME_CASE(SEG_ALLOCA)
31701   NODE_NAME_CASE(PROBED_ALLOCA)
31702   NODE_NAME_CASE(RDRAND)
31703   NODE_NAME_CASE(RDSEED)
31704   NODE_NAME_CASE(RDPKRU)
31705   NODE_NAME_CASE(WRPKRU)
31706   NODE_NAME_CASE(VPMADDUBSW)
31707   NODE_NAME_CASE(VPMADDWD)
31708   NODE_NAME_CASE(VPSHA)
31709   NODE_NAME_CASE(VPSHL)
31710   NODE_NAME_CASE(VPCOM)
31711   NODE_NAME_CASE(VPCOMU)
31712   NODE_NAME_CASE(VPERMIL2)
31713   NODE_NAME_CASE(FMSUB)
31714   NODE_NAME_CASE(STRICT_FMSUB)
31715   NODE_NAME_CASE(FNMADD)
31716   NODE_NAME_CASE(STRICT_FNMADD)
31717   NODE_NAME_CASE(FNMSUB)
31718   NODE_NAME_CASE(STRICT_FNMSUB)
31719   NODE_NAME_CASE(FMADDSUB)
31720   NODE_NAME_CASE(FMSUBADD)
31721   NODE_NAME_CASE(FMADD_RND)
31722   NODE_NAME_CASE(FNMADD_RND)
31723   NODE_NAME_CASE(FMSUB_RND)
31724   NODE_NAME_CASE(FNMSUB_RND)
31725   NODE_NAME_CASE(FMADDSUB_RND)
31726   NODE_NAME_CASE(FMSUBADD_RND)
31727   NODE_NAME_CASE(VPMADD52H)
31728   NODE_NAME_CASE(VPMADD52L)
31729   NODE_NAME_CASE(VRNDSCALE)
31730   NODE_NAME_CASE(STRICT_VRNDSCALE)
31731   NODE_NAME_CASE(VRNDSCALE_SAE)
31732   NODE_NAME_CASE(VRNDSCALES)
31733   NODE_NAME_CASE(VRNDSCALES_SAE)
31734   NODE_NAME_CASE(VREDUCE)
31735   NODE_NAME_CASE(VREDUCE_SAE)
31736   NODE_NAME_CASE(VREDUCES)
31737   NODE_NAME_CASE(VREDUCES_SAE)
31738   NODE_NAME_CASE(VGETMANT)
31739   NODE_NAME_CASE(VGETMANT_SAE)
31740   NODE_NAME_CASE(VGETMANTS)
31741   NODE_NAME_CASE(VGETMANTS_SAE)
31742   NODE_NAME_CASE(PCMPESTR)
31743   NODE_NAME_CASE(PCMPISTR)
31744   NODE_NAME_CASE(XTEST)
31745   NODE_NAME_CASE(COMPRESS)
31746   NODE_NAME_CASE(EXPAND)
31747   NODE_NAME_CASE(SELECTS)
31748   NODE_NAME_CASE(ADDSUB)
31749   NODE_NAME_CASE(RCP14)
31750   NODE_NAME_CASE(RCP14S)
31751   NODE_NAME_CASE(RCP28)
31752   NODE_NAME_CASE(RCP28_SAE)
31753   NODE_NAME_CASE(RCP28S)
31754   NODE_NAME_CASE(RCP28S_SAE)
31755   NODE_NAME_CASE(EXP2)
31756   NODE_NAME_CASE(EXP2_SAE)
31757   NODE_NAME_CASE(RSQRT14)
31758   NODE_NAME_CASE(RSQRT14S)
31759   NODE_NAME_CASE(RSQRT28)
31760   NODE_NAME_CASE(RSQRT28_SAE)
31761   NODE_NAME_CASE(RSQRT28S)
31762   NODE_NAME_CASE(RSQRT28S_SAE)
31763   NODE_NAME_CASE(FADD_RND)
31764   NODE_NAME_CASE(FADDS)
31765   NODE_NAME_CASE(FADDS_RND)
31766   NODE_NAME_CASE(FSUB_RND)
31767   NODE_NAME_CASE(FSUBS)
31768   NODE_NAME_CASE(FSUBS_RND)
31769   NODE_NAME_CASE(FMUL_RND)
31770   NODE_NAME_CASE(FMULS)
31771   NODE_NAME_CASE(FMULS_RND)
31772   NODE_NAME_CASE(FDIV_RND)
31773   NODE_NAME_CASE(FDIVS)
31774   NODE_NAME_CASE(FDIVS_RND)
31775   NODE_NAME_CASE(FSQRT_RND)
31776   NODE_NAME_CASE(FSQRTS)
31777   NODE_NAME_CASE(FSQRTS_RND)
31778   NODE_NAME_CASE(FGETEXP)
31779   NODE_NAME_CASE(FGETEXP_SAE)
31780   NODE_NAME_CASE(FGETEXPS)
31781   NODE_NAME_CASE(FGETEXPS_SAE)
31782   NODE_NAME_CASE(SCALEF)
31783   NODE_NAME_CASE(SCALEF_RND)
31784   NODE_NAME_CASE(SCALEFS)
31785   NODE_NAME_CASE(SCALEFS_RND)
31786   NODE_NAME_CASE(AVG)
31787   NODE_NAME_CASE(MULHRS)
31788   NODE_NAME_CASE(SINT_TO_FP_RND)
31789   NODE_NAME_CASE(UINT_TO_FP_RND)
31790   NODE_NAME_CASE(CVTTP2SI)
31791   NODE_NAME_CASE(CVTTP2UI)
31792   NODE_NAME_CASE(STRICT_CVTTP2SI)
31793   NODE_NAME_CASE(STRICT_CVTTP2UI)
31794   NODE_NAME_CASE(MCVTTP2SI)
31795   NODE_NAME_CASE(MCVTTP2UI)
31796   NODE_NAME_CASE(CVTTP2SI_SAE)
31797   NODE_NAME_CASE(CVTTP2UI_SAE)
31798   NODE_NAME_CASE(CVTTS2SI)
31799   NODE_NAME_CASE(CVTTS2UI)
31800   NODE_NAME_CASE(CVTTS2SI_SAE)
31801   NODE_NAME_CASE(CVTTS2UI_SAE)
31802   NODE_NAME_CASE(CVTSI2P)
31803   NODE_NAME_CASE(CVTUI2P)
31804   NODE_NAME_CASE(STRICT_CVTSI2P)
31805   NODE_NAME_CASE(STRICT_CVTUI2P)
31806   NODE_NAME_CASE(MCVTSI2P)
31807   NODE_NAME_CASE(MCVTUI2P)
31808   NODE_NAME_CASE(VFPCLASS)
31809   NODE_NAME_CASE(VFPCLASSS)
31810   NODE_NAME_CASE(MULTISHIFT)
31811   NODE_NAME_CASE(SCALAR_SINT_TO_FP)
31812   NODE_NAME_CASE(SCALAR_SINT_TO_FP_RND)
31813   NODE_NAME_CASE(SCALAR_UINT_TO_FP)
31814   NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND)
31815   NODE_NAME_CASE(CVTPS2PH)
31816   NODE_NAME_CASE(STRICT_CVTPS2PH)
31817   NODE_NAME_CASE(MCVTPS2PH)
31818   NODE_NAME_CASE(CVTPH2PS)
31819   NODE_NAME_CASE(STRICT_CVTPH2PS)
31820   NODE_NAME_CASE(CVTPH2PS_SAE)
31821   NODE_NAME_CASE(CVTP2SI)
31822   NODE_NAME_CASE(CVTP2UI)
31823   NODE_NAME_CASE(MCVTP2SI)
31824   NODE_NAME_CASE(MCVTP2UI)
31825   NODE_NAME_CASE(CVTP2SI_RND)
31826   NODE_NAME_CASE(CVTP2UI_RND)
31827   NODE_NAME_CASE(CVTS2SI)
31828   NODE_NAME_CASE(CVTS2UI)
31829   NODE_NAME_CASE(CVTS2SI_RND)
31830   NODE_NAME_CASE(CVTS2UI_RND)
31831   NODE_NAME_CASE(CVTNE2PS2BF16)
31832   NODE_NAME_CASE(CVTNEPS2BF16)
31833   NODE_NAME_CASE(MCVTNEPS2BF16)
31834   NODE_NAME_CASE(DPBF16PS)
31835   NODE_NAME_CASE(LWPINS)
31836   NODE_NAME_CASE(MGATHER)
31837   NODE_NAME_CASE(MSCATTER)
31838   NODE_NAME_CASE(VPDPBUSD)
31839   NODE_NAME_CASE(VPDPBUSDS)
31840   NODE_NAME_CASE(VPDPWSSD)
31841   NODE_NAME_CASE(VPDPWSSDS)
31842   NODE_NAME_CASE(VPSHUFBITQMB)
31843   NODE_NAME_CASE(GF2P8MULB)
31844   NODE_NAME_CASE(GF2P8AFFINEQB)
31845   NODE_NAME_CASE(GF2P8AFFINEINVQB)
31846   NODE_NAME_CASE(NT_CALL)
31847   NODE_NAME_CASE(NT_BRIND)
31848   NODE_NAME_CASE(UMWAIT)
31849   NODE_NAME_CASE(TPAUSE)
31850   NODE_NAME_CASE(ENQCMD)
31851   NODE_NAME_CASE(ENQCMDS)
31852   NODE_NAME_CASE(VP2INTERSECT)
31853   NODE_NAME_CASE(AESENC128KL)
31854   NODE_NAME_CASE(AESDEC128KL)
31855   NODE_NAME_CASE(AESENC256KL)
31856   NODE_NAME_CASE(AESDEC256KL)
31857   NODE_NAME_CASE(AESENCWIDE128KL)
31858   NODE_NAME_CASE(AESDECWIDE128KL)
31859   NODE_NAME_CASE(AESENCWIDE256KL)
31860   NODE_NAME_CASE(AESDECWIDE256KL)
31861   NODE_NAME_CASE(TESTUI)
31862   }
31863   return nullptr;
31864 #undef NODE_NAME_CASE
31865 }
31866 
31867 /// Return true if the addressing mode represented by AM is legal for this
31868 /// target, for a load/store of the specified type.
31869 bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
31870                                               const AddrMode &AM, Type *Ty,
31871                                               unsigned AS,
31872                                               Instruction *I) const {
31873   // X86 supports extremely general addressing modes.
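  // The general form is BaseReg + Scale*IndexReg + Disp32, where the
  // displacement can also be relative to a GlobalValue.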
31874   CodeModel::Model M = getTargetMachine().getCodeModel();
31875 
31876   // X86 allows a sign-extended 32-bit immediate field as a displacement.
31877   if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
31878     return false;
31879 
31880   if (AM.BaseGV) {
31881     unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
31882 
31883     // If a reference to this global requires an extra load, we can't fold it.
31884     if (isGlobalStubReference(GVFlags))
31885       return false;
31886 
31887     // If BaseGV requires a register for the PIC base, we cannot also have a
31888     // BaseReg specified.
31889     if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
31890       return false;
31891 
31892     // If lower 4G is not available, then we must use rip-relative addressing.
31893     if ((M != CodeModel::Small || isPositionIndependent()) &&
31894         Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
31895       return false;
31896   }
31897 
31898   switch (AM.Scale) {
31899   case 0:
31900   case 1:
31901   case 2:
31902   case 4:
31903   case 8:
31904     // These scales always work.
31905     break;
31906   case 3:
31907   case 5:
31908   case 9:
31909     // These scales are formed with basereg+scalereg.  Only accept if there is
31910     // no basereg yet.
31911     if (AM.HasBaseReg)
31912       return false;
31913     break;
31914   default:  // Other stuff never works.
31915     return false;
31916   }
31917 
31918   return true;
31919 }
31920 
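// A hedged example of what "shift by scalar" means here (editorial note, not
// from the original source): for a v4i32 shift where every lane uses the same
// amount, SSE2's PSLLD already takes the count from the low bits of an XMM
// register, so splatting a scalar count is cheap; a truly per-lane variable
// shift needs AVX2's VPSLLVD (or scalarization) and is the expensive case.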
31921 bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
31922   unsigned Bits = Ty->getScalarSizeInBits();
31923 
31924   // 8-bit shifts are always expensive, but versions with a scalar amount aren't
31925   // particularly cheaper than those without.
31926   if (Bits == 8)
31927     return false;
31928 
31929   // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
31930   // Splitting for v32i8/v16i16 on XOP+AVX2 targets is still preferred.
31931   if (Subtarget.hasXOP() &&
31932       (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
31933     return false;
31934 
31935   // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
31936   // shifts just as cheap as scalar ones.
31937   if (Subtarget.hasAVX2() && (Bits == 32 || Bits == 64))
31938     return false;
31939 
31940   // AVX512BW has shifts such as vpsllvw.
31941   if (Subtarget.hasBWI() && Bits == 16)
31942       return false;
31943 
31944   // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
31945   // fully general vector.
31946   return true;
31947 }
31948 
31949 bool X86TargetLowering::isBinOp(unsigned Opcode) const {
31950   switch (Opcode) {
31951   // These are non-commutative binops.
31952   // TODO: Add more X86ISD opcodes once we have test coverage.
31953   case X86ISD::ANDNP:
31954   case X86ISD::PCMPGT:
31955   case X86ISD::FMAX:
31956   case X86ISD::FMIN:
31957   case X86ISD::FANDN:
31958     return true;
31959   }
31960 
31961   return TargetLoweringBase::isBinOp(Opcode);
31962 }
31963 
31964 bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const {
31965   switch (Opcode) {
31966   // TODO: Add more X86ISD opcodes once we have test coverage.
31967   case X86ISD::PCMPEQ:
31968   case X86ISD::PMULDQ:
31969   case X86ISD::PMULUDQ:
31970   case X86ISD::FMAXC:
31971   case X86ISD::FMINC:
31972   case X86ISD::FAND:
31973   case X86ISD::FOR:
31974   case X86ISD::FXOR:
31975     return true;
31976   }
31977 
31978   return TargetLoweringBase::isCommutativeBinOp(Opcode);
31979 }
31980 
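// Editorial example: integer truncation is free on x86 because the narrow
// value is simply the low subregister of the wide one (e.g. %eax is the low
// 32 bits of %rax), so no instruction is needed to perform it.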
31981 bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
31982   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
31983     return false;
31984   unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
31985   unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
31986   return NumBits1 > NumBits2;
31987 }
31988 
31989 bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
31990   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
31991     return false;
31992 
31993   if (!isTypeLegal(EVT::getEVT(Ty1)))
31994     return false;
31995 
31996   assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
31997 
31998   // Assuming the caller doesn't have a zeroext or signext return parameter,
31999   // truncation all the way down to i1 is valid.
32000   return true;
32001 }
32002 
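// Editorial note with a hedged example: x86 cmp/add/store instructions encode
// at most a sign-extended 32-bit immediate, e.g.
//   addq $0x7fffffff, %rax     ; representable, isInt<32> is true
//   addq $0x100000000, %rax    ; not encodable, needs a movabs into a register
// which is why the three hooks below all reduce to isInt<32>(Imm).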
32003 bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
32004   return isInt<32>(Imm);
32005 }
32006 
32007 bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
32008   // Can also use sub to handle negated immediates.
32009   return isInt<32>(Imm);
32010 }
32011 
32012 bool X86TargetLowering::isLegalStoreImmediate(int64_t Imm) const {
32013   return isInt<32>(Imm);
32014 }
32015 
32016 bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
32017   if (!VT1.isScalarInteger() || !VT2.isScalarInteger())
32018     return false;
32019   unsigned NumBits1 = VT1.getSizeInBits();
32020   unsigned NumBits2 = VT2.getSizeInBits();
32021   return NumBits1 > NumBits2;
32022 }
32023 
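// Editorial example: on x86-64 a 32-bit operation implicitly zeroes bits 63:32
// of the destination, so zext i32 -> i64 costs nothing:
//   movl %esi, %eax            ; already zero-extends into %rax
// Likewise movzbl/movzwl/movl loads give i8/i16/i32 -> wider zext for free,
// which is what the SDValue overload below checks for loads.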
32024 bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
32025   // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
32026   return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
32027 }
32028 
32029 bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
32030   // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
32031   return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
32032 }
32033 
32034 bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
32035   EVT VT1 = Val.getValueType();
32036   if (isZExtFree(VT1, VT2))
32037     return true;
32038 
32039   if (Val.getOpcode() != ISD::LOAD)
32040     return false;
32041 
32042   if (!VT1.isSimple() || !VT1.isInteger() ||
32043       !VT2.isSimple() || !VT2.isInteger())
32044     return false;
32045 
32046   switch (VT1.getSimpleVT().SimpleTy) {
32047   default: break;
32048   case MVT::i8:
32049   case MVT::i16:
32050   case MVT::i32:
32051     // X86 has 8, 16, and 32-bit zero-extending loads.
32052     return true;
32053   }
32054 
32055   return false;
32056 }
32057 
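// Editorial sketch of the pattern this hook targets (IR value names are made
// up for illustration):
//   %amt.splat = shufflevector <4 x i32> %amt, <4 x i32> poison, <4 x i32> zeroinitializer
//   %r = shl <4 x i32> %x, %amt.splat
// If the splat lives in another block, SelectionDAG cannot see that the shift
// amount is uniform; sinking the shuffle next to the shift lets ISel pick the
// cheaper shift-by-scalar form instead of a general variable vector shift.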
32058 bool X86TargetLowering::shouldSinkOperands(Instruction *I,
32059                                            SmallVectorImpl<Use *> &Ops) const {
32060   // A uniform shift amount in a vector shift or funnel shift may be much
32061   // cheaper than a generic variable vector shift, so make that pattern visible
32062   // to SDAG by sinking the shuffle instruction next to the shift.
32063   int ShiftAmountOpNum = -1;
32064   if (I->isShift())
32065     ShiftAmountOpNum = 1;
32066   else if (auto *II = dyn_cast<IntrinsicInst>(I)) {
32067     if (II->getIntrinsicID() == Intrinsic::fshl ||
32068         II->getIntrinsicID() == Intrinsic::fshr)
32069       ShiftAmountOpNum = 2;
32070   }
32071 
32072   if (ShiftAmountOpNum == -1)
32073     return false;
32074 
32075   auto *Shuf = dyn_cast<ShuffleVectorInst>(I->getOperand(ShiftAmountOpNum));
32076   if (Shuf && getSplatIndex(Shuf->getShuffleMask()) >= 0 &&
32077       isVectorShiftByScalarCheap(I->getType())) {
32078     Ops.push_back(&I->getOperandUse(ShiftAmountOpNum));
32079     return true;
32080   }
32081 
32082   return false;
32083 }
32084 
32085 bool X86TargetLowering::shouldConvertPhiType(Type *From, Type *To) const {
32086   if (!Subtarget.is64Bit())
32087     return false;
32088   return TargetLowering::shouldConvertPhiType(From, To);
32089 }
32090 
32091 bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
32092   if (isa<MaskedLoadSDNode>(ExtVal.getOperand(0)))
32093     return false;
32094 
32095   EVT SrcVT = ExtVal.getOperand(0).getValueType();
32096 
32097   // There is no extending load for vXi1.
32098   if (SrcVT.getScalarType() == MVT::i1)
32099     return false;
32100 
32101   return true;
32102 }
32103 
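// Editorial example: with FMA3 available, (a * b) + c on f32/f64 can become a
// single vfmadd213ss/sd instead of separate mulss + addss, so fusing is
// profitable for exactly the scalar types accepted below (and their vectors).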
32104 bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
32105                                                    EVT VT) const {
32106   if (!Subtarget.hasAnyFMA())
32107     return false;
32108 
32109   VT = VT.getScalarType();
32110 
32111   if (!VT.isSimple())
32112     return false;
32113 
32114   switch (VT.getSimpleVT().SimpleTy) {
32115   case MVT::f32:
32116   case MVT::f64:
32117     return true;
32118   default:
32119     break;
32120   }
32121 
32122   return false;
32123 }
32124 
32125 bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
32126   // i16 instructions are longer (0x66 prefix) and potentially slower.
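  // Editorial example: "addw $1, %ax" needs the 0x66 operand-size prefix
  // (66 83 C0 01) while "addl $1, %eax" does not (83 C0 01), and 16-bit ops
  // can also incur partial-register penalties, so narrowing i32 to i16 is not
  // considered profitable.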
32127   return !(VT1 == MVT::i32 && VT2 == MVT::i16);
32128 }
32129 
32130 /// Targets can use this to indicate that they only support *some*
32131 /// VECTOR_SHUFFLE operations, those with specific masks.
32132 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
32133 /// are assumed to be legal.
32134 bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const {
32135   if (!VT.isSimple())
32136     return false;
32137 
32138   // Not for i1 vectors
32139   if (VT.getSimpleVT().getScalarType() == MVT::i1)
32140     return false;
32141 
32142   // Very little shuffling can be done for 64-bit vectors right now.
32143   if (VT.getSimpleVT().getSizeInBits() == 64)
32144     return false;
32145 
32146   // We only care that the types being shuffled are legal. The lowering can
32147   // handle any possible shuffle mask that results.
32148   return isTypeLegal(VT.getSimpleVT());
32149 }
32150 
32151 bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask,
32152                                                EVT VT) const {
32153   // Don't convert an 'and' into a shuffle that we don't directly support.
32154   // vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
32155   if (!Subtarget.hasAVX2())
32156     if (VT == MVT::v32i8 || VT == MVT::v16i16)
32157       return false;
32158 
32159   // Just delegate to the generic legality, clear masks aren't special.
32160   return isShuffleMaskLegal(Mask, VT);
32161 }
32162 
32163 bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
32164   // If the subtarget is using thunks, we need to not generate jump tables.
32165   if (Subtarget.useIndirectThunkBranches())
32166     return false;
32167 
32168   // Otherwise, fallback on the generic logic.
32169   return TargetLowering::areJTsAllowed(Fn);
32170 }
32171 
32172 //===----------------------------------------------------------------------===//
32173 //                           X86 Scheduler Hooks
32174 //===----------------------------------------------------------------------===//
32175 
32176 // Returns true if EFLAGS is consumed after this iterator in the rest of the
32177 // basic block or any successors of the basic block.
32178 static bool isEFLAGSLiveAfter(MachineBasicBlock::iterator Itr,
32179                               MachineBasicBlock *BB) {
32180   // Scan forward through BB for a use/def of EFLAGS.
32181   for (MachineBasicBlock::iterator miI = std::next(Itr), miE = BB->end();
32182          miI != miE; ++miI) {
32183     const MachineInstr& mi = *miI;
32184     if (mi.readsRegister(X86::EFLAGS))
32185       return true;
32186     // If we found a def, we can stop searching.
32187     if (mi.definesRegister(X86::EFLAGS))
32188       return false;
32189   }
32190 
32191   // If we hit the end of the block, check whether EFLAGS is live into a
32192   // successor.
32193   for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
32194                                         sEnd = BB->succ_end();
32195        sItr != sEnd; ++sItr) {
32196     MachineBasicBlock* succ = *sItr;
32197     if (succ->isLiveIn(X86::EFLAGS))
32198       return true;
32199   }
32200 
32201   return false;
32202 }
32203 
32204 /// Utility function to emit xbegin specifying the start of an RTM region.
32205 static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
32206                                      const TargetInstrInfo *TII) {
32207   const DebugLoc &DL = MI.getDebugLoc();
32208 
32209   const BasicBlock *BB = MBB->getBasicBlock();
32210   MachineFunction::iterator I = ++MBB->getIterator();
32211 
32212   // For the v = xbegin(), we generate
32213   //
32214   // thisMBB:
32215   //  xbegin sinkMBB
32216   //
32217   // mainMBB:
32218   //  s0 = -1
32219   //
32220   // fallBB:
32221   //  eax = # XABORT_DEF
32222   //  s1 = eax
32223   //
32224   // sinkMBB:
32225   //  v = phi(s0/mainBB, s1/fallBB)
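  //
  // (Editorial note: this mirrors the intrinsics-level contract, where
  //  _xbegin() returns _XBEGIN_STARTED (-1) when the transaction starts and
  //  returns the abort status left in EAX when it aborts into the fallback
  //  path.)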
32226 
32227   MachineBasicBlock *thisMBB = MBB;
32228   MachineFunction *MF = MBB->getParent();
32229   MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
32230   MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
32231   MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
32232   MF->insert(I, mainMBB);
32233   MF->insert(I, fallMBB);
32234   MF->insert(I, sinkMBB);
32235 
32236   if (isEFLAGSLiveAfter(MI, MBB)) {
32237     mainMBB->addLiveIn(X86::EFLAGS);
32238     fallMBB->addLiveIn(X86::EFLAGS);
32239     sinkMBB->addLiveIn(X86::EFLAGS);
32240   }
32241 
32242   // Transfer the remainder of BB and its successor edges to sinkMBB.
32243   sinkMBB->splice(sinkMBB->begin(), MBB,
32244                   std::next(MachineBasicBlock::iterator(MI)), MBB->end());
32245   sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
32246 
32247   MachineRegisterInfo &MRI = MF->getRegInfo();
32248   Register DstReg = MI.getOperand(0).getReg();
32249   const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
32250   Register mainDstReg = MRI.createVirtualRegister(RC);
32251   Register fallDstReg = MRI.createVirtualRegister(RC);
32252 
32253   // thisMBB:
32254   //  xbegin fallMBB
32255   //  # fallthrough to mainMBB
32256   //  # abort path to fallMBB
32257   BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
32258   thisMBB->addSuccessor(mainMBB);
32259   thisMBB->addSuccessor(fallMBB);
32260 
32261   // mainMBB:
32262   //  mainDstReg := -1
32263   BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
32264   BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
32265   mainMBB->addSuccessor(sinkMBB);
32266 
32267   // fallMBB:
32268   //  ; pseudo instruction to model hardware's definition from XABORT
32269   //  EAX := XABORT_DEF
32270   //  fallDstReg := EAX
32271   BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF));
32272   BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg)
32273       .addReg(X86::EAX);
32274   fallMBB->addSuccessor(sinkMBB);
32275 
32276   // sinkMBB:
32277   //  DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
32278   BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg)
32279       .addReg(mainDstReg).addMBB(mainMBB)
32280       .addReg(fallDstReg).addMBB(fallMBB);
32281 
32282   MI.eraseFromParent();
32283   return sinkMBB;
32284 }
32285 
32286 MachineBasicBlock *
32287 X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
32288                                                MachineBasicBlock *MBB) const {
32289   // Emit va_arg instruction on X86-64.
32290 
32291   // Operands to this pseudo-instruction:
32292   // 0  ) Output        : destination address (reg)
32293   // 1-5) Input         : va_list address (addr, i64mem)
32294   // 6  ) ArgSize       : Size (in bytes) of vararg type
32295   // 7  ) ArgMode       : 0=overflow only, 1=use gp_offset, 2=use fp_offset
32296   // 8  ) Align         : Alignment of type
32297   // 9  ) EFLAGS (implicit-def)
32298 
32299   assert(MI.getNumOperands() == 10 && "VAARG should have 10 operands!");
32300   static_assert(X86::AddrNumOperands == 5, "VAARG assumes 5 address operands");
32301 
32302   Register DestReg = MI.getOperand(0).getReg();
32303   MachineOperand &Base = MI.getOperand(1);
32304   MachineOperand &Scale = MI.getOperand(2);
32305   MachineOperand &Index = MI.getOperand(3);
32306   MachineOperand &Disp = MI.getOperand(4);
32307   MachineOperand &Segment = MI.getOperand(5);
32308   unsigned ArgSize = MI.getOperand(6).getImm();
32309   unsigned ArgMode = MI.getOperand(7).getImm();
32310   Align Alignment = Align(MI.getOperand(8).getImm());
32311 
32312   MachineFunction *MF = MBB->getParent();
32313 
32314   // Memory Reference
32315   assert(MI.hasOneMemOperand() && "Expected VAARG to have one memoperand");
32316 
32317   MachineMemOperand *OldMMO = MI.memoperands().front();
32318 
32319   // Clone the MMO into two separate MMOs for loading and storing
32320   MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand(
32321       OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore);
32322   MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand(
32323       OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad);
32324 
32325   // Machine Information
32326   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
32327   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
32328   const TargetRegisterClass *AddrRegClass =
32329       getRegClassFor(getPointerTy(MBB->getParent()->getDataLayout()));
32330   const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
32331   const DebugLoc &DL = MI.getDebugLoc();
32332 
32333   // struct va_list {
32334   //   i32   gp_offset
32335   //   i32   fp_offset
32336   //   i64   overflow_area (address)
32337   //   i64   reg_save_area (address)
32338   // }
32339   // sizeof(va_list) = 24
32340   // alignment(va_list) = 8
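  //
  // Roughly, the code emitted below implements (editorial pseudocode for the
  // gp_offset case; the fp_offset case is analogous with a 16-byte step):
  //   if (gp_offset < 6 * 8) {
  //     addr = reg_save_area + gp_offset;
  //     gp_offset += 8;
  //   } else {
  //     addr = align(overflow_area, Alignment);
  //     overflow_area = addr + (arg size rounded up to 8);
  //   }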
32341 
32342   unsigned TotalNumIntRegs = 6;
32343   unsigned TotalNumXMMRegs = 8;
32344   bool UseGPOffset = (ArgMode == 1);
32345   bool UseFPOffset = (ArgMode == 2);
32346   unsigned MaxOffset = TotalNumIntRegs * 8 +
32347                        (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
32348 
32349   // Align ArgSize to a multiple of 8.
32350   unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
32351   bool NeedsAlign = (Alignment > 8);
32352 
32353   MachineBasicBlock *thisMBB = MBB;
32354   MachineBasicBlock *overflowMBB;
32355   MachineBasicBlock *offsetMBB;
32356   MachineBasicBlock *endMBB;
32357 
32358   unsigned OffsetDestReg = 0;    // Argument address computed by offsetMBB
32359   unsigned OverflowDestReg = 0;  // Argument address computed by overflowMBB
32360   unsigned OffsetReg = 0;
32361 
32362   if (!UseGPOffset && !UseFPOffset) {
32363     // If we only pull from the overflow region, we don't create a branch.
32364     // We don't need to alter control flow.
32365     OffsetDestReg = 0; // unused
32366     OverflowDestReg = DestReg;
32367 
32368     offsetMBB = nullptr;
32369     overflowMBB = thisMBB;
32370     endMBB = thisMBB;
32371   } else {
32372     // First emit code to check if gp_offset (or fp_offset) is below the bound.
32373     // If so, pull the argument from reg_save_area. (branch to offsetMBB)
32374     // If not, pull from overflow_area. (branch to overflowMBB)
32375     //
32376     //       thisMBB
32377     //         |     .
32378     //         |        .
32379     //     offsetMBB   overflowMBB
32380     //         |        .
32381     //         |     .
32382     //        endMBB
32383 
32384     // Registers for the PHI in endMBB
32385     OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
32386     OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
32387 
32388     const BasicBlock *LLVM_BB = MBB->getBasicBlock();
32389     overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
32390     offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
32391     endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
32392 
32393     MachineFunction::iterator MBBIter = ++MBB->getIterator();
32394 
32395     // Insert the new basic blocks
32396     MF->insert(MBBIter, offsetMBB);
32397     MF->insert(MBBIter, overflowMBB);
32398     MF->insert(MBBIter, endMBB);
32399 
32400     // Transfer the remainder of MBB and its successor edges to endMBB.
32401     endMBB->splice(endMBB->begin(), thisMBB,
32402                    std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
32403     endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
32404 
32405     // Make offsetMBB and overflowMBB successors of thisMBB
32406     thisMBB->addSuccessor(offsetMBB);
32407     thisMBB->addSuccessor(overflowMBB);
32408 
32409     // endMBB is a successor of both offsetMBB and overflowMBB
32410     offsetMBB->addSuccessor(endMBB);
32411     overflowMBB->addSuccessor(endMBB);
32412 
32413     // Load the offset value into a register
32414     OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
32415     BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
32416         .add(Base)
32417         .add(Scale)
32418         .add(Index)
32419         .addDisp(Disp, UseFPOffset ? 4 : 0)
32420         .add(Segment)
32421         .setMemRefs(LoadOnlyMMO);
32422 
32423     // Check if there is enough room left to pull this argument.
32424     BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
32425       .addReg(OffsetReg)
32426       .addImm(MaxOffset + 8 - ArgSizeA8);
32427 
32428     // Branch to "overflowMBB" if offset >= max
32429     // Fall through to "offsetMBB" otherwise
32430     BuildMI(thisMBB, DL, TII->get(X86::JCC_1))
32431       .addMBB(overflowMBB).addImm(X86::COND_AE);
32432   }
32433 
32434   // In offsetMBB, emit code to use the reg_save_area.
32435   if (offsetMBB) {
32436     assert(OffsetReg != 0);
32437 
32438     // Read the reg_save_area address.
32439     Register RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
32440     BuildMI(
32441         offsetMBB, DL,
32442         TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
32443         RegSaveReg)
32444         .add(Base)
32445         .add(Scale)
32446         .add(Index)
32447         .addDisp(Disp, Subtarget.isTarget64BitLP64() ? 16 : 12)
32448         .add(Segment)
32449         .setMemRefs(LoadOnlyMMO);
32450 
32451     if (Subtarget.isTarget64BitLP64()) {
32452       // Zero-extend the offset
32453       Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
32454       BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
32455           .addImm(0)
32456           .addReg(OffsetReg)
32457           .addImm(X86::sub_32bit);
32458 
32459       // Add the offset to the reg_save_area to get the final address.
32460       BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
32461           .addReg(OffsetReg64)
32462           .addReg(RegSaveReg);
32463     } else {
32464       // Add the offset to the reg_save_area to get the final address.
32465       BuildMI(offsetMBB, DL, TII->get(X86::ADD32rr), OffsetDestReg)
32466           .addReg(OffsetReg)
32467           .addReg(RegSaveReg);
32468     }
32469 
32470     // Compute the offset for the next argument
32471     Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
32472     BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
32473       .addReg(OffsetReg)
32474       .addImm(UseFPOffset ? 16 : 8);
32475 
32476     // Store it back into the va_list.
32477     BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
32478         .add(Base)
32479         .add(Scale)
32480         .add(Index)
32481         .addDisp(Disp, UseFPOffset ? 4 : 0)
32482         .add(Segment)
32483         .addReg(NextOffsetReg)
32484         .setMemRefs(StoreOnlyMMO);
32485 
32486     // Jump to endMBB
32487     BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
32488       .addMBB(endMBB);
32489   }
32490 
32491   //
32492   // Emit code to use overflow area
32493   //
32494 
32495   // Load the overflow_area address into a register.
32496   Register OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
32497   BuildMI(overflowMBB, DL,
32498           TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
32499           OverflowAddrReg)
32500       .add(Base)
32501       .add(Scale)
32502       .add(Index)
32503       .addDisp(Disp, 8)
32504       .add(Segment)
32505       .setMemRefs(LoadOnlyMMO);
32506 
32507   // If we need to align it, do so. Otherwise, just copy the address
32508   // to OverflowDestReg.
32509   if (NeedsAlign) {
32510     // Align the overflow address
32511     Register TmpReg = MRI.createVirtualRegister(AddrRegClass);
32512 
32513     // aligned_addr = (addr + (align-1)) & ~(align-1)
32514     BuildMI(
32515         overflowMBB, DL,
32516         TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
32517         TmpReg)
32518         .addReg(OverflowAddrReg)
32519         .addImm(Alignment.value() - 1);
32520 
32521     BuildMI(
32522         overflowMBB, DL,
32523         TII->get(Subtarget.isTarget64BitLP64() ? X86::AND64ri32 : X86::AND32ri),
32524         OverflowDestReg)
32525         .addReg(TmpReg)
32526         .addImm(~(uint64_t)(Alignment.value() - 1));
32527   } else {
32528     BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
32529       .addReg(OverflowAddrReg);
32530   }
32531 
32532   // Compute the next overflow address after this argument.
32533   // (the overflow address should be kept 8-byte aligned)
32534   Register NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
32535   BuildMI(
32536       overflowMBB, DL,
32537       TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
32538       NextAddrReg)
32539       .addReg(OverflowDestReg)
32540       .addImm(ArgSizeA8);
32541 
32542   // Store the new overflow address.
32543   BuildMI(overflowMBB, DL,
32544           TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64mr : X86::MOV32mr))
32545       .add(Base)
32546       .add(Scale)
32547       .add(Index)
32548       .addDisp(Disp, 8)
32549       .add(Segment)
32550       .addReg(NextAddrReg)
32551       .setMemRefs(StoreOnlyMMO);
32552 
32553   // If we branched, emit the PHI to the front of endMBB.
32554   if (offsetMBB) {
32555     BuildMI(*endMBB, endMBB->begin(), DL,
32556             TII->get(X86::PHI), DestReg)
32557       .addReg(OffsetDestReg).addMBB(offsetMBB)
32558       .addReg(OverflowDestReg).addMBB(overflowMBB);
32559   }
32560 
32561   // Erase the pseudo instruction
32562   MI.eraseFromParent();
32563 
32564   return endMBB;
32565 }
32566 
32567 // The EFLAGS operand of SelectItr might be missing a kill marker
32568 // because there were multiple uses of EFLAGS, and ISel didn't know
32569 // which to mark. Figure out whether SelectItr should have had a
32570 // kill marker, and set it if it should. Returns the correct kill
32571 // marker value.
32572 static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
32573                                      MachineBasicBlock* BB,
32574                                      const TargetRegisterInfo* TRI) {
32575   if (isEFLAGSLiveAfter(SelectItr, BB))
32576     return false;
32577 
32578   // We found a def, or hit the end of the basic block and EFLAGS wasn't live
32579   // out. SelectItr should have a kill flag on EFLAGS.
32580   SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
32581   return true;
32582 }
32583 
32584 // Return true if it is OK for this CMOV pseudo-opcode to be cascaded
32585 // together with other CMOV pseudo-opcodes into a single basic-block with
32586 // conditional jump around it.
32587 static bool isCMOVPseudo(MachineInstr &MI) {
32588   switch (MI.getOpcode()) {
32589   case X86::CMOV_FR32:
32590   case X86::CMOV_FR32X:
32591   case X86::CMOV_FR64:
32592   case X86::CMOV_FR64X:
32593   case X86::CMOV_GR8:
32594   case X86::CMOV_GR16:
32595   case X86::CMOV_GR32:
32596   case X86::CMOV_RFP32:
32597   case X86::CMOV_RFP64:
32598   case X86::CMOV_RFP80:
32599   case X86::CMOV_VR64:
32600   case X86::CMOV_VR128:
32601   case X86::CMOV_VR128X:
32602   case X86::CMOV_VR256:
32603   case X86::CMOV_VR256X:
32604   case X86::CMOV_VR512:
32605   case X86::CMOV_VK1:
32606   case X86::CMOV_VK2:
32607   case X86::CMOV_VK4:
32608   case X86::CMOV_VK8:
32609   case X86::CMOV_VK16:
32610   case X86::CMOV_VK32:
32611   case X86::CMOV_VK64:
32612     return true;
32613 
32614   default:
32615     return false;
32616   }
32617 }
32618 
32619 // Helper function which inserts PHI functions into SinkMBB:
32620 //   %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
32621 // where %FalseValue(i) and %TrueValue(i) are taken from the consecutive CMOVs
32622 // in the [MIItBegin, MIItEnd) range. It returns the MachineInstrBuilder of the
32623 // last PHI function inserted.
32624 static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
32625     MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
32626     MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
32627     MachineBasicBlock *SinkMBB) {
32628   MachineFunction *MF = TrueMBB->getParent();
32629   const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
32630   const DebugLoc &DL = MIItBegin->getDebugLoc();
32631 
32632   X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
32633   X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
32634 
32635   MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
32636 
32637   // As we are creating the PHIs, we have to be careful if there is more than
32638   // one.  Later CMOVs may reference the results of earlier CMOVs, but later
32639   // PHIs have to reference the individual true/false inputs from earlier PHIs.
32640   // That also means that PHI construction must work forward from earlier to
32641   // later, and that the code must maintain a mapping from earlier PHI's
32642   // destination registers, and the registers that went into the PHI.
32643   DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
32644   MachineInstrBuilder MIB;
32645 
32646   for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
32647     Register DestReg = MIIt->getOperand(0).getReg();
32648     Register Op1Reg = MIIt->getOperand(1).getReg();
32649     Register Op2Reg = MIIt->getOperand(2).getReg();
32650 
32651     // If this CMOV we are generating is the opposite condition from
32652     // the jump we generated, then we have to swap the operands for the
32653     // PHI that is going to be generated.
32654     if (MIIt->getOperand(3).getImm() == OppCC)
32655       std::swap(Op1Reg, Op2Reg);
32656 
32657     if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
32658       Op1Reg = RegRewriteTable[Op1Reg].first;
32659 
32660     if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
32661       Op2Reg = RegRewriteTable[Op2Reg].second;
32662 
32663     MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg)
32664               .addReg(Op1Reg)
32665               .addMBB(FalseMBB)
32666               .addReg(Op2Reg)
32667               .addMBB(TrueMBB);
32668 
32669     // Add this PHI to the rewrite table.
32670     RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
32671   }
32672 
32673   return MIB;
32674 }
32675 
32676 // Lower cascaded selects of the form (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2).
32677 MachineBasicBlock *
32678 X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
32679                                              MachineInstr &SecondCascadedCMOV,
32680                                              MachineBasicBlock *ThisMBB) const {
32681   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
32682   const DebugLoc &DL = FirstCMOV.getDebugLoc();
32683 
32684   // We lower cascaded CMOVs such as
32685   //
32686   //   (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
32687   //
32688   // to two successive branches.
32689   //
32690   // Without this, we would add a PHI between the two jumps, which ends up
32691   // creating a few copies all around. For instance, for
32692   //
32693   //    (sitofp (zext (fcmp une)))
32694   //
32695   // we would generate:
32696   //
32697   //         ucomiss %xmm1, %xmm0
32698   //         movss  <1.0f>, %xmm0
32699   //         movaps  %xmm0, %xmm1
32700   //         jne     .LBB5_2
32701   //         xorps   %xmm1, %xmm1
32702   // .LBB5_2:
32703   //         jp      .LBB5_4
32704   //         movaps  %xmm1, %xmm0
32705   // .LBB5_4:
32706   //         retq
32707   //
32708   // because this custom-inserter would have generated:
32709   //
32710   //   A
32711   //   | \
32712   //   |  B
32713   //   | /
32714   //   C
32715   //   | \
32716   //   |  D
32717   //   | /
32718   //   E
32719   //
32720   // A: X = ...; Y = ...
32721   // B: empty
32722   // C: Z = PHI [X, A], [Y, B]
32723   // D: empty
32724   // E: PHI [X, C], [Z, D]
32725   //
32726   // If we lower both CMOVs in a single step, we can instead generate:
32727   //
32728   //   A
32729   //   | \
32730   //   |  C
32731   //   | /|
32732   //   |/ |
32733   //   |  |
32734   //   |  D
32735   //   | /
32736   //   E
32737   //
32738   // A: X = ...; Y = ...
32739   // D: empty
32740   // E: PHI [X, A], [X, C], [Y, D]
32741   //
32742   // Which, in our sitofp/fcmp example, gives us something like:
32743   //
32744   //         ucomiss %xmm1, %xmm0
32745   //         movss  <1.0f>, %xmm0
32746   //         jne     .LBB5_4
32747   //         jp      .LBB5_4
32748   //         xorps   %xmm0, %xmm0
32749   // .LBB5_4:
32750   //         retq
32751   //
32752 
32753   // We lower cascaded CMOV into two successive branches to the same block.
32754   // EFLAGS is used by both, so mark it as live in the second.
32755   const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
32756   MachineFunction *F = ThisMBB->getParent();
32757   MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
32758   MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
32759   MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
32760 
32761   MachineFunction::iterator It = ++ThisMBB->getIterator();
32762   F->insert(It, FirstInsertedMBB);
32763   F->insert(It, SecondInsertedMBB);
32764   F->insert(It, SinkMBB);
32765 
32766   // For a cascaded CMOV, we lower it to two successive branches to
32767   // the same block (SinkMBB).  EFLAGS is used by both, so mark it as live in
32768   // the FirstInsertedMBB.
32769   FirstInsertedMBB->addLiveIn(X86::EFLAGS);
32770 
32771   // If the EFLAGS register isn't dead in the terminator, then claim that it's
32772   // live into the sink and copy blocks.
32773   const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
32774   if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS) &&
32775       !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
32776     SecondInsertedMBB->addLiveIn(X86::EFLAGS);
32777     SinkMBB->addLiveIn(X86::EFLAGS);
32778   }
32779 
32780   // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
32781   SinkMBB->splice(SinkMBB->begin(), ThisMBB,
32782                   std::next(MachineBasicBlock::iterator(FirstCMOV)),
32783                   ThisMBB->end());
32784   SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
32785 
32786   // Fallthrough block for ThisMBB.
32787   ThisMBB->addSuccessor(FirstInsertedMBB);
32788   // The true block target of the first branch is always SinkMBB.
32789   ThisMBB->addSuccessor(SinkMBB);
32790   // Fallthrough block for FirstInsertedMBB.
32791   FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
32792   // The true block for the branch of FirstInsertedMBB.
32793   FirstInsertedMBB->addSuccessor(SinkMBB);
32794   // This is fallthrough.
32795   SecondInsertedMBB->addSuccessor(SinkMBB);
32796 
32797   // Create the conditional branch instructions.
32798   X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
32799   BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC);
32800 
32801   X86::CondCode SecondCC =
32802       X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
32803   BuildMI(FirstInsertedMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(SecondCC);
32804 
32805   //  SinkMBB:
32806   //   %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
32807   Register DestReg = FirstCMOV.getOperand(0).getReg();
32808   Register Op1Reg = FirstCMOV.getOperand(1).getReg();
32809   Register Op2Reg = FirstCMOV.getOperand(2).getReg();
32810   MachineInstrBuilder MIB =
32811       BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(X86::PHI), DestReg)
32812           .addReg(Op1Reg)
32813           .addMBB(SecondInsertedMBB)
32814           .addReg(Op2Reg)
32815           .addMBB(ThisMBB);
32816 
32817   // SecondInsertedMBB provides the same incoming value as FirstInsertedMBB
32818   // (the True operand of the SELECT_CC/CMOV nodes).
32819   MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
32820   // Copy the PHI result to the register defined by the second CMOV.
32821   BuildMI(*SinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())), DL,
32822           TII->get(TargetOpcode::COPY),
32823           SecondCascadedCMOV.getOperand(0).getReg())
32824       .addReg(FirstCMOV.getOperand(0).getReg());
32825 
32826   // Now remove the CMOVs.
32827   FirstCMOV.eraseFromParent();
32828   SecondCascadedCMOV.eraseFromParent();
32829 
32830   return SinkMBB;
32831 }
32832 
32833 MachineBasicBlock *
32834 X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
32835                                      MachineBasicBlock *ThisMBB) const {
32836   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
32837   const DebugLoc &DL = MI.getDebugLoc();
32838 
32839   // To "insert" a SELECT_CC instruction, we actually have to insert the
32840   // diamond control-flow pattern.  The incoming instruction knows the
32841   // destination vreg to set, the condition code register to branch on, the
32842   // true/false values to select between and a branch opcode to use.
32843 
32844   //  ThisMBB:
32845   //  ...
32846   //   TrueVal = ...
32847   //   cmpTY ccX, r1, r2
32848   //   bCC copy1MBB
32849   //   fallthrough --> FalseMBB
32850 
32851   // This code lowers all pseudo-CMOV instructions. Generally it lowers these
32852   // as described above, by inserting a BB, and then making a PHI at the join
32853   // point to select the true and false operands of the CMOV in the PHI.
32854   //
32855   // The code also handles two different cases of multiple CMOV opcodes
32856   // in a row.
32857   //
32858   // Case 1:
32859   // In this case, there are multiple CMOVs in a row, all of which are based on
32860   // the same condition setting (or the exact opposite condition setting).
32861   // In this case we can lower all the CMOVs using a single inserted BB, and
32862   // then make a number of PHIs at the join point to model the CMOVs. The only
32863   // trickiness here is that in a case like:
32864   //
32865   // t2 = CMOV cond1 t1, f1
32866   // t3 = CMOV cond1 t2, f2
32867   //
32868   // when rewriting this into PHIs, we have to perform some renaming on the
32869   // temps since you cannot have a PHI operand refer to a PHI result earlier
32870   // in the same block.  The "simple" but wrong lowering would be:
32871   //
32872   // t2 = PHI t1(BB1), f1(BB2)
32873   // t3 = PHI t2(BB1), f2(BB2)
32874   //
32875   // but clearly t2 is not defined in BB1, so that is incorrect. The proper
32876   // renaming is to note that on the path through BB1, t2 is really just a
32877   // copy of t1, and do that renaming, properly generating:
32878   //
32879   // t2 = PHI t1(BB1), f1(BB2)
32880   // t3 = PHI t1(BB1), f2(BB2)
32881   //
32882   // Case 2:
32883   // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
32884   // function - EmitLoweredCascadedSelect.
32885 
32886   X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
32887   X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
32888   MachineInstr *LastCMOV = &MI;
32889   MachineBasicBlock::iterator NextMIIt = MachineBasicBlock::iterator(MI);
32890 
32891   // Check for case 1, where there are multiple CMOVs with the same condition
32892   // first.  Of the two cases of multiple CMOV lowerings, case 1 reduces the
32893   // number of jumps the most.
32894 
32895   if (isCMOVPseudo(MI)) {
32896     // See if we have a string of CMOVS with the same condition. Skip over
32897     // intervening debug insts.
32898     while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
32899            (NextMIIt->getOperand(3).getImm() == CC ||
32900             NextMIIt->getOperand(3).getImm() == OppCC)) {
32901       LastCMOV = &*NextMIIt;
32902       NextMIIt = next_nodbg(NextMIIt, ThisMBB->end());
32903     }
32904   }
32905 
32906   // Check for case 2, but only if we didn't already find case 1 above,
32907   // as indicated by LastCMOV == &MI.
32908   if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
32909       NextMIIt->getOpcode() == MI.getOpcode() &&
32910       NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
32911       NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
32912       NextMIIt->getOperand(1).isKill()) {
32913     return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
32914   }
32915 
32916   const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
32917   MachineFunction *F = ThisMBB->getParent();
32918   MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
32919   MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
32920 
32921   MachineFunction::iterator It = ++ThisMBB->getIterator();
32922   F->insert(It, FalseMBB);
32923   F->insert(It, SinkMBB);
32924 
32925   // If the EFLAGS register isn't dead in the terminator, then claim that it's
32926   // live into the sink and copy blocks.
32927   const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
32928   if (!LastCMOV->killsRegister(X86::EFLAGS) &&
32929       !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
32930     FalseMBB->addLiveIn(X86::EFLAGS);
32931     SinkMBB->addLiveIn(X86::EFLAGS);
32932   }
32933 
32934   // Transfer any debug instructions inside the CMOV sequence to the sunk block.
32935   auto DbgEnd = MachineBasicBlock::iterator(LastCMOV);
32936   auto DbgIt = MachineBasicBlock::iterator(MI);
32937   while (DbgIt != DbgEnd) {
32938     auto Next = std::next(DbgIt);
32939     if (DbgIt->isDebugInstr())
32940       SinkMBB->push_back(DbgIt->removeFromParent());
32941     DbgIt = Next;
32942   }
32943 
32944   // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
32945   SinkMBB->splice(SinkMBB->end(), ThisMBB,
32946                   std::next(MachineBasicBlock::iterator(LastCMOV)),
32947                   ThisMBB->end());
32948   SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
32949 
32950   // Fallthrough block for ThisMBB.
32951   ThisMBB->addSuccessor(FalseMBB);
32952   // The true block target of the first (or only) branch is always a SinkMBB.
32953   ThisMBB->addSuccessor(SinkMBB);
32954   // Fallthrough block for FalseMBB.
32955   FalseMBB->addSuccessor(SinkMBB);
32956 
32957   // Create the conditional branch instruction.
32958   BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);
32959 
32960   //  SinkMBB:
32961   //   %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
32962   //  ...
32963   MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
32964   MachineBasicBlock::iterator MIItEnd =
32965       std::next(MachineBasicBlock::iterator(LastCMOV));
32966   createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
32967 
32968   // Now remove the CMOV(s).
32969   ThisMBB->erase(MIItBegin, MIItEnd);
32970 
32971   return SinkMBB;
32972 }
32973 
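// Returns the SUB-immediate opcode with the smallest encoding that can hold
// Imm. For instance (editorial example), a 64-bit probe step of 16 fits in a
// signed 8-bit immediate and yields X86::SUB64ri8, while 4096 needs the ri32
// form.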
32974 static unsigned getSUBriOpcode(bool IsLP64, int64_t Imm) {
32975   if (IsLP64) {
32976     if (isInt<8>(Imm))
32977       return X86::SUB64ri8;
32978     return X86::SUB64ri32;
32979   } else {
32980     if (isInt<8>(Imm))
32981       return X86::SUB32ri8;
32982     return X86::SUB32ri;
32983   }
32984 }
32985 
32986 MachineBasicBlock *
32987 X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI,
32988                                            MachineBasicBlock *MBB) const {
32989   MachineFunction *MF = MBB->getParent();
32990   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
32991   const X86FrameLowering &TFI = *Subtarget.getFrameLowering();
32992   const DebugLoc &DL = MI.getDebugLoc();
32993   const BasicBlock *LLVM_BB = MBB->getBasicBlock();
32994 
32995   const unsigned ProbeSize = getStackProbeSize(*MF);
32996 
32997   MachineRegisterInfo &MRI = MF->getRegInfo();
32998   MachineBasicBlock *testMBB = MF->CreateMachineBasicBlock(LLVM_BB);
32999   MachineBasicBlock *tailMBB = MF->CreateMachineBasicBlock(LLVM_BB);
33000   MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB);
33001 
33002   MachineFunction::iterator MBBIter = ++MBB->getIterator();
33003   MF->insert(MBBIter, testMBB);
33004   MF->insert(MBBIter, blockMBB);
33005   MF->insert(MBBIter, tailMBB);
33006 
33007   Register sizeVReg = MI.getOperand(1).getReg();
33008 
33009   Register physSPReg = TFI.Uses64BitFramePtr ? X86::RSP : X86::ESP;
33010 
33011   Register TmpStackPtr = MRI.createVirtualRegister(
33012       TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
33013   Register FinalStackPtr = MRI.createVirtualRegister(
33014       TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
33015 
33016   BuildMI(*MBB, {MI}, DL, TII->get(TargetOpcode::COPY), TmpStackPtr)
33017       .addReg(physSPReg);
33018   {
33019     const unsigned Opc = TFI.Uses64BitFramePtr ? X86::SUB64rr : X86::SUB32rr;
33020     BuildMI(*MBB, {MI}, DL, TII->get(Opc), FinalStackPtr)
33021         .addReg(TmpStackPtr)
33022         .addReg(sizeVReg);
33023   }
33024 
33025   // Loop test: stop probing once the stack pointer has reached the final value.
33026 
33027   BuildMI(testMBB, DL,
33028           TII->get(TFI.Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
33029       .addReg(FinalStackPtr)
33030       .addReg(physSPReg);
33031 
33032   BuildMI(testMBB, DL, TII->get(X86::JCC_1))
33033       .addMBB(tailMBB)
33034       .addImm(X86::COND_GE);
33035   testMBB->addSuccessor(blockMBB);
33036   testMBB->addSuccessor(tailMBB);
33037 
33038   // Touch the block then extend it. This is done in the opposite order to a
33039   // static probe, where we allocate then touch, to avoid having to probe the
33040   // tail of the static alloca. Possible scenarios are:
33041   //
33042   //       + ---- <- ------------ <- ------------- <- ------------ +
33043   //       |                                                       |
33044   // [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] + -> [dyn probe] -> [page alloc] -> [dyn probe] -> [tail alloc] +
33045   //                                                               |                                                               |
33046   //                                                               + <- ----------- <- ------------ <- ----------- <- ------------ +
33047   //
33048   // The property we want to enforce is to never have more than [page alloc] between two probes.
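  //
  // Rough shape of the emitted loop for the 64-bit case (editorial sketch;
  // "final", "size" and "result" stand for the virtual registers created
  // below, not physical registers):
  //   copy tmp, rsp
  //   sub  final, tmp, size
  // test:
  //   cmp  final, rsp          ; stop once rsp has dropped to (or below) final
  //   jge  tail                ; (signed compare)
  //   xor  qword ptr [rsp], 0  ; touch the current page
  //   sub  rsp, ProbeSize
  //   jmp  test
  // tail:
  //   copy result, final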
33049 
33050   const unsigned XORMIOpc =
33051       TFI.Uses64BitFramePtr ? X86::XOR64mi8 : X86::XOR32mi8;
33052   addRegOffset(BuildMI(blockMBB, DL, TII->get(XORMIOpc)), physSPReg, false, 0)
33053       .addImm(0);
33054 
33055   BuildMI(blockMBB, DL,
33056           TII->get(getSUBriOpcode(TFI.Uses64BitFramePtr, ProbeSize)), physSPReg)
33057       .addReg(physSPReg)
33058       .addImm(ProbeSize);
33059 
33060 
33061   BuildMI(blockMBB, DL, TII->get(X86::JMP_1)).addMBB(testMBB);
33062   blockMBB->addSuccessor(testMBB);
33063 
33064   // Replace original instruction by the expected stack ptr
33065   BuildMI(tailMBB, DL, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
33066       .addReg(FinalStackPtr);
33067 
33068   tailMBB->splice(tailMBB->end(), MBB,
33069                   std::next(MachineBasicBlock::iterator(MI)), MBB->end());
33070   tailMBB->transferSuccessorsAndUpdatePHIs(MBB);
33071   MBB->addSuccessor(testMBB);
33072 
33073   // Delete the original pseudo instruction.
33074   MI.eraseFromParent();
33075 
33076   // And we're done.
33077   return tailMBB;
33078 }
33079 
33080 MachineBasicBlock *
33081 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
33082                                         MachineBasicBlock *BB) const {
33083   MachineFunction *MF = BB->getParent();
33084   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
33085   const DebugLoc &DL = MI.getDebugLoc();
33086   const BasicBlock *LLVM_BB = BB->getBasicBlock();
33087 
33088   assert(MF->shouldSplitStack());
33089 
33090   const bool Is64Bit = Subtarget.is64Bit();
33091   const bool IsLP64 = Subtarget.isTarget64BitLP64();
33092 
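  // (Editorial note) TlsReg/TlsOffset below locate the per-thread stack-limit
  // slot that split-stack code compares against; the constants are expected to
  // match the layout the __morestack runtime uses for each ABI.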
33093   const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
33094   const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
33095 
33096   // BB:
33097   //  ... [Till the alloca]
33098   // If stacklet is not large enough, jump to mallocMBB
33099   //
33100   // bumpMBB:
33101   //  Allocate by subtracting from RSP
33102   //  Jump to continueMBB
33103   //
33104   // mallocMBB:
33105   //  Allocate by call to runtime
33106   //
33107   // continueMBB:
33108   //  ...
33109   //  [rest of original BB]
33110   //
33111 
33112   MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
33113   MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
33114   MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
33115 
33116   MachineRegisterInfo &MRI = MF->getRegInfo();
33117   const TargetRegisterClass *AddrRegClass =
33118       getRegClassFor(getPointerTy(MF->getDataLayout()));
33119 
33120   Register mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
33121            bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
33122            tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
33123            SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
33124            sizeVReg = MI.getOperand(1).getReg(),
33125            physSPReg =
33126                IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
33127 
33128   MachineFunction::iterator MBBIter = ++BB->getIterator();
33129 
33130   MF->insert(MBBIter, bumpMBB);
33131   MF->insert(MBBIter, mallocMBB);
33132   MF->insert(MBBIter, continueMBB);
33133 
33134   continueMBB->splice(continueMBB->begin(), BB,
33135                       std::next(MachineBasicBlock::iterator(MI)), BB->end());
33136   continueMBB->transferSuccessorsAndUpdatePHIs(BB);
33137 
33138   // Add code to the main basic block to check if the stack limit has been hit,
33139   // and if so, jump to mallocMBB otherwise to bumpMBB.
33140   BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
33141   BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
33142     .addReg(tmpSPVReg).addReg(sizeVReg);
33143   BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
33144     .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
33145     .addReg(SPLimitVReg);
33146   BuildMI(BB, DL, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G);
33147 
33148   // bumpMBB simply decreases the stack pointer, since we know the current
33149   // stacklet has enough space.
33150   BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
33151     .addReg(SPLimitVReg);
33152   BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
33153     .addReg(SPLimitVReg);
33154   BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
33155 
33156   // Calls into a routine in libgcc to allocate more space from the heap.
33157   const uint32_t *RegMask =
33158       Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
33159   if (IsLP64) {
33160     BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
33161       .addReg(sizeVReg);
33162     BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
33163       .addExternalSymbol("__morestack_allocate_stack_space")
33164       .addRegMask(RegMask)
33165       .addReg(X86::RDI, RegState::Implicit)
33166       .addReg(X86::RAX, RegState::ImplicitDefine);
33167   } else if (Is64Bit) {
33168     BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
33169       .addReg(sizeVReg);
33170     BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
33171       .addExternalSymbol("__morestack_allocate_stack_space")
33172       .addRegMask(RegMask)
33173       .addReg(X86::EDI, RegState::Implicit)
33174       .addReg(X86::EAX, RegState::ImplicitDefine);
33175   } else {
33176     BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
33177       .addImm(12);
33178     BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
33179     BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
33180       .addExternalSymbol("__morestack_allocate_stack_space")
33181       .addRegMask(RegMask)
33182       .addReg(X86::EAX, RegState::ImplicitDefine);
33183   }
33184 
33185   if (!Is64Bit)
33186     BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
33187       .addImm(16);
33188 
33189   BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
33190     .addReg(IsLP64 ? X86::RAX : X86::EAX);
33191   BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
33192 
33193   // Set up the CFG correctly.
33194   BB->addSuccessor(bumpMBB);
33195   BB->addSuccessor(mallocMBB);
33196   mallocMBB->addSuccessor(continueMBB);
33197   bumpMBB->addSuccessor(continueMBB);
33198 
33199   // Take care of the PHI nodes.
33200   BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
33201           MI.getOperand(0).getReg())
33202       .addReg(mallocPtrVReg)
33203       .addMBB(mallocMBB)
33204       .addReg(bumpSPPtrVReg)
33205       .addMBB(bumpMBB);
33206 
33207   // Delete the original pseudo instruction.
33208   MI.eraseFromParent();
33209 
33210   // And we're done.
33211   return continueMBB;
33212 }
33213 
33214 MachineBasicBlock *
33215 X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
33216                                        MachineBasicBlock *BB) const {
33217   MachineFunction *MF = BB->getParent();
33218   const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
33219   MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
33220   const DebugLoc &DL = MI.getDebugLoc();
33221 
33222   assert(!isAsynchronousEHPersonality(
33223              classifyEHPersonality(MF->getFunction().getPersonalityFn())) &&
33224          "SEH does not use catchret!");
33225 
33226   // Only 32-bit EH needs to worry about manually restoring stack pointers.
33227   if (!Subtarget.is32Bit())
33228     return BB;
33229 
33230   // C++ EH creates a new target block to hold the restore code, and wires up
33231   // the new block to the return destination with a normal JMP_4.
33232   MachineBasicBlock *RestoreMBB =
33233       MF->CreateMachineBasicBlock(BB->getBasicBlock());
33234   assert(BB->succ_size() == 1);
33235   MF->insert(std::next(BB->getIterator()), RestoreMBB);
33236   RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
33237   BB->addSuccessor(RestoreMBB);
33238   MI.getOperand(0).setMBB(RestoreMBB);
33239 
33240   // Marking this as an EH pad but not a funclet entry block causes PEI to
33241   // restore stack pointers in the block.
33242   RestoreMBB->setIsEHPad(true);
33243 
33244   auto RestoreMBBI = RestoreMBB->begin();
33245   BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
33246   return BB;
33247 }
33248 
33249 MachineBasicBlock *
33250 X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
33251                                       MachineBasicBlock *BB) const {
33252   // Here we replace TLSADDR with the sequence:
33253   // adjust_stackdown -> TLSADDR -> adjust_stackup.
33254   // We need this because TLSADDR is lowered into a call
33255   // inside MC; without the two markers, shrink-wrapping
33256   // may push the prologue/epilogue past them.
33257   const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
33258   const DebugLoc &DL = MI.getDebugLoc();
33259   MachineFunction &MF = *BB->getParent();
33260 
33261   // Emit CALLSEQ_START right before the instruction.
33262   unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
33263   MachineInstrBuilder CallseqStart =
33264     BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
33265   BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
33266 
33267   // Emit CALLSEQ_END right after the instruction.
33268   // We don't call erase from parent because we want to keep the
33269   // original instruction around.
33270   unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
33271   MachineInstrBuilder CallseqEnd =
33272     BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
33273   BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
33274 
33275   return BB;
33276 }
33277 
33278 MachineBasicBlock *
33279 X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
33280                                       MachineBasicBlock *BB) const {
33281   // This is pretty easy.  We're taking the value that we received from
33282   // our load from the relocation, sticking it in either RDI (x86-64)
33283   // or EAX and doing an indirect call.  The return value will then
33284   // be in the normal return register.
33285   MachineFunction *F = BB->getParent();
33286   const X86InstrInfo *TII = Subtarget.getInstrInfo();
33287   const DebugLoc &DL = MI.getDebugLoc();
33288 
33289   assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
33290   assert(MI.getOperand(3).isGlobal() && "This should be a global");
33291 
33292   // Get a register mask for the lowered call.
33293   // FIXME: The 32-bit calls have non-standard calling conventions. Use a
33294   // proper register mask.
33295   const uint32_t *RegMask =
33296       Subtarget.is64Bit() ?
33297       Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
33298       Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
33299   if (Subtarget.is64Bit()) {
33300     MachineInstrBuilder MIB =
33301         BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
33302             .addReg(X86::RIP)
33303             .addImm(0)
33304             .addReg(0)
33305             .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
33306                               MI.getOperand(3).getTargetFlags())
33307             .addReg(0);
33308     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
33309     addDirectMem(MIB, X86::RDI);
33310     MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
33311   } else if (!isPositionIndependent()) {
33312     MachineInstrBuilder MIB =
33313         BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
33314             .addReg(0)
33315             .addImm(0)
33316             .addReg(0)
33317             .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
33318                               MI.getOperand(3).getTargetFlags())
33319             .addReg(0);
33320     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
33321     addDirectMem(MIB, X86::EAX);
33322     MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
33323   } else {
33324     MachineInstrBuilder MIB =
33325         BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
33326             .addReg(TII->getGlobalBaseReg(F))
33327             .addImm(0)
33328             .addReg(0)
33329             .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
33330                               MI.getOperand(3).getTargetFlags())
33331             .addReg(0);
33332     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
33333     addDirectMem(MIB, X86::EAX);
33334     MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
33335   }
33336 
33337   MI.eraseFromParent(); // The pseudo instruction is gone now.
33338   return BB;
33339 }
33340 
33341 static unsigned getOpcodeForIndirectThunk(unsigned RPOpc) {
33342   switch (RPOpc) {
33343   case X86::INDIRECT_THUNK_CALL32:
33344     return X86::CALLpcrel32;
33345   case X86::INDIRECT_THUNK_CALL64:
33346     return X86::CALL64pcrel32;
33347   case X86::INDIRECT_THUNK_TCRETURN32:
33348     return X86::TCRETURNdi;
33349   case X86::INDIRECT_THUNK_TCRETURN64:
33350     return X86::TCRETURNdi64;
33351   }
33352   llvm_unreachable("not indirect thunk opcode");
33353 }
33354 
33355 static const char *getIndirectThunkSymbol(const X86Subtarget &Subtarget,
33356                                           unsigned Reg) {
33357   if (Subtarget.useRetpolineExternalThunk()) {
33358     // When using an external thunk for retpolines, we pick names that match the
33359     // names GCC happens to use as well. This helps simplify the implementation
33360     // of the thunks for kernels where they have no easy ability to create
33361     // aliases and are doing non-trivial configuration of the thunk's body. For
33362     // example, the Linux kernel will do boot-time hot patching of the thunk
33363     // bodies and cannot easily export aliases of these to loaded modules.
33364     //
33365     // Note that at any point in the future, we may need to change the semantics
33366     // of how we implement retpolines and at that time will likely change the
33367     // name of the called thunk. Essentially, there is no hard guarantee that
33368     // LLVM will generate calls to specific thunks; we merely make a best-effort
33369     // attempt to help out kernels and other systems where duplicating the
33370     // thunks is costly.
33371     switch (Reg) {
33372     case X86::EAX:
33373       assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
33374       return "__x86_indirect_thunk_eax";
33375     case X86::ECX:
33376       assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
33377       return "__x86_indirect_thunk_ecx";
33378     case X86::EDX:
33379       assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
33380       return "__x86_indirect_thunk_edx";
33381     case X86::EDI:
33382       assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
33383       return "__x86_indirect_thunk_edi";
33384     case X86::R11:
33385       assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
33386       return "__x86_indirect_thunk_r11";
33387     }
33388     llvm_unreachable("unexpected reg for external indirect thunk");
33389   }
33390 
33391   if (Subtarget.useRetpolineIndirectCalls() ||
33392       Subtarget.useRetpolineIndirectBranches()) {
33393     // When targeting an internal COMDAT thunk, use an LLVM-specific name.
33394     switch (Reg) {
33395     case X86::EAX:
33396       assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
33397       return "__llvm_retpoline_eax";
33398     case X86::ECX:
33399       assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
33400       return "__llvm_retpoline_ecx";
33401     case X86::EDX:
33402       assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
33403       return "__llvm_retpoline_edx";
33404     case X86::EDI:
33405       assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
33406       return "__llvm_retpoline_edi";
33407     case X86::R11:
33408       assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
33409       return "__llvm_retpoline_r11";
33410     }
33411     llvm_unreachable("unexpected reg for retpoline");
33412   }
33413 
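  // LVI (Load Value Injection) mitigation routes indirect calls and branches
  // through a single R11-based thunk, so it is only available on 64-bit targets.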
33414   if (Subtarget.useLVIControlFlowIntegrity()) {
33415     assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
33416     return "__llvm_lvi_thunk_r11";
33417   }
33418   llvm_unreachable("getIndirectThunkSymbol() invoked without thunk feature");
33419 }
33420 
33421 MachineBasicBlock *
33422 X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI,
33423                                             MachineBasicBlock *BB) const {
33424   // Copy the virtual register into the R11 physical register and
33425   // call the retpoline thunk.
33426   const DebugLoc &DL = MI.getDebugLoc();
33427   const X86InstrInfo *TII = Subtarget.getInstrInfo();
33428   Register CalleeVReg = MI.getOperand(0).getReg();
33429   unsigned Opc = getOpcodeForIndirectThunk(MI.getOpcode());
33430 
33431   // Find an available scratch register to hold the callee. On 64-bit, we can
33432   // just use R11, but we scan for uses anyway to ensure we don't generate
33433   // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
33434   // already a register use operand to the call to hold the callee. If none
33435   // are available, use EDI instead. EDI is chosen because EBX is the PIC base
33436   // register and ESI is the base pointer to realigned stack frames with VLAs.
33437   SmallVector<unsigned, 3> AvailableRegs;
33438   if (Subtarget.is64Bit())
33439     AvailableRegs.push_back(X86::R11);
33440   else
33441     AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
33442 
33443   // Zero out any registers that are already used.
33444   for (const auto &MO : MI.operands()) {
33445     if (MO.isReg() && MO.isUse())
33446       for (unsigned &Reg : AvailableRegs)
33447         if (Reg == MO.getReg())
33448           Reg = 0;
33449   }
33450 
33451   // Choose the first remaining non-zero available register.
33452   unsigned AvailableReg = 0;
33453   for (unsigned MaybeReg : AvailableRegs) {
33454     if (MaybeReg) {
33455       AvailableReg = MaybeReg;
33456       break;
33457     }
33458   }
33459   if (!AvailableReg)
33460     report_fatal_error("calling convention incompatible with retpoline, no "
33461                        "available registers");
33462 
33463   const char *Symbol = getIndirectThunkSymbol(Subtarget, AvailableReg);
33464 
33465   BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg)
33466       .addReg(CalleeVReg);
33467   MI.getOperand(0).ChangeToES(Symbol);
33468   MI.setDesc(TII->get(Opc));
33469   MachineInstrBuilder(*BB->getParent(), &MI)
33470       .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
33471   return BB;
33472 }
33473 
33474 /// SetJmp implies a future control-flow change upon calling the corresponding
33475 /// LongJmp.
33476 /// Instead of using the 'return' instruction, the long jump fixes the stack and
33477 /// performs an indirect branch. To do so it uses the registers that were stored
33478 /// in the jump buffer (when calling SetJmp).
33479 /// If the shadow stack is enabled, we need to fix it as well, because some
33480 /// return addresses will be skipped.
33481 /// This function saves the SSP for later fixing by
33482 /// emitLongJmpShadowStackFix.
33483 /// \sa emitLongJmpShadowStackFix
33484 /// \param [in] MI The temporary Machine Instruction for the builtin.
33485 /// \param [in] MBB The Machine Basic Block that will be modified.
33486 void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
33487                                                  MachineBasicBlock *MBB) const {
33488   const DebugLoc &DL = MI.getDebugLoc();
33489   MachineFunction *MF = MBB->getParent();
33490   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
33491   MachineRegisterInfo &MRI = MF->getRegInfo();
33492   MachineInstrBuilder MIB;
33493 
33494   // Memory Reference.
33495   SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
33496                                            MI.memoperands_end());
33497 
33498   // Initialize a register with zero.
33499   MVT PVT = getPointerTy(MF->getDataLayout());
33500   const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
33501   Register ZReg = MRI.createVirtualRegister(PtrRC);
33502   unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
33503   BuildMI(*MBB, MI, DL, TII->get(XorRROpc))
33504       .addDef(ZReg)
33505       .addReg(ZReg, RegState::Undef)
33506       .addReg(ZReg, RegState::Undef);
33507 
33508   // Read the current SSP Register value to the zeroed register.
33509   Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
33510   unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
33511   BuildMI(*MBB, MI, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
33512 
33513   // Write the SSP register value to offset 3 in input memory buffer.
33514   unsigned PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
33515   MIB = BuildMI(*MBB, MI, DL, TII->get(PtrStoreOpc));
33516   const int64_t SSPOffset = 3 * PVT.getStoreSize();
33517   const unsigned MemOpndSlot = 1;
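  // Operand 0 of the setjmp pseudo is the result register; the address operands
  // for the jump buffer start at operand 1.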
33518   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
33519     if (i == X86::AddrDisp)
33520       MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset);
33521     else
33522       MIB.add(MI.getOperand(MemOpndSlot + i));
33523   }
33524   MIB.addReg(SSPCopyReg);
33525   MIB.setMemRefs(MMOs);
33526 }
33527 
33528 MachineBasicBlock *
33529 X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
33530                                     MachineBasicBlock *MBB) const {
33531   const DebugLoc &DL = MI.getDebugLoc();
33532   MachineFunction *MF = MBB->getParent();
33533   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
33534   const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
33535   MachineRegisterInfo &MRI = MF->getRegInfo();
33536 
33537   const BasicBlock *BB = MBB->getBasicBlock();
33538   MachineFunction::iterator I = ++MBB->getIterator();
33539 
33540   // Memory Reference
33541   SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
33542                                            MI.memoperands_end());
33543 
33544   unsigned DstReg;
33545   unsigned MemOpndSlot = 0;
33546 
33547   unsigned CurOp = 0;
33548 
33549   DstReg = MI.getOperand(CurOp++).getReg();
33550   const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
33551   assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
33552   (void)TRI;
33553   Register mainDstReg = MRI.createVirtualRegister(RC);
33554   Register restoreDstReg = MRI.createVirtualRegister(RC);
33555 
33556   MemOpndSlot = CurOp;
33557 
33558   MVT PVT = getPointerTy(MF->getDataLayout());
33559   assert((PVT == MVT::i64 || PVT == MVT::i32) &&
33560          "Invalid Pointer Size!");
33561 
33562   // For v = setjmp(buf), we generate
33563   //
33564   // thisMBB:
33565   //  buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
33566   //  SjLjSetup restoreMBB
33567   //
33568   // mainMBB:
33569   //  v_main = 0
33570   //
33571   // sinkMBB:
33572   //  v = phi(main, restore)
33573   //
33574   // restoreMBB:
33575   //  if base pointer being used, load it from frame
33576   //  v_restore = 1
33577 
33578   MachineBasicBlock *thisMBB = MBB;
33579   MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
33580   MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
33581   MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
33582   MF->insert(I, mainMBB);
33583   MF->insert(I, sinkMBB);
33584   MF->push_back(restoreMBB);
33585   restoreMBB->setHasAddressTaken();
33586 
33587   MachineInstrBuilder MIB;
33588 
33589   // Transfer the remainder of BB and its successor edges to sinkMBB.
33590   sinkMBB->splice(sinkMBB->begin(), MBB,
33591                   std::next(MachineBasicBlock::iterator(MI)), MBB->end());
33592   sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
33593 
33594   // thisMBB:
33595   unsigned PtrStoreOpc = 0;
33596   unsigned LabelReg = 0;
33597   const int64_t LabelOffset = 1 * PVT.getStoreSize();
33598   bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
33599                      !isPositionIndependent();
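  // With the small code model and no PIC, the address of restoreMBB can be
  // stored directly as an immediate; otherwise it is materialized into a
  // register with LEA below.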
33600 
33601   // Prepare IP either in reg or imm.
33602   if (!UseImmLabel) {
33603     PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
33604     const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
33605     LabelReg = MRI.createVirtualRegister(PtrRC);
33606     if (Subtarget.is64Bit()) {
33607       MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
33608               .addReg(X86::RIP)
33609               .addImm(0)
33610               .addReg(0)
33611               .addMBB(restoreMBB)
33612               .addReg(0);
33613     } else {
33614       const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
33615       MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
33616               .addReg(XII->getGlobalBaseReg(MF))
33617               .addImm(0)
33618               .addReg(0)
33619               .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
33620               .addReg(0);
33621     }
33622   } else
33623     PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
33624   // Store IP
33625   MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
33626   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
33627     if (i == X86::AddrDisp)
33628       MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
33629     else
33630       MIB.add(MI.getOperand(MemOpndSlot + i));
33631   }
33632   if (!UseImmLabel)
33633     MIB.addReg(LabelReg);
33634   else
33635     MIB.addMBB(restoreMBB);
33636   MIB.setMemRefs(MMOs);
33637 
33638   if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
33639     emitSetJmpShadowStackFix(MI, thisMBB);
33640   }
33641 
33642   // Setup
33643   MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
33644           .addMBB(restoreMBB);
33645 
33646   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
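  // No registers are treated as preserved across the setup, presumably because
  // control can resume in restoreMBB from an arbitrary longjmp.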
33647   MIB.addRegMask(RegInfo->getNoPreservedMask());
33648   thisMBB->addSuccessor(mainMBB);
33649   thisMBB->addSuccessor(restoreMBB);
33650 
33651   // mainMBB:
33652   //  EAX = 0
33653   BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
33654   mainMBB->addSuccessor(sinkMBB);
33655 
33656   // sinkMBB:
33657   BuildMI(*sinkMBB, sinkMBB->begin(), DL,
33658           TII->get(X86::PHI), DstReg)
33659     .addReg(mainDstReg).addMBB(mainMBB)
33660     .addReg(restoreDstReg).addMBB(restoreMBB);
33661 
33662   // restoreMBB:
33663   if (RegInfo->hasBasePointer(*MF)) {
33664     const bool Uses64BitFramePtr =
33665         Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
33666     X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
33667     X86FI->setRestoreBasePointer(MF);
33668     Register FramePtr = RegInfo->getFrameRegister(*MF);
33669     Register BasePtr = RegInfo->getBaseRegister();
33670     unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
33671     addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
33672                  FramePtr, true, X86FI->getRestoreBasePointerOffset())
33673       .setMIFlag(MachineInstr::FrameSetup);
33674   }
33675   BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
33676   BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
33677   restoreMBB->addSuccessor(sinkMBB);
33678 
33679   MI.eraseFromParent();
33680   return sinkMBB;
33681 }
33682 
33683 /// Fix the shadow stack using the previously saved SSP pointer.
33684 /// \sa emitSetJmpShadowStackFix
33685 /// \param [in] MI The temporary Machine Instruction for the builtin.
33686 /// \param [in] MBB The Machine Basic Block that will be modified.
33687 /// \return The sink MBB that will perform the future indirect branch.
33688 MachineBasicBlock *
33689 X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
33690                                              MachineBasicBlock *MBB) const {
33691   const DebugLoc &DL = MI.getDebugLoc();
33692   MachineFunction *MF = MBB->getParent();
33693   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
33694   MachineRegisterInfo &MRI = MF->getRegInfo();
33695 
33696   // Memory Reference
33697   SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
33698                                            MI.memoperands_end());
33699 
33700   MVT PVT = getPointerTy(MF->getDataLayout());
33701   const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
33702 
33703   // checkSspMBB:
33704   //         xor vreg1, vreg1
33705   //         rdssp vreg1
33706   //         test vreg1, vreg1
33707   //         je sinkMBB   # Jump if Shadow Stack is not supported
33708   // fallMBB:
33709   //         mov buf+24/12(%rip), vreg2
33710   //         sub vreg1, vreg2
33711   //         jbe sinkMBB  # No need to fix the Shadow Stack
33712   // fixShadowMBB:
33713   //         shr 3/2, vreg2
33714   //         incssp vreg2  # fix the SSP according to the lower 8 bits
33715   //         shr 8, vreg2
33716   //         je sinkMBB
33717   // fixShadowLoopPrepareMBB:
33718   //         shl vreg2
33719   //         mov 128, vreg3
33720   // fixShadowLoopMBB:
33721   //         incssp vreg3
33722   //         dec vreg2
33723   //         jne fixShadowLoopMBB # Iterate until you finish fixing
33724   //                              # the Shadow Stack
33725   // sinkMBB:
33726 
33727   MachineFunction::iterator I = ++MBB->getIterator();
33728   const BasicBlock *BB = MBB->getBasicBlock();
33729 
33730   MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);
33731   MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
33732   MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);
33733   MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);
33734   MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);
33735   MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
33736   MF->insert(I, checkSspMBB);
33737   MF->insert(I, fallMBB);
33738   MF->insert(I, fixShadowMBB);
33739   MF->insert(I, fixShadowLoopPrepareMBB);
33740   MF->insert(I, fixShadowLoopMBB);
33741   MF->insert(I, sinkMBB);
33742 
33743   // Transfer the remainder of BB and its successor edges to sinkMBB.
33744   sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
33745                   MBB->end());
33746   sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
33747 
33748   MBB->addSuccessor(checkSspMBB);
33749 
33750   // Initialize a register with zero.
33751   Register ZReg = MRI.createVirtualRegister(&X86::GR32RegClass);
33752   BuildMI(checkSspMBB, DL, TII->get(X86::MOV32r0), ZReg);
33753 
33754   if (PVT == MVT::i64) {
33755     Register TmpZReg = MRI.createVirtualRegister(PtrRC);
33756     BuildMI(checkSspMBB, DL, TII->get(X86::SUBREG_TO_REG), TmpZReg)
33757       .addImm(0)
33758       .addReg(ZReg)
33759       .addImm(X86::sub_32bit);
33760     ZReg = TmpZReg;
33761   }
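  // On 64-bit targets the 32-bit zero is widened with SUBREG_TO_REG; writing a
  // 32-bit GPR already zeroes the upper 32 bits, so no extra instruction is
  // needed.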
33762 
33763   // Read the current SSP Register value to the zeroed register.
33764   Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
33765   unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
33766   BuildMI(checkSspMBB, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
33767 
33768   // Check whether the result of the SSP register is zero and jump directly
33769   // to the sink.
33770   unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr;
33771   BuildMI(checkSspMBB, DL, TII->get(TestRROpc))
33772       .addReg(SSPCopyReg)
33773       .addReg(SSPCopyReg);
33774   BuildMI(checkSspMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
33775   checkSspMBB->addSuccessor(sinkMBB);
33776   checkSspMBB->addSuccessor(fallMBB);
33777 
33778   // Reload the previously saved SSP register value.
33779   Register PrevSSPReg = MRI.createVirtualRegister(PtrRC);
33780   unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
33781   const int64_t SPPOffset = 3 * PVT.getStoreSize();
33782   MachineInstrBuilder MIB =
33783       BuildMI(fallMBB, DL, TII->get(PtrLoadOpc), PrevSSPReg);
33784   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
33785     const MachineOperand &MO = MI.getOperand(i);
33786     if (i == X86::AddrDisp)
33787       MIB.addDisp(MO, SPPOffset);
33788     else if (MO.isReg()) // Don't add the whole operand, we don't want to
33789                          // preserve kill flags.
33790       MIB.addReg(MO.getReg());
33791     else
33792       MIB.add(MO);
33793   }
33794   MIB.setMemRefs(MMOs);
33795 
33796   // Subtract the current SSP from the previous SSP.
33797   Register SspSubReg = MRI.createVirtualRegister(PtrRC);
33798   unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr;
33799   BuildMI(fallMBB, DL, TII->get(SubRROpc), SspSubReg)
33800       .addReg(PrevSSPReg)
33801       .addReg(SSPCopyReg);
33802 
33803   // Jump to sink in case PrevSSPReg <= SSPCopyReg.
33804   BuildMI(fallMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_BE);
33805   fallMBB->addSuccessor(sinkMBB);
33806   fallMBB->addSuccessor(fixShadowMBB);
33807 
33808   // Shift right by 2/3 for 32/64 because incssp multiplies the argument by 4/8.
33809   unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri;
33810   unsigned Offset = (PVT == MVT::i64) ? 3 : 2;
33811   Register SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
33812   BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspFirstShrReg)
33813       .addReg(SspSubReg)
33814       .addImm(Offset);
33815 
33816   // Increase the SSP while looking only at the lower 8 bits of the delta.
33817   unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD;
33818   BuildMI(fixShadowMBB, DL, TII->get(IncsspOpc)).addReg(SspFirstShrReg);
33819 
33820   // Reset the lower 8 bits.
33821   Register SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
33822   BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspSecondShrReg)
33823       .addReg(SspFirstShrReg)
33824       .addImm(8);
33825 
33826   // Jump if the result of the shift is zero.
33827   BuildMI(fixShadowMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
33828   fixShadowMBB->addSuccessor(sinkMBB);
33829   fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);
33830 
33831   // Do a single shift left.
33832   unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64r1 : X86::SHL32r1;
33833   Register SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
33834   BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(ShlR1Opc), SspAfterShlReg)
33835       .addReg(SspSecondShrReg);
33836 
33837   // Save the value 128 to a register (will be used next with incssp).
33838   Register Value128InReg = MRI.createVirtualRegister(PtrRC);
33839   unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri;
33840   BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(MovRIOpc), Value128InReg)
33841       .addImm(128);
33842   fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB);
33843 
33844   // Since incssp only looks at the lower 8 bits, we might need to do several
33845   // iterations of incssp until we finish fixing the shadow stack.
33846   Register DecReg = MRI.createVirtualRegister(PtrRC);
33847   Register CounterReg = MRI.createVirtualRegister(PtrRC);
33848   BuildMI(fixShadowLoopMBB, DL, TII->get(X86::PHI), CounterReg)
33849       .addReg(SspAfterShlReg)
33850       .addMBB(fixShadowLoopPrepareMBB)
33851       .addReg(DecReg)
33852       .addMBB(fixShadowLoopMBB);
33853 
33854   // Every iteration we increase the SSP by 128.
33855   BuildMI(fixShadowLoopMBB, DL, TII->get(IncsspOpc)).addReg(Value128InReg);
33856 
33857   // Every iteration we decrement the counter by 1.
33858   unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r;
33859   BuildMI(fixShadowLoopMBB, DL, TII->get(DecROpc), DecReg).addReg(CounterReg);
33860 
33861   // Jump if the counter is not zero yet.
33862   BuildMI(fixShadowLoopMBB, DL, TII->get(X86::JCC_1)).addMBB(fixShadowLoopMBB).addImm(X86::COND_NE);
33863   fixShadowLoopMBB->addSuccessor(sinkMBB);
33864   fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);
33865 
33866   return sinkMBB;
33867 }
33868 
33869 MachineBasicBlock *
33870 X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
33871                                      MachineBasicBlock *MBB) const {
33872   const DebugLoc &DL = MI.getDebugLoc();
33873   MachineFunction *MF = MBB->getParent();
33874   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
33875   MachineRegisterInfo &MRI = MF->getRegInfo();
33876 
33877   // Memory Reference
33878   SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
33879                                            MI.memoperands_end());
33880 
33881   MVT PVT = getPointerTy(MF->getDataLayout());
33882   assert((PVT == MVT::i64 || PVT == MVT::i32) &&
33883          "Invalid Pointer Size!");
33884 
33885   const TargetRegisterClass *RC =
33886     (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
33887   Register Tmp = MRI.createVirtualRegister(RC);
33888   // Since FP is only updated here but NOT referenced, it's treated as GPR.
33889   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
33890   Register FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
33891   Register SP = RegInfo->getStackRegister();
33892 
33893   MachineInstrBuilder MIB;
33894 
33895   const int64_t LabelOffset = 1 * PVT.getStoreSize();
33896   const int64_t SPOffset = 2 * PVT.getStoreSize();
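  // Jump-buffer layout used here: slot 0 holds the frame pointer, slot 1 the
  // resume address, and slot 2 the stack pointer (slot 3 holds the saved SSP
  // when shadow stacks are enabled).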
33897 
33898   unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
33899   unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
33900 
33901   MachineBasicBlock *thisMBB = MBB;
33902 
33903   // When CET and the shadow stack are enabled, we need to fix the Shadow Stack.
33904   if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
33905     thisMBB = emitLongJmpShadowStackFix(MI, thisMBB);
33906   }
33907 
33908   // Reload FP
33909   MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), FP);
33910   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
33911     const MachineOperand &MO = MI.getOperand(i);
33912     if (MO.isReg()) // Don't add the whole operand, we don't want to
33913                     // preserve kill flags.
33914       MIB.addReg(MO.getReg());
33915     else
33916       MIB.add(MO);
33917   }
33918   MIB.setMemRefs(MMOs);
33919 
33920   // Reload IP
33921   MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
33922   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
33923     const MachineOperand &MO = MI.getOperand(i);
33924     if (i == X86::AddrDisp)
33925       MIB.addDisp(MO, LabelOffset);
33926     else if (MO.isReg()) // Don't add the whole operand, we don't want to
33927                          // preserve kill flags.
33928       MIB.addReg(MO.getReg());
33929     else
33930       MIB.add(MO);
33931   }
33932   MIB.setMemRefs(MMOs);
33933 
33934   // Reload SP
33935   MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), SP);
33936   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
33937     if (i == X86::AddrDisp)
33938       MIB.addDisp(MI.getOperand(i), SPOffset);
33939     else
33940       MIB.add(MI.getOperand(i)); // We can preserve the kill flags here, it's
33941                                  // the last instruction of the expansion.
33942   }
33943   MIB.setMemRefs(MMOs);
33944 
33945   // Jump
33946   BuildMI(*thisMBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
33947 
33948   MI.eraseFromParent();
33949   return thisMBB;
33950 }
33951 
33952 void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
33953                                                MachineBasicBlock *MBB,
33954                                                MachineBasicBlock *DispatchBB,
33955                                                int FI) const {
33956   const DebugLoc &DL = MI.getDebugLoc();
33957   MachineFunction *MF = MBB->getParent();
33958   MachineRegisterInfo *MRI = &MF->getRegInfo();
33959   const X86InstrInfo *TII = Subtarget.getInstrInfo();
33960 
33961   MVT PVT = getPointerTy(MF->getDataLayout());
33962   assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
33963 
33964   unsigned Op = 0;
33965   unsigned VR = 0;
33966 
33967   bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
33968                      !isPositionIndependent();
33969 
33970   if (UseImmLabel) {
33971     Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
33972   } else {
33973     const TargetRegisterClass *TRC =
33974         (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
33975     VR = MRI->createVirtualRegister(TRC);
33976     Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
33977 
33978     if (Subtarget.is64Bit())
33979       BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
33980           .addReg(X86::RIP)
33981           .addImm(1)
33982           .addReg(0)
33983           .addMBB(DispatchBB)
33984           .addReg(0);
33985     else
33986       BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
33987           .addReg(0) /* TII->getGlobalBaseReg(MF) */
33988           .addImm(1)
33989           .addReg(0)
33990           .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
33991           .addReg(0);
33992   }
33993 
33994   MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
33995   addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
33996   if (UseImmLabel)
33997     MIB.addMBB(DispatchBB);
33998   else
33999     MIB.addReg(VR);
34000 }
34001 
34002 MachineBasicBlock *
34003 X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
34004                                          MachineBasicBlock *BB) const {
34005   const DebugLoc &DL = MI.getDebugLoc();
34006   MachineFunction *MF = BB->getParent();
34007   MachineRegisterInfo *MRI = &MF->getRegInfo();
34008   const X86InstrInfo *TII = Subtarget.getInstrInfo();
34009   int FI = MF->getFrameInfo().getFunctionContextIndex();
34010 
34011   // Get a mapping of the call site numbers to all of the landing pads they're
34012   // associated with.
34013   DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
34014   unsigned MaxCSNum = 0;
34015   for (auto &MBB : *MF) {
34016     if (!MBB.isEHPad())
34017       continue;
34018 
34019     MCSymbol *Sym = nullptr;
34020     for (const auto &MI : MBB) {
34021       if (MI.isDebugInstr())
34022         continue;
34023 
34024       assert(MI.isEHLabel() && "expected EH_LABEL");
34025       Sym = MI.getOperand(0).getMCSymbol();
34026       break;
34027     }
34028 
34029     if (!MF->hasCallSiteLandingPad(Sym))
34030       continue;
34031 
34032     for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
34033       CallSiteNumToLPad[CSI].push_back(&MBB);
34034       MaxCSNum = std::max(MaxCSNum, CSI);
34035     }
34036   }
34037 
34038   // Get an ordered list of the machine basic blocks for the jump table.
34039   std::vector<MachineBasicBlock *> LPadList;
34040   SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
34041   LPadList.reserve(CallSiteNumToLPad.size());
34042 
34043   for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
34044     for (auto &LP : CallSiteNumToLPad[CSI]) {
34045       LPadList.push_back(LP);
34046       InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
34047     }
34048   }
34049 
34050   assert(!LPadList.empty() &&
34051          "No landing pad destinations for the dispatch jump table!");
34052 
34053   // Create the MBBs for the dispatch code.
34054 
34055   // Shove the dispatch's address into the return slot in the function context.
34056   MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
34057   DispatchBB->setIsEHPad(true);
34058 
34059   MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
34060   BuildMI(TrapBB, DL, TII->get(X86::TRAP));
34061   DispatchBB->addSuccessor(TrapBB);
34062 
34063   MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
34064   DispatchBB->addSuccessor(DispContBB);
34065 
34066   // Insert MBBs.
34067   MF->push_back(DispatchBB);
34068   MF->push_back(DispContBB);
34069   MF->push_back(TrapBB);
34070 
34071   // Insert code into the entry block that creates and registers the function
34072   // context.
34073   SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
34074 
34075   // Create the jump table and associated information
34076   unsigned JTE = getJumpTableEncoding();
34077   MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
34078   unsigned MJTI = JTI->createJumpTableIndex(LPadList);
34079 
34080   const X86RegisterInfo &RI = TII->getRegisterInfo();
34081   // Add a register mask with no preserved registers.  This results in all
34082   // registers being marked as clobbered.
34083   if (RI.hasBasePointer(*MF)) {
34084     const bool FPIs64Bit =
34085         Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
34086     X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
34087     MFI->setRestoreBasePointer(MF);
34088 
34089     Register FP = RI.getFrameRegister(*MF);
34090     Register BP = RI.getBaseRegister();
34091     unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
34092     addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
34093                  MFI->getRestoreBasePointerOffset())
34094         .addRegMask(RI.getNoPreservedMask());
34095   } else {
34096     BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
34097         .addRegMask(RI.getNoPreservedMask());
34098   }
34099 
34100   // IReg is used as an index in a memory operand and therefore can't be SP
34101   Register IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
34102   addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
34103                     Subtarget.is64Bit() ? 8 : 4);
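  // IReg now holds the call-site index loaded from the function context; if it
  // is not smaller than the number of landing pads, the value is bogus, so
  // branch to the trap block.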
34104   BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
34105       .addReg(IReg)
34106       .addImm(LPadList.size());
34107   BuildMI(DispatchBB, DL, TII->get(X86::JCC_1)).addMBB(TrapBB).addImm(X86::COND_AE);
34108 
34109   if (Subtarget.is64Bit()) {
34110     Register BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
34111     Register IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
34112 
34113     // leaq .LJTI0_0(%rip), BReg
34114     BuildMI(DispContBB, DL, TII->get(X86::LEA64r), BReg)
34115         .addReg(X86::RIP)
34116         .addImm(1)
34117         .addReg(0)
34118         .addJumpTableIndex(MJTI)
34119         .addReg(0);
34120     // movzx IReg64, IReg
34121     BuildMI(DispContBB, DL, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
34122         .addImm(0)
34123         .addReg(IReg)
34124         .addImm(X86::sub_32bit);
34125 
34126     switch (JTE) {
34127     case MachineJumpTableInfo::EK_BlockAddress:
34128       // jmpq *(BReg,IReg64,8)
34129       BuildMI(DispContBB, DL, TII->get(X86::JMP64m))
34130           .addReg(BReg)
34131           .addImm(8)
34132           .addReg(IReg64)
34133           .addImm(0)
34134           .addReg(0);
34135       break;
34136     case MachineJumpTableInfo::EK_LabelDifference32: {
34137       Register OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
34138       Register OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
34139       Register TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
34140 
34141       // movl (BReg,IReg64,4), OReg
34142       BuildMI(DispContBB, DL, TII->get(X86::MOV32rm), OReg)
34143           .addReg(BReg)
34144           .addImm(4)
34145           .addReg(IReg64)
34146           .addImm(0)
34147           .addReg(0);
34148       // movsx OReg64, OReg
34149       BuildMI(DispContBB, DL, TII->get(X86::MOVSX64rr32), OReg64).addReg(OReg);
34150       // addq BReg, OReg64, TReg
34151       BuildMI(DispContBB, DL, TII->get(X86::ADD64rr), TReg)
34152           .addReg(OReg64)
34153           .addReg(BReg);
34154       // jmpq *TReg
34155       BuildMI(DispContBB, DL, TII->get(X86::JMP64r)).addReg(TReg);
34156       break;
34157     }
34158     default:
34159       llvm_unreachable("Unexpected jump table encoding");
34160     }
34161   } else {
34162     // jmpl *.LJTI0_0(,IReg,4)
34163     BuildMI(DispContBB, DL, TII->get(X86::JMP32m))
34164         .addReg(0)
34165         .addImm(4)
34166         .addReg(IReg)
34167         .addJumpTableIndex(MJTI)
34168         .addReg(0);
34169   }
34170 
34171   // Add the jump table entries as successors to the MBB.
34172   SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
34173   for (auto &LP : LPadList)
34174     if (SeenMBBs.insert(LP).second)
34175       DispContBB->addSuccessor(LP);
34176 
34177   // N.B. the order the invoke BBs are processed in doesn't matter here.
34178   SmallVector<MachineBasicBlock *, 64> MBBLPads;
34179   const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
34180   for (MachineBasicBlock *MBB : InvokeBBs) {
34181     // Remove the landing pad successor from the invoke block and replace it
34182     // with the new dispatch block.
34183     // Keep a copy of Successors since it's modified inside the loop.
34184     SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
34185                                                    MBB->succ_rend());
34186     // FIXME: Avoid quadratic complexity.
34187     for (auto MBBS : Successors) {
34188       if (MBBS->isEHPad()) {
34189         MBB->removeSuccessor(MBBS);
34190         MBBLPads.push_back(MBBS);
34191       }
34192     }
34193 
34194     MBB->addSuccessor(DispatchBB);
34195 
34196     // Find the invoke call and mark all of the callee-saved registers as
34197     // 'implicit defined' so that they're spilled. This prevents instructions
34198     // from being moved to before the EH block, where they would never be
34199     // executed.
34200     for (auto &II : reverse(*MBB)) {
34201       if (!II.isCall())
34202         continue;
34203 
34204       DenseMap<unsigned, bool> DefRegs;
34205       for (auto &MOp : II.operands())
34206         if (MOp.isReg())
34207           DefRegs[MOp.getReg()] = true;
34208 
34209       MachineInstrBuilder MIB(*MF, &II);
34210       for (unsigned RegIdx = 0; SavedRegs[RegIdx]; ++RegIdx) {
34211         unsigned Reg = SavedRegs[RegIdx];
34212         if (!DefRegs[Reg])
34213           MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
34214       }
34215 
34216       break;
34217     }
34218   }
34219 
34220   // Mark all former landing pads as non-landing pads.  The dispatch is the only
34221   // landing pad now.
34222   for (auto &LP : MBBLPads)
34223     LP->setIsEHPad(false);
34224 
34225   // The instruction is gone now.
34226   MI.eraseFromParent();
34227   return BB;
34228 }
34229 
34230 MachineBasicBlock *
34231 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
34232                                                MachineBasicBlock *BB) const {
34233   MachineFunction *MF = BB->getParent();
34234   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
34235   const DebugLoc &DL = MI.getDebugLoc();
34236 
34237   auto TMMImmToTMMReg = [](unsigned Imm) {
34238     assert (Imm < 8 && "Illegal tmm index");
34239     return X86::TMM0 + Imm;
34240   };
34241   switch (MI.getOpcode()) {
34242   default: llvm_unreachable("Unexpected instr type to insert");
34243   case X86::TLS_addr32:
34244   case X86::TLS_addr64:
34245   case X86::TLS_addrX32:
34246   case X86::TLS_base_addr32:
34247   case X86::TLS_base_addr64:
34248   case X86::TLS_base_addrX32:
34249     return EmitLoweredTLSAddr(MI, BB);
34250   case X86::INDIRECT_THUNK_CALL32:
34251   case X86::INDIRECT_THUNK_CALL64:
34252   case X86::INDIRECT_THUNK_TCRETURN32:
34253   case X86::INDIRECT_THUNK_TCRETURN64:
34254     return EmitLoweredIndirectThunk(MI, BB);
34255   case X86::CATCHRET:
34256     return EmitLoweredCatchRet(MI, BB);
34257   case X86::SEG_ALLOCA_32:
34258   case X86::SEG_ALLOCA_64:
34259     return EmitLoweredSegAlloca(MI, BB);
34260   case X86::PROBED_ALLOCA_32:
34261   case X86::PROBED_ALLOCA_64:
34262     return EmitLoweredProbedAlloca(MI, BB);
34263   case X86::TLSCall_32:
34264   case X86::TLSCall_64:
34265     return EmitLoweredTLSCall(MI, BB);
34266   case X86::CMOV_FR32:
34267   case X86::CMOV_FR32X:
34268   case X86::CMOV_FR64:
34269   case X86::CMOV_FR64X:
34270   case X86::CMOV_GR8:
34271   case X86::CMOV_GR16:
34272   case X86::CMOV_GR32:
34273   case X86::CMOV_RFP32:
34274   case X86::CMOV_RFP64:
34275   case X86::CMOV_RFP80:
34276   case X86::CMOV_VR64:
34277   case X86::CMOV_VR128:
34278   case X86::CMOV_VR128X:
34279   case X86::CMOV_VR256:
34280   case X86::CMOV_VR256X:
34281   case X86::CMOV_VR512:
34282   case X86::CMOV_VK1:
34283   case X86::CMOV_VK2:
34284   case X86::CMOV_VK4:
34285   case X86::CMOV_VK8:
34286   case X86::CMOV_VK16:
34287   case X86::CMOV_VK32:
34288   case X86::CMOV_VK64:
34289     return EmitLoweredSelect(MI, BB);
34290 
34291   case X86::RDFLAGS32:
34292   case X86::RDFLAGS64: {
34293     unsigned PushF =
34294         MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
34295     unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
34296     MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
34297     // Permit reads of the EFLAGS and DF registers without them being defined.
34298     // This intrinsic exists to read external processor state in flags, such as
34299     // the trap flag, interrupt flag, and direction flag, none of which are
34300     // modeled by the backend.
34301     assert(Push->getOperand(2).getReg() == X86::EFLAGS &&
34302            "Unexpected register in operand!");
34303     Push->getOperand(2).setIsUndef();
34304     assert(Push->getOperand(3).getReg() == X86::DF &&
34305            "Unexpected register in operand!");
34306     Push->getOperand(3).setIsUndef();
34307     BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());
34308 
34309     MI.eraseFromParent(); // The pseudo is gone now.
34310     return BB;
34311   }
34312 
34313   case X86::WRFLAGS32:
34314   case X86::WRFLAGS64: {
34315     unsigned Push =
34316         MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
34317     unsigned PopF =
34318         MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
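    // Write the requested value into EFLAGS by pushing it onto the stack and
    // popping it into the flags register.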
34319     BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
34320     BuildMI(*BB, MI, DL, TII->get(PopF));
34321 
34322     MI.eraseFromParent(); // The pseudo is gone now.
34323     return BB;
34324   }
34325 
34326   case X86::FP32_TO_INT16_IN_MEM:
34327   case X86::FP32_TO_INT32_IN_MEM:
34328   case X86::FP32_TO_INT64_IN_MEM:
34329   case X86::FP64_TO_INT16_IN_MEM:
34330   case X86::FP64_TO_INT32_IN_MEM:
34331   case X86::FP64_TO_INT64_IN_MEM:
34332   case X86::FP80_TO_INT16_IN_MEM:
34333   case X86::FP80_TO_INT32_IN_MEM:
34334   case X86::FP80_TO_INT64_IN_MEM: {
34335     // Change the floating point control register to use "round towards zero"
34336     // mode when truncating to an integer value.
34337     int OrigCWFrameIdx =
34338         MF->getFrameInfo().CreateStackObject(2, Align(2), false);
34339     addFrameReference(BuildMI(*BB, MI, DL,
34340                               TII->get(X86::FNSTCW16m)), OrigCWFrameIdx);
34341 
34342     // Load the old value of the control word...
34343     Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
34344     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOVZX32rm16), OldCW),
34345                       OrigCWFrameIdx);
34346 
34347     // OR 0b11 into bit 10 and 11. 0b11 is the encoding for round toward zero.
34348     Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
34349     BuildMI(*BB, MI, DL, TII->get(X86::OR32ri), NewCW)
34350       .addReg(OldCW, RegState::Kill).addImm(0xC00);
34351 
34352     // Extract to 16 bits.
34353     Register NewCW16 =
34354         MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
34355     BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), NewCW16)
34356       .addReg(NewCW, RegState::Kill, X86::sub_16bit);
34357 
34358     // Prepare memory for FLDCW.
34359     int NewCWFrameIdx =
34360         MF->getFrameInfo().CreateStackObject(2, Align(2), false);
34361     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)),
34362                       NewCWFrameIdx)
34363       .addReg(NewCW16, RegState::Kill);
34364 
34365     // Reload the modified control word now...
34366     addFrameReference(BuildMI(*BB, MI, DL,
34367                               TII->get(X86::FLDCW16m)), NewCWFrameIdx);
34368 
34369     // Get the X86 opcode to use.
34370     unsigned Opc;
34371     switch (MI.getOpcode()) {
34372     default: llvm_unreachable("illegal opcode!");
34373     case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
34374     case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
34375     case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
34376     case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
34377     case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
34378     case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
34379     case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
34380     case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
34381     case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
34382     }
34383 
34384     X86AddressMode AM = getAddressFromInstr(&MI, 0);
34385     addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
34386         .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
34387 
34388     // Reload the original control word now.
34389     addFrameReference(BuildMI(*BB, MI, DL,
34390                               TII->get(X86::FLDCW16m)), OrigCWFrameIdx);
34391 
34392     MI.eraseFromParent(); // The pseudo instruction is gone now.
34393     return BB;
34394   }
34395 
34396   // xbegin
34397   case X86::XBEGIN:
34398     return emitXBegin(MI, BB, Subtarget.getInstrInfo());
34399 
34400   case X86::VAARG_64:
34401   case X86::VAARG_X32:
34402     return EmitVAARGWithCustomInserter(MI, BB);
34403 
34404   case X86::EH_SjLj_SetJmp32:
34405   case X86::EH_SjLj_SetJmp64:
34406     return emitEHSjLjSetJmp(MI, BB);
34407 
34408   case X86::EH_SjLj_LongJmp32:
34409   case X86::EH_SjLj_LongJmp64:
34410     return emitEHSjLjLongJmp(MI, BB);
34411 
34412   case X86::Int_eh_sjlj_setup_dispatch:
34413     return EmitSjLjDispatchBlock(MI, BB);
34414 
34415   case TargetOpcode::STATEPOINT:
34416     // As an implementation detail, STATEPOINT shares the STACKMAP format at
34417     // this point in the process.  We diverge later.
34418     return emitPatchPoint(MI, BB);
34419 
34420   case TargetOpcode::STACKMAP:
34421   case TargetOpcode::PATCHPOINT:
34422     return emitPatchPoint(MI, BB);
34423 
34424   case TargetOpcode::PATCHABLE_EVENT_CALL:
34425   case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
34426     return BB;
34427 
34428   case X86::LCMPXCHG8B: {
34429     const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
34430     // In addition to the four E[ABCD] registers implied by its encoding, CMPXCHG8B
34431     // requires a memory operand. If the current architecture is i686 and the
34432     // current function needs a base pointer - which is ESI on i686 - the register
34433     // allocator would not be able to allocate registers for an address of the
34434     // form X(%reg, %reg, Y): there would never be enough unreserved registers
34435     // during regalloc (without the base pointer the only option would be
34436     // X(%edi, %esi, Y)).
34437     // We give the register allocator a hand by precomputing the address in
34438     // a new vreg using LEA.
34439 
34440     // If it is not i686 or there is no base pointer - nothing to do here.
34441     if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
34442       return BB;
34443 
34444     // Even though this code does not necessarily need the base pointer to
34445     // be ESI, we check for that. The reason: if this assert fails, something
34446     // has changed in the compiler's base pointer handling, and that most
34447     // probably has to be addressed here as well.
34448     assert(TRI->getBaseRegister() == X86::ESI &&
34449            "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
34450            "base pointer in mind");
34451 
34452     MachineRegisterInfo &MRI = MF->getRegInfo();
34453     MVT SPTy = getPointerTy(MF->getDataLayout());
34454     const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
34455     Register computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
34456 
34457     X86AddressMode AM = getAddressFromInstr(&MI, 0);
34458     // Regalloc does not need any help when the memory operand of CMPXCHG8B
34459     // does not use an index register.
34460     if (AM.IndexReg == X86::NoRegister)
34461       return BB;
34462 
34463     // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
34464     // four operand definitions that are E[ABCD] registers. We skip them and
34465     // then insert the LEA.
34466     MachineBasicBlock::reverse_iterator RMBBI(MI.getReverseIterator());
34467     while (RMBBI != BB->rend() && (RMBBI->definesRegister(X86::EAX) ||
34468                                    RMBBI->definesRegister(X86::EBX) ||
34469                                    RMBBI->definesRegister(X86::ECX) ||
34470                                    RMBBI->definesRegister(X86::EDX))) {
34471       ++RMBBI;
34472     }
34473     MachineBasicBlock::iterator MBBI(RMBBI);
34474     addFullAddress(
34475         BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);
34476 
34477     setDirectAddressInInstr(&MI, 0, computedAddrVReg);
34478 
34479     return BB;
34480   }
34481   case X86::LCMPXCHG16B_NO_RBX: {
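    // CMPXCHG16B implicitly uses RBX for the low half of the new value. If RBX
    // also happens to be the base pointer register, preserve it in a virtual
    // register and use the LCMPXCHG16B_SAVE_RBX pseudo; otherwise just copy the
    // operand into RBX.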
34482     const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
34483     Register BasePtr = TRI->getBaseRegister();
34484     if (TRI->hasBasePointer(*MF) &&
34485         (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
34486       if (!BB->isLiveIn(BasePtr))
34487         BB->addLiveIn(BasePtr);
34488       // Save RBX into a virtual register.
34489       Register SaveRBX =
34490           MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
34491       BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), SaveRBX)
34492           .addReg(X86::RBX);
34493       Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
34494       MachineInstrBuilder MIB =
34495           BuildMI(*BB, MI, DL, TII->get(X86::LCMPXCHG16B_SAVE_RBX), Dst);
34496       for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
34497         MIB.add(MI.getOperand(Idx));
34498       MIB.add(MI.getOperand(X86::AddrNumOperands));
34499       MIB.addReg(SaveRBX);
34500     } else {
34501       // Simple case, just copy the virtual register to RBX.
34502       BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::RBX)
34503           .add(MI.getOperand(X86::AddrNumOperands));
34504       MachineInstrBuilder MIB =
34505           BuildMI(*BB, MI, DL, TII->get(X86::LCMPXCHG16B));
34506       for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
34507         MIB.add(MI.getOperand(Idx));
34508     }
34509     MI.eraseFromParent();
34510     return BB;
34511   }
34512   case X86::MWAITX: {
34513     const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
34514     Register BasePtr = TRI->getBaseRegister();
34515     bool IsRBX = (BasePtr == X86::RBX || BasePtr == X86::EBX);
34516     // If there is no need to save the base pointer, we generate MWAITXrrr;
34517     // otherwise we generate the pseudo MWAITX_SAVE_RBX.
34518     if (!IsRBX || !TRI->hasBasePointer(*MF)) {
34519       BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::ECX)
34520           .addReg(MI.getOperand(0).getReg());
34521       BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EAX)
34522           .addReg(MI.getOperand(1).getReg());
34523       BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EBX)
34524           .addReg(MI.getOperand(2).getReg());
34525       BuildMI(*BB, MI, DL, TII->get(X86::MWAITXrrr));
34526       MI.eraseFromParent();
34527     } else {
34528       if (!BB->isLiveIn(BasePtr)) {
34529         BB->addLiveIn(BasePtr);
34530       }
34531       // Parameters can be copied into ECX and EAX but not EBX yet.
34532       BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::ECX)
34533           .addReg(MI.getOperand(0).getReg());
34534       BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EAX)
34535           .addReg(MI.getOperand(1).getReg());
34536       assert(Subtarget.is64Bit() && "Expected 64-bit mode!");
34537       // Save RBX into a virtual register.
34538       Register SaveRBX =
34539           MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
34540       BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), SaveRBX)
34541           .addReg(X86::RBX);
34542       // Generate mwaitx pseudo.
34543       Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
34544       BuildMI(*BB, MI, DL, TII->get(X86::MWAITX_SAVE_RBX))
34545           .addDef(Dst) // Destination tied in with SaveRBX.
34546           .addReg(MI.getOperand(2).getReg()) // input value of EBX.
34547           .addUse(SaveRBX);                  // Save of base pointer.
34548       MI.eraseFromParent();
34549     }
34550     return BB;
34551   }
34552   case TargetOpcode::PREALLOCATED_SETUP: {
34553     assert(Subtarget.is32Bit() && "preallocated only used in 32-bit");
34554     auto MFI = MF->getInfo<X86MachineFunctionInfo>();
34555     MFI->setHasPreallocatedCall(true);
34556     int64_t PreallocatedId = MI.getOperand(0).getImm();
34557     size_t StackAdjustment = MFI->getPreallocatedStackSize(PreallocatedId);
34558     assert(StackAdjustment != 0 && "0 stack adjustment");
34559     LLVM_DEBUG(dbgs() << "PREALLOCATED_SETUP stack adjustment "
34560                       << StackAdjustment << "\n");
34561     BuildMI(*BB, MI, DL, TII->get(X86::SUB32ri), X86::ESP)
34562         .addReg(X86::ESP)
34563         .addImm(StackAdjustment);
34564     MI.eraseFromParent();
34565     return BB;
34566   }
34567   case TargetOpcode::PREALLOCATED_ARG: {
34568     assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit");
34569     int64_t PreallocatedId = MI.getOperand(1).getImm();
34570     int64_t ArgIdx = MI.getOperand(2).getImm();
34571     auto MFI = MF->getInfo<X86MachineFunctionInfo>();
34572     size_t ArgOffset = MFI->getPreallocatedArgOffsets(PreallocatedId)[ArgIdx];
34573     LLVM_DEBUG(dbgs() << "PREALLOCATED_ARG arg index " << ArgIdx
34574                       << ", arg offset " << ArgOffset << "\n");
34575     // stack pointer + offset
34576     addRegOffset(
34577         BuildMI(*BB, MI, DL, TII->get(X86::LEA32r), MI.getOperand(0).getReg()),
34578         X86::ESP, false, ArgOffset);
34579     MI.eraseFromParent();
34580     return BB;
34581   }
34582   case X86::PTDPBSSD:
34583   case X86::PTDPBSUD:
34584   case X86::PTDPBUSD:
34585   case X86::PTDPBUUD:
34586   case X86::PTDPBF16PS: {
34587     unsigned Opc;
34588     switch (MI.getOpcode()) {
34589     case X86::PTDPBSSD: Opc = X86::TDPBSSD; break;
34590     case X86::PTDPBSUD: Opc = X86::TDPBSUD; break;
34591     case X86::PTDPBUSD: Opc = X86::TDPBUSD; break;
34592     case X86::PTDPBUUD: Opc = X86::TDPBUUD; break;
34593     case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break;
34594     }
34595 
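    // The tile operands of these pseudos are immediates in the range 0-7, which
    // are expected to map onto the TMM0-TMM7 physical registers via
    // TMMImmToTMMReg. The destination tile is also added below as an undef use
    // because the TDP* instructions both read and write it.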
34596     MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
34597     MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
34598     MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
34599     MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
34600     MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
34601 
34602     MI.eraseFromParent(); // The pseudo is gone now.
34603     return BB;
34604   }
34605   case X86::PTILEZERO: {
34606     unsigned Imm = MI.getOperand(0).getImm();
34607     BuildMI(*BB, MI, DL, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm));
34608     MI.eraseFromParent(); // The pseudo is gone now.
34609     return BB;
34610   }
34611   case X86::PTILELOADD:
34612   case X86::PTILELOADDT1:
34613   case X86::PTILESTORED: {
34614     unsigned Opc;
34615     switch (MI.getOpcode()) {
34616     case X86::PTILELOADD:   Opc = X86::TILELOADD;   break;
34617     case X86::PTILELOADDT1: Opc = X86::TILELOADDT1; break;
34618     case X86::PTILESTORED:  Opc = X86::TILESTORED;  break;
34619     }
34620 
34621     MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
34622     unsigned CurOp = 0;
34623     if (Opc != X86::TILESTORED)
34624       MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
34625                  RegState::Define);
34626 
34627     MIB.add(MI.getOperand(CurOp++)); // base
34628     MIB.add(MI.getOperand(CurOp++)); // scale
34629     MIB.add(MI.getOperand(CurOp++)); // index -- stride
34630     MIB.add(MI.getOperand(CurOp++)); // displacement
34631     MIB.add(MI.getOperand(CurOp++)); // segment
34632 
34633     if (Opc == X86::TILESTORED)
34634       MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
34635                  RegState::Undef);
34636 
34637     MI.eraseFromParent(); // The pseudo is gone now.
34638     return BB;
34639   }
34640   }
34641 }
34642 
34643 //===----------------------------------------------------------------------===//
34644 //                           X86 Optimization Hooks
34645 //===----------------------------------------------------------------------===//
34646 
34647 bool
34648 X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
34649                                                 const APInt &DemandedBits,
34650                                                 const APInt &DemandedElts,
34651                                                 TargetLoweringOpt &TLO) const {
34652   EVT VT = Op.getValueType();
34653   unsigned Opcode = Op.getOpcode();
34654   unsigned EltSize = VT.getScalarSizeInBits();
34655 
34656   if (VT.isVector()) {
34657     // If the constant's active bits are all sign bits, then we should extend
34658     // them across the entire constant so that it can act as a boolean constant
34659     // vector.
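    // Worked example: for a v4i32 OR where only the low 16 bits of each element
    // are demanded and an element constant is 0x0000FFFF, sign-extending from
    // the low 16 bits turns that element into 0xFFFFFFFF, an all-ones boolean.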
34660     auto NeedsSignExtension = [&](SDValue V, unsigned ActiveBits) {
34661       if (!ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
34662         return false;
34663       for (unsigned i = 0, e = V.getNumOperands(); i != e; ++i) {
34664         if (!DemandedElts[i] || V.getOperand(i).isUndef())
34665           continue;
34666         const APInt &Val = V.getConstantOperandAPInt(i);
34667         if (Val.getBitWidth() > Val.getNumSignBits() &&
34668             Val.trunc(ActiveBits).getNumSignBits() == ActiveBits)
34669           return true;
34670       }
34671       return false;
34672     };
34673     // For vectors - if we have a constant, then try to sign extend.
34674     // TODO: Handle AND/ANDN cases.
34675     unsigned ActiveBits = DemandedBits.getActiveBits();
34676     if (EltSize > ActiveBits && EltSize > 1 && isTypeLegal(VT) &&
34677         (Opcode == ISD::OR || Opcode == ISD::XOR) &&
34678         NeedsSignExtension(Op.getOperand(1), ActiveBits)) {
34679       EVT ExtSVT = EVT::getIntegerVT(*TLO.DAG.getContext(), ActiveBits);
34680       EVT ExtVT = EVT::getVectorVT(*TLO.DAG.getContext(), ExtSVT,
34681                                     VT.getVectorNumElements());
34682       SDValue NewC =
34683           TLO.DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(Op), VT,
34684                           Op.getOperand(1), TLO.DAG.getValueType(ExtVT));
34685       SDValue NewOp =
34686           TLO.DAG.getNode(Opcode, SDLoc(Op), VT, Op.getOperand(0), NewC);
34687       return TLO.CombineTo(Op, NewOp);
34688     }
34689     return false;
34690   }
34691 
34692   // Only optimize Ands to prevent shrinking a constant that could be
34693   // matched by movzx.
34694   if (Opcode != ISD::AND)
34695     return false;
34696 
34697   // Make sure the RHS really is a constant.
34698   ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
34699   if (!C)
34700     return false;
34701 
34702   const APInt &Mask = C->getAPIntValue();
34703 
34704   // Clear all non-demanded bits initially.
34705   APInt ShrunkMask = Mask & DemandedBits;
34706 
34707   // Find the width of the shrunk mask.
34708   unsigned Width = ShrunkMask.getActiveBits();
34709 
34710   // If the mask is all 0s there's nothing to do here.
34711   if (Width == 0)
34712     return false;
34713 
34714   // Find the next power of 2 width, rounding up to a byte.
34715   Width = PowerOf2Ceil(std::max(Width, 8U));
34716   // Truncate the width to size to handle illegal types.
34717   Width = std::min(Width, EltSize);
34718 
34719   // Calculate a possible zero extend mask for this constant.
34720   APInt ZeroExtendMask = APInt::getLowBitsSet(EltSize, Width);
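  // Worked example: Mask = 0x1FF with all nine active bits demanded rounds the
  // width up to 16, so ZeroExtendMask becomes 0xFFFF and the AND can then be
  // selected as a 16-bit zero extend (movzx).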
34721 
34722   // If we aren't changing the mask, just return true to keep it and prevent
34723   // the caller from optimizing.
34724   if (ZeroExtendMask == Mask)
34725     return true;
34726 
34727   // Make sure the new mask can be represented by a combination of mask bits
34728   // and non-demanded bits.
34729   if (!ZeroExtendMask.isSubsetOf(Mask | ~DemandedBits))
34730     return false;
34731 
34732   // Replace the constant with the zero extend mask.
34733   SDLoc DL(Op);
34734   SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);
34735   SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
34736   return TLO.CombineTo(Op, NewOp);
34737 }
34738 
34739 void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
34740                                                       KnownBits &Known,
34741                                                       const APInt &DemandedElts,
34742                                                       const SelectionDAG &DAG,
34743                                                       unsigned Depth) const {
34744   unsigned BitWidth = Known.getBitWidth();
34745   unsigned NumElts = DemandedElts.getBitWidth();
34746   unsigned Opc = Op.getOpcode();
34747   EVT VT = Op.getValueType();
34748   assert((Opc >= ISD::BUILTIN_OP_END ||
34749           Opc == ISD::INTRINSIC_WO_CHAIN ||
34750           Opc == ISD::INTRINSIC_W_CHAIN ||
34751           Opc == ISD::INTRINSIC_VOID) &&
34752          "Should use MaskedValueIsZero if you don't know whether Op"
34753          " is a target node!");
34754 
34755   Known.resetAll();
34756   switch (Opc) {
34757   default: break;
34758   case X86ISD::SETCC:
34759     Known.Zero.setBitsFrom(1);
34760     break;
34761   case X86ISD::MOVMSK: {
34762     unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
34763     Known.Zero.setBitsFrom(NumLoBits);
34764     break;
34765   }
34766   case X86ISD::PEXTRB:
34767   case X86ISD::PEXTRW: {
34768     SDValue Src = Op.getOperand(0);
34769     EVT SrcVT = Src.getValueType();
34770     APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
34771                                             Op.getConstantOperandVal(1));
34772     Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1);
34773     Known = Known.anyextOrTrunc(BitWidth);
34774     Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
34775     break;
34776   }
34777   case X86ISD::VSRAI:
34778   case X86ISD::VSHLI:
34779   case X86ISD::VSRLI: {
34780     unsigned ShAmt = Op.getConstantOperandVal(1);
34781     if (ShAmt >= VT.getScalarSizeInBits()) {
34782       Known.setAllZero();
34783       break;
34784     }
34785 
34786     Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
34787     if (Opc == X86ISD::VSHLI) {
34788       Known.Zero <<= ShAmt;
34789       Known.One <<= ShAmt;
34790       // Low bits are known zero.
34791       Known.Zero.setLowBits(ShAmt);
34792     } else if (Opc == X86ISD::VSRLI) {
34793       Known.Zero.lshrInPlace(ShAmt);
34794       Known.One.lshrInPlace(ShAmt);
34795       // High bits are known zero.
34796       Known.Zero.setHighBits(ShAmt);
34797     } else {
34798       Known.Zero.ashrInPlace(ShAmt);
34799       Known.One.ashrInPlace(ShAmt);
34800     }
34801     break;
34802   }
34803   case X86ISD::PACKUS: {
34804     // PACKUS is just a truncation if the upper half is zero.
34805     APInt DemandedLHS, DemandedRHS;
34806     getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
34807 
34808     Known.One = APInt::getAllOnesValue(BitWidth * 2);
34809     Known.Zero = APInt::getAllOnesValue(BitWidth * 2);
34810 
34811     KnownBits Known2;
34812     if (!!DemandedLHS) {
34813       Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedLHS, Depth + 1);
34814       Known = KnownBits::commonBits(Known, Known2);
34815     }
34816     if (!!DemandedRHS) {
34817       Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedRHS, Depth + 1);
34818       Known = KnownBits::commonBits(Known, Known2);
34819     }
34820 
34821     if (Known.countMinLeadingZeros() < BitWidth)
34822       Known.resetAll();
34823     Known = Known.trunc(BitWidth);
34824     break;
34825   }
34826   case X86ISD::VBROADCAST: {
34827     SDValue Src = Op.getOperand(0);
34828     if (!Src.getSimpleValueType().isVector()) {
34829       Known = DAG.computeKnownBits(Src, Depth + 1);
34830       return;
34831     }
34832     break;
34833   }
34834   case X86ISD::ANDNP: {
34835     KnownBits Known2;
34836     Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
34837     Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
34838 
34839     // ANDNP = (~X & Y);
34840     Known.One &= Known2.Zero;
34841     Known.Zero |= Known2.One;
34842     break;
34843   }
34844   case X86ISD::FOR: {
34845     KnownBits Known2;
34846     Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
34847     Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
34848 
34849     Known |= Known2;
34850     break;
34851   }
34852   case X86ISD::PSADBW: {
34853     assert(VT.getScalarType() == MVT::i64 &&
34854            Op.getOperand(0).getValueType().getScalarType() == MVT::i8 &&
34855            "Unexpected PSADBW types");
34856 
34857     // PSADBW - fills low 16 bits and zeros upper 48 bits of each i64 result.
34858     Known.Zero.setBitsFrom(16);
34859     break;
34860   }
34861   case X86ISD::PMULUDQ: {
34862     KnownBits Known2;
34863     Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
34864     Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
34865 
34866     Known = Known.trunc(BitWidth / 2).zext(BitWidth);
34867     Known2 = Known2.trunc(BitWidth / 2).zext(BitWidth);
34868     Known = KnownBits::mul(Known, Known2);
34869     break;
34870   }
34871   case X86ISD::CMOV: {
34872     Known = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
34873     // If we don't know any bits, early out.
34874     if (Known.isUnknown())
34875       break;
34876     KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
34877 
34878     // Only known if known in both the LHS and RHS.
34879     Known = KnownBits::commonBits(Known, Known2);
34880     break;
34881   }
34882   case X86ISD::BEXTR:
34883   case X86ISD::BEXTRI: {
34884     SDValue Op0 = Op.getOperand(0);
34885     SDValue Op1 = Op.getOperand(1);
34886 
34887     if (auto* Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
34888       unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
34889       unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
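      // E.g. a control value of 0x0404 extracts a 4-bit field starting at bit 4.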
34890 
34891       // If the length is 0, the result is 0.
34892       if (Length == 0) {
34893         Known.setAllZero();
34894         break;
34895       }
34896 
34897       if ((Shift + Length) <= BitWidth) {
34898         Known = DAG.computeKnownBits(Op0, Depth + 1);
34899         Known = Known.extractBits(Length, Shift);
34900         Known = Known.zextOrTrunc(BitWidth);
34901       }
34902     }
34903     break;
34904   }
34905   case X86ISD::PDEP: {
34906     KnownBits Known2;
34907     Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
34908     Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
34909     // Zeros are retained from the mask operand, but ones are not.
34910     Known.One.clearAllBits();
34911     // The result will have at least as many trailing zeros as the non-mask
34912     // operand since bits can only map to the same or higher bit position.
34913     Known.Zero.setLowBits(Known2.countMinTrailingZeros());
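    // E.g. if the source operand is known to be a multiple of 16, its lowest
    // potentially-set bit (bit 4) can only be deposited at bit 4 or higher, so
    // the result is also a multiple of 16.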
34914     break;
34915   }
34916   case X86ISD::PEXT: {
34917     Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
34918     // The result has at least as many leading zeros as there are zero bits in the mask.
34919     unsigned Count = Known.Zero.countPopulation();
34920     Known.Zero = APInt::getHighBitsSet(BitWidth, Count);
34921     Known.One.clearAllBits();
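    // E.g. with a mask known to be 0x0000FF00, at most eight bits are packed
    // into the low byte, so the upper 24 bits of a 32-bit result are known zero.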
34922     break;
34923   }
34924   case X86ISD::VTRUNC:
34925   case X86ISD::VTRUNCS:
34926   case X86ISD::VTRUNCUS:
34927   case X86ISD::CVTSI2P:
34928   case X86ISD::CVTUI2P:
34929   case X86ISD::CVTP2SI:
34930   case X86ISD::CVTP2UI:
34931   case X86ISD::MCVTP2SI:
34932   case X86ISD::MCVTP2UI:
34933   case X86ISD::CVTTP2SI:
34934   case X86ISD::CVTTP2UI:
34935   case X86ISD::MCVTTP2SI:
34936   case X86ISD::MCVTTP2UI:
34937   case X86ISD::MCVTSI2P:
34938   case X86ISD::MCVTUI2P:
34939   case X86ISD::VFPROUND:
34940   case X86ISD::VMFPROUND:
34941   case X86ISD::CVTPS2PH:
34942   case X86ISD::MCVTPS2PH: {
34943     // Truncations/Conversions - upper elements are known zero.
34944     EVT SrcVT = Op.getOperand(0).getValueType();
34945     if (SrcVT.isVector()) {
34946       unsigned NumSrcElts = SrcVT.getVectorNumElements();
34947       if (NumElts > NumSrcElts &&
34948           DemandedElts.countTrailingZeros() >= NumSrcElts)
34949         Known.setAllZero();
34950     }
34951     break;
34952   }
34953   case X86ISD::STRICT_CVTTP2SI:
34954   case X86ISD::STRICT_CVTTP2UI:
34955   case X86ISD::STRICT_CVTSI2P:
34956   case X86ISD::STRICT_CVTUI2P:
34957   case X86ISD::STRICT_VFPROUND:
34958   case X86ISD::STRICT_CVTPS2PH: {
34959     // Strict Conversions - upper elements are known zero.
34960     EVT SrcVT = Op.getOperand(1).getValueType();
34961     if (SrcVT.isVector()) {
34962       unsigned NumSrcElts = SrcVT.getVectorNumElements();
34963       if (NumElts > NumSrcElts &&
34964           DemandedElts.countTrailingZeros() >= NumSrcElts)
34965         Known.setAllZero();
34966     }
34967     break;
34968   }
34969   case X86ISD::MOVQ2DQ: {
34970     // Move from MMX to XMM. Upper half of XMM should be 0.
34971     if (DemandedElts.countTrailingZeros() >= (NumElts / 2))
34972       Known.setAllZero();
34973     break;
34974   }
34975   }
34976 
34977   // Handle target shuffles.
34978   // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
34979   if (isTargetShuffle(Opc)) {
34980     SmallVector<int, 64> Mask;
34981     SmallVector<SDValue, 2> Ops;
34982     if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask)) {
34983       unsigned NumOps = Ops.size();
34984       unsigned NumElts = VT.getVectorNumElements();
34985       if (Mask.size() == NumElts) {
34986         SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
34987         Known.Zero.setAllBits(); Known.One.setAllBits();
34988         for (unsigned i = 0; i != NumElts; ++i) {
34989           if (!DemandedElts[i])
34990             continue;
34991           int M = Mask[i];
34992           if (M == SM_SentinelUndef) {
34993             // For UNDEF elements, we don't know anything about the common state
34994             // of the shuffle result.
34995             Known.resetAll();
34996             break;
34997           }
34998           if (M == SM_SentinelZero) {
34999             Known.One.clearAllBits();
35000             continue;
35001           }
35002           assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
35003                  "Shuffle index out of range");
35004 
35005           unsigned OpIdx = (unsigned)M / NumElts;
35006           unsigned EltIdx = (unsigned)M % NumElts;
35007           if (Ops[OpIdx].getValueType() != VT) {
35008             // TODO - handle target shuffle ops with different value types.
35009             Known.resetAll();
35010             break;
35011           }
35012           DemandedOps[OpIdx].setBit(EltIdx);
35013         }
35014         // Known bits are the values that are shared by every demanded element.
35015         for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {
35016           if (!DemandedOps[i])
35017             continue;
35018           KnownBits Known2 =
35019               DAG.computeKnownBits(Ops[i], DemandedOps[i], Depth + 1);
35020           Known = KnownBits::commonBits(Known, Known2);
35021         }
35022       }
35023     }
35024   }
35025 }
35026 
35027 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
35028     SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
35029     unsigned Depth) const {
35030   EVT VT = Op.getValueType();
35031   unsigned VTBits = VT.getScalarSizeInBits();
35032   unsigned Opcode = Op.getOpcode();
35033   switch (Opcode) {
35034   case X86ISD::SETCC_CARRY:
35035     // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
35036     return VTBits;
35037 
35038   case X86ISD::VTRUNC: {
35039     SDValue Src = Op.getOperand(0);
35040     MVT SrcVT = Src.getSimpleValueType();
35041     unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
35042     assert(VTBits < NumSrcBits && "Illegal truncation input type");
35043     APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
35044     unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
35045     if (Tmp > (NumSrcBits - VTBits))
35046       return Tmp - (NumSrcBits - VTBits);
35047     return 1;
35048   }
35049 
35050   case X86ISD::PACKSS: {
35051     // PACKSS is just a truncation if the sign bits extend to the packed size.
35052     APInt DemandedLHS, DemandedRHS;
35053     getPackDemandedElts(Op.getValueType(), DemandedElts, DemandedLHS,
35054                         DemandedRHS);
35055 
35056     unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
35057     unsigned Tmp0 = SrcBits, Tmp1 = SrcBits;
35058     if (!!DemandedLHS)
35059       Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), DemandedLHS, Depth + 1);
35060     if (!!DemandedRHS)
35061       Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), DemandedRHS, Depth + 1);
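    // E.g. when packing i32 to i16, inputs with at least 20 sign bits produce a
    // result with at least 20 - 16 = 4 known sign bits.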
35062     unsigned Tmp = std::min(Tmp0, Tmp1);
35063     if (Tmp > (SrcBits - VTBits))
35064       return Tmp - (SrcBits - VTBits);
35065     return 1;
35066   }
35067 
35068   case X86ISD::VBROADCAST: {
35069     SDValue Src = Op.getOperand(0);
35070     if (!Src.getSimpleValueType().isVector())
35071       return DAG.ComputeNumSignBits(Src, Depth + 1);
35072     break;
35073   }
35074 
35075   case X86ISD::VSHLI: {
35076     SDValue Src = Op.getOperand(0);
35077     const APInt &ShiftVal = Op.getConstantOperandAPInt(1);
35078     if (ShiftVal.uge(VTBits))
35079       return VTBits; // Shifted all bits out --> zero.
35080     unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
35081     if (ShiftVal.uge(Tmp))
35082       return 1; // Shifted all sign bits out --> unknown.
35083     return Tmp - ShiftVal.getZExtValue();
35084   }
35085 
35086   case X86ISD::VSRAI: {
35087     SDValue Src = Op.getOperand(0);
35088     APInt ShiftVal = Op.getConstantOperandAPInt(1);
35089     if (ShiftVal.uge(VTBits - 1))
35090       return VTBits; // Sign splat.
35091     unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
35092     ShiftVal += Tmp;
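    // E.g. an input with 5 known sign bits shifted right arithmetically by 3 has
    // 8 known sign bits, capped at the element width.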
35093     return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
35094   }
35095 
35096   case X86ISD::FSETCC:
35097     // cmpss/cmpsd return zero/all-bits result values in the bottom element.
35098     if (VT == MVT::f32 || VT == MVT::f64 ||
35099         ((VT == MVT::v4f32 || VT == MVT::v2f64) && DemandedElts == 1))
35100       return VTBits;
35101     break;
35102 
35103   case X86ISD::PCMPGT:
35104   case X86ISD::PCMPEQ:
35105   case X86ISD::CMPP:
35106   case X86ISD::VPCOM:
35107   case X86ISD::VPCOMU:
35108     // Vector compares return zero/all-bits result values.
35109     return VTBits;
35110 
35111   case X86ISD::ANDNP: {
35112     unsigned Tmp0 =
35113         DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
35114     if (Tmp0 == 1) return 1; // Early out.
35115     unsigned Tmp1 =
35116         DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
35117     return std::min(Tmp0, Tmp1);
35118   }
35119 
35120   case X86ISD::CMOV: {
35121     unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
35122     if (Tmp0 == 1) return 1;  // Early out.
35123     unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
35124     return std::min(Tmp0, Tmp1);
35125   }
35126   }
35127 
35128   // Handle target shuffles.
35129   // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
35130   if (isTargetShuffle(Opcode)) {
35131     SmallVector<int, 64> Mask;
35132     SmallVector<SDValue, 2> Ops;
35133     if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask)) {
35134       unsigned NumOps = Ops.size();
35135       unsigned NumElts = VT.getVectorNumElements();
35136       if (Mask.size() == NumElts) {
35137         SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
35138         for (unsigned i = 0; i != NumElts; ++i) {
35139           if (!DemandedElts[i])
35140             continue;
35141           int M = Mask[i];
35142           if (M == SM_SentinelUndef) {
35143             // For UNDEF elements, we don't know anything about the common state
35144             // of the shuffle result.
35145             return 1;
35146           } else if (M == SM_SentinelZero) {
35147             // Zero = all sign bits.
35148             continue;
35149           }
35150           assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
35151                  "Shuffle index out of range");
35152 
35153           unsigned OpIdx = (unsigned)M / NumElts;
35154           unsigned EltIdx = (unsigned)M % NumElts;
35155           if (Ops[OpIdx].getValueType() != VT) {
35156             // TODO - handle target shuffle ops with different value types.
35157             return 1;
35158           }
35159           DemandedOps[OpIdx].setBit(EltIdx);
35160         }
35161         unsigned Tmp0 = VTBits;
35162         for (unsigned i = 0; i != NumOps && Tmp0 > 1; ++i) {
35163           if (!DemandedOps[i])
35164             continue;
35165           unsigned Tmp1 =
35166               DAG.ComputeNumSignBits(Ops[i], DemandedOps[i], Depth + 1);
35167           Tmp0 = std::min(Tmp0, Tmp1);
35168         }
35169         return Tmp0;
35170       }
35171     }
35172   }
35173 
35174   // Fallback case.
35175   return 1;
35176 }
35177 
35178 SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
35179   if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
35180     return N->getOperand(0);
35181   return N;
35182 }
35183 
35184 // Helper to look for a normal load that can be narrowed into a vzload with the
35185 // specified VT and memory VT. Returns SDValue() on failure.
35186 static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT,
35187                                   SelectionDAG &DAG) {
35188   // Can't if the load is volatile or atomic.
35189   if (!LN->isSimple())
35190     return SDValue();
35191 
35192   SDVTList Tys = DAG.getVTList(VT, MVT::Other);
35193   SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
35194   return DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, SDLoc(LN), Tys, Ops, MemVT,
35195                                  LN->getPointerInfo(), LN->getOriginalAlign(),
35196                                  LN->getMemOperand()->getFlags());
35197 }
35198 
35199 // Attempt to match a combined shuffle mask against supported unary shuffle
35200 // instructions.
35201 // TODO: Investigate sharing more of this with shuffle lowering.
35202 static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
35203                               bool AllowFloatDomain, bool AllowIntDomain,
35204                               SDValue &V1, const SDLoc &DL, SelectionDAG &DAG,
35205                               const X86Subtarget &Subtarget, unsigned &Shuffle,
35206                               MVT &SrcVT, MVT &DstVT) {
35207   unsigned NumMaskElts = Mask.size();
35208   unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
35209 
35210   // Match against a VZEXT_MOVL vXi32 zero-extending instruction.
35211   if (MaskEltSize == 32 && Mask[0] == 0) {
35212     if (isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) {
35213       Shuffle = X86ISD::VZEXT_MOVL;
35214       SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
35215       return true;
35216     }
35217     if (V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
35218         isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
35219       Shuffle = X86ISD::VZEXT_MOVL;
35220       SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
35221       return true;
35222     }
35223   }
35224 
35225   // Match against an ANY/ZERO_EXTEND_VECTOR_INREG instruction.
35226   // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
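  // E.g. a v8i16 mask of <0,Z,1,Z,2,Z,3,Z> (Z = undef/zero) matches a zero
  // extension of the low four i16 elements to v4i32.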
35227   if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
35228                          (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
35229     unsigned MaxScale = 64 / MaskEltSize;
35230     for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
35231       bool MatchAny = true;
35232       bool MatchZero = true;
35233       unsigned NumDstElts = NumMaskElts / Scale;
35234       for (unsigned i = 0; i != NumDstElts && (MatchAny || MatchZero); ++i) {
35235         if (!isUndefOrEqual(Mask[i * Scale], (int)i)) {
35236           MatchAny = MatchZero = false;
35237           break;
35238         }
35239         MatchAny &= isUndefInRange(Mask, (i * Scale) + 1, Scale - 1);
35240         MatchZero &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
35241       }
35242       if (MatchAny || MatchZero) {
35243         assert(MatchZero && "Failed to match zext but matched aext?");
35244         unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
35245         MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType() :
35246                                             MVT::getIntegerVT(MaskEltSize);
35247         SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
35248 
35249         if (SrcVT.getSizeInBits() != MaskVT.getSizeInBits())
35250           V1 = extractSubVector(V1, 0, DAG, DL, SrcSize);
35251 
35252         Shuffle = unsigned(MatchAny ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND);
35253         if (SrcVT.getVectorNumElements() != NumDstElts)
35254           Shuffle = getOpcode_EXTEND_VECTOR_INREG(Shuffle);
35255 
35256         DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
35257         DstVT = MVT::getVectorVT(DstVT, NumDstElts);
35258         return true;
35259       }
35260     }
35261   }
35262 
35263   // Match against a VZEXT_MOVL instruction; SSE1 only supports 32-bits (MOVSS).
35264   if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
35265       isUndefOrEqual(Mask[0], 0) &&
35266       isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
35267     Shuffle = X86ISD::VZEXT_MOVL;
35268     SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
35269     return true;
35270   }
35271 
35272   // Check if we have SSE3, which will let us use MOVDDUP etc. These
35273   // instructions are no slower than UNPCKLPD but have the option to
35274   // fold the input operand into even an unaligned memory load.
35275   if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
35276     if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, V1)) {
35277       Shuffle = X86ISD::MOVDDUP;
35278       SrcVT = DstVT = MVT::v2f64;
35279       return true;
35280     }
35281     if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, V1)) {
35282       Shuffle = X86ISD::MOVSLDUP;
35283       SrcVT = DstVT = MVT::v4f32;
35284       return true;
35285     }
35286     if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3}, V1)) {
35287       Shuffle = X86ISD::MOVSHDUP;
35288       SrcVT = DstVT = MVT::v4f32;
35289       return true;
35290     }
35291   }
35292 
35293   if (MaskVT.is256BitVector() && AllowFloatDomain) {
35294     assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
35295     if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, V1)) {
35296       Shuffle = X86ISD::MOVDDUP;
35297       SrcVT = DstVT = MVT::v4f64;
35298       return true;
35299     }
35300     if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1)) {
35301       Shuffle = X86ISD::MOVSLDUP;
35302       SrcVT = DstVT = MVT::v8f32;
35303       return true;
35304     }
35305     if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3, 5, 5, 7, 7}, V1)) {
35306       Shuffle = X86ISD::MOVSHDUP;
35307       SrcVT = DstVT = MVT::v8f32;
35308       return true;
35309     }
35310   }
35311 
35312   if (MaskVT.is512BitVector() && AllowFloatDomain) {
35313     assert(Subtarget.hasAVX512() &&
35314            "AVX512 required for 512-bit vector shuffles");
35315     if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1)) {
35316       Shuffle = X86ISD::MOVDDUP;
35317       SrcVT = DstVT = MVT::v8f64;
35318       return true;
35319     }
35320     if (isTargetShuffleEquivalent(
35321             MaskVT, Mask,
35322             {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}, V1)) {
35323       Shuffle = X86ISD::MOVSLDUP;
35324       SrcVT = DstVT = MVT::v16f32;
35325       return true;
35326     }
35327     if (isTargetShuffleEquivalent(
35328             MaskVT, Mask,
35329             {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}, V1)) {
35330       Shuffle = X86ISD::MOVSHDUP;
35331       SrcVT = DstVT = MVT::v16f32;
35332       return true;
35333     }
35334   }
35335 
35336   return false;
35337 }
35338 
35339 // Attempt to match a combined shuffle mask against supported unary immediate
35340 // permute instructions.
35341 // TODO: Investigate sharing more of this with shuffle lowering.
35342 static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
35343                                      const APInt &Zeroable,
35344                                      bool AllowFloatDomain, bool AllowIntDomain,
35345                                      const X86Subtarget &Subtarget,
35346                                      unsigned &Shuffle, MVT &ShuffleVT,
35347                                      unsigned &PermuteImm) {
35348   unsigned NumMaskElts = Mask.size();
35349   unsigned InputSizeInBits = MaskVT.getSizeInBits();
35350   unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
35351   MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
35352   bool ContainsZeros = isAnyZero(Mask);
35353 
35354   // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
35355   if (!ContainsZeros && MaskScalarSizeInBits == 64) {
35356     // Check for lane crossing permutes.
35357     if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
35358       // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
35359       if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
35360         Shuffle = X86ISD::VPERMI;
35361         ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
35362         PermuteImm = getV4X86ShuffleImm(Mask);
35363         return true;
35364       }
35365       if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
35366         SmallVector<int, 4> RepeatedMask;
35367         if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
35368           Shuffle = X86ISD::VPERMI;
35369           ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
35370           PermuteImm = getV4X86ShuffleImm(RepeatedMask);
35371           return true;
35372         }
35373       }
35374     } else if (AllowFloatDomain && Subtarget.hasAVX()) {
35375       // VPERMILPD can permute with a non-repeating shuffle.
35376       Shuffle = X86ISD::VPERMILPI;
35377       ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
35378       PermuteImm = 0;
35379       for (int i = 0, e = Mask.size(); i != e; ++i) {
35380         int M = Mask[i];
35381         if (M == SM_SentinelUndef)
35382           continue;
35383         assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
35384         PermuteImm |= (M & 1) << i;
35385       }
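      // E.g. a v4f64 mask of <1,0,3,2> yields PermuteImm 0b0101.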
35386       return true;
35387     }
35388   }
35389 
35390   // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
35391   // AVX introduced the VPERMILPD/VPERMILPS float permutes; before then we
35392   // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
35393   if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
35394       !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
35395     SmallVector<int, 4> RepeatedMask;
35396     if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
35397       // Narrow the repeated mask to create 32-bit element permutes.
35398       SmallVector<int, 4> WordMask = RepeatedMask;
35399       if (MaskScalarSizeInBits == 64)
35400         narrowShuffleMaskElts(2, RepeatedMask, WordMask);
35401 
35402       Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
35403       ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
35404       ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
35405       PermuteImm = getV4X86ShuffleImm(WordMask);
35406       return true;
35407     }
35408   }
35409 
35410   // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
35411   if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16 &&
35412       ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
35413        (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
35414        (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
35415     SmallVector<int, 4> RepeatedMask;
35416     if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
35417       ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);
35418       ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4);
35419 
35420       // PSHUFLW: permute lower 4 elements only.
35421       if (isUndefOrInRange(LoMask, 0, 4) &&
35422           isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
35423         Shuffle = X86ISD::PSHUFLW;
35424         ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
35425         PermuteImm = getV4X86ShuffleImm(LoMask);
35426         return true;
35427       }
35428 
35429       // PSHUFHW: permute upper 4 elements only.
35430       if (isUndefOrInRange(HiMask, 4, 8) &&
35431           isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
35432         // Offset the HiMask so that we can create the shuffle immediate.
35433         int OffsetHiMask[4];
35434         for (int i = 0; i != 4; ++i)
35435           OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
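        // E.g. a HiMask of <5,4,7,6> becomes <1,0,3,2>.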
35436 
35437         Shuffle = X86ISD::PSHUFHW;
35438         ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
35439         PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
35440         return true;
35441       }
35442     }
35443   }
35444 
35445   // Attempt to match against byte/bit shifts.
35446   if (AllowIntDomain &&
35447       ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
35448        (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
35449        (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
35450     int ShiftAmt = matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits,
35451                                        Mask, 0, Zeroable, Subtarget);
35452     if (0 < ShiftAmt && (!ShuffleVT.is512BitVector() || Subtarget.hasBWI() ||
35453                          32 <= ShuffleVT.getScalarSizeInBits())) {
35454       PermuteImm = (unsigned)ShiftAmt;
35455       return true;
35456     }
35457   }
35458 
35459   // Attempt to match against bit rotates.
35460   if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits < 64 &&
35461       ((MaskVT.is128BitVector() && Subtarget.hasXOP()) ||
35462        Subtarget.hasAVX512())) {
35463     int RotateAmt = matchShuffleAsBitRotate(ShuffleVT, MaskScalarSizeInBits,
35464                                             Subtarget, Mask);
35465     if (0 < RotateAmt) {
35466       Shuffle = X86ISD::VROTLI;
35467       PermuteImm = (unsigned)RotateAmt;
35468       return true;
35469     }
35470   }
35471 
35472   return false;
35473 }
35474 
35475 // Attempt to match a combined unary shuffle mask against supported binary
35476 // shuffle instructions.
35477 // TODO: Investigate sharing more of this with shuffle lowering.
35478 static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
35479                                bool AllowFloatDomain, bool AllowIntDomain,
35480                                SDValue &V1, SDValue &V2, const SDLoc &DL,
35481                                SelectionDAG &DAG, const X86Subtarget &Subtarget,
35482                                unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
35483                                bool IsUnary) {
35484   unsigned NumMaskElts = Mask.size();
35485   unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
35486 
35487   if (MaskVT.is128BitVector()) {
35488     if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}) && AllowFloatDomain) {
35489       V2 = V1;
35490       V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
35491       Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS;
35492       SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
35493       return true;
35494     }
35495     if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1}) && AllowFloatDomain) {
35496       V2 = V1;
35497       Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
35498       SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
35499       return true;
35500     }
35501     if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 3}) &&
35502         Subtarget.hasSSE2() && (AllowFloatDomain || !Subtarget.hasSSE41())) {
35503       std::swap(V1, V2);
35504       Shuffle = X86ISD::MOVSD;
35505       SrcVT = DstVT = MVT::v2f64;
35506       return true;
35507     }
35508     if (isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}) &&
35509         (AllowFloatDomain || !Subtarget.hasSSE41())) {
35510       Shuffle = X86ISD::MOVSS;
35511       SrcVT = DstVT = MVT::v4f32;
35512       return true;
35513     }
35514   }
35515 
35516   // Attempt to match against either a unary or binary PACKSS/PACKUS shuffle.
35517   if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) ||
35518       ((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) ||
35519       ((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) {
35520     if (matchShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
35521                              Subtarget)) {
35522       DstVT = MaskVT;
35523       return true;
35524     }
35525   }
35526 
35527   // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
35528   if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
35529       (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
35530       (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
35531       (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
35532       (MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
35533     if (matchShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, DAG,
35534                               Subtarget)) {
35535       SrcVT = DstVT = MaskVT;
35536       if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
35537         SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
35538       return true;
35539     }
35540   }
35541 
35542   // Attempt to match against an OR if we're performing a blend shuffle and the
35543   // non-blended source element is zero in each case.
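  // E.g. for a 4-element mask <0,5,2,7>, elements 0 and 2 come from V1 and
  // elements 1 and 3 from V2; the OR is only safe if V1's elements 1 and 3 and
  // V2's elements 0 and 2 are known to be zero.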
35544   if ((EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
35545       (EltSizeInBits % V2.getScalarValueSizeInBits()) == 0) {
35546     bool IsBlend = true;
35547     unsigned NumV1Elts = V1.getValueType().getVectorNumElements();
35548     unsigned NumV2Elts = V2.getValueType().getVectorNumElements();
35549     unsigned Scale1 = NumV1Elts / NumMaskElts;
35550     unsigned Scale2 = NumV2Elts / NumMaskElts;
35551     APInt DemandedZeroV1 = APInt::getNullValue(NumV1Elts);
35552     APInt DemandedZeroV2 = APInt::getNullValue(NumV2Elts);
35553     for (unsigned i = 0; i != NumMaskElts; ++i) {
35554       int M = Mask[i];
35555       if (M == SM_SentinelUndef)
35556         continue;
35557       if (M == SM_SentinelZero) {
35558         DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
35559         DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
35560         continue;
35561       }
35562       if (M == (int)i) {
35563         DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
35564         continue;
35565       }
35566       if (M == (int)(i + NumMaskElts)) {
35567         DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
35568         continue;
35569       }
35570       IsBlend = false;
35571       break;
35572     }
35573     if (IsBlend &&
35574         DAG.computeKnownBits(V1, DemandedZeroV1).isZero() &&
35575         DAG.computeKnownBits(V2, DemandedZeroV2).isZero()) {
35576       Shuffle = ISD::OR;
35577       SrcVT = DstVT = MaskVT.changeTypeToInteger();
35578       return true;
35579     }
35580   }
35581 
35582   return false;
35583 }
35584 
35585 static bool matchBinaryPermuteShuffle(
35586     MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
35587     bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
35588     const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
35589     unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
35590   unsigned NumMaskElts = Mask.size();
35591   unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
35592 
35593   // Attempt to match against VALIGND/VALIGNQ rotate.
35594   if (AllowIntDomain && (EltSizeInBits == 64 || EltSizeInBits == 32) &&
35595       ((MaskVT.is128BitVector() && Subtarget.hasVLX()) ||
35596        (MaskVT.is256BitVector() && Subtarget.hasVLX()) ||
35597        (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
35598     if (!isAnyZero(Mask)) {
35599       int Rotation = matchShuffleAsElementRotate(V1, V2, Mask);
35600       if (0 < Rotation) {
35601         Shuffle = X86ISD::VALIGN;
35602         if (EltSizeInBits == 64)
35603           ShuffleVT = MVT::getVectorVT(MVT::i64, MaskVT.getSizeInBits() / 64);
35604         else
35605           ShuffleVT = MVT::getVectorVT(MVT::i32, MaskVT.getSizeInBits() / 32);
35606         PermuteImm = Rotation;
35607         return true;
35608       }
35609     }
35610   }
35611 
35612   // Attempt to match against PALIGNR byte rotate.
35613   if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
35614                          (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
35615                          (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
35616     int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask);
35617     if (0 < ByteRotation) {
35618       Shuffle = X86ISD::PALIGNR;
35619       ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
35620       PermuteImm = ByteRotation;
35621       return true;
35622     }
35623   }
35624 
35625   // Attempt to combine to X86ISD::BLENDI.
35626   if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
35627                             (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
35628       (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
35629     uint64_t BlendMask = 0;
35630     bool ForceV1Zero = false, ForceV2Zero = false;
35631     SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end());
35632     if (matchShuffleAsBlend(V1, V2, TargetMask, Zeroable, ForceV1Zero,
35633                             ForceV2Zero, BlendMask)) {
35634       if (MaskVT == MVT::v16i16) {
35635         // We can only use v16i16 PBLENDW if the lanes are repeated.
35636         SmallVector<int, 8> RepeatedMask;
35637         if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
35638                                         RepeatedMask)) {
35639           assert(RepeatedMask.size() == 8 &&
35640                  "Repeated mask size doesn't match!");
35641           PermuteImm = 0;
35642           for (int i = 0; i < 8; ++i)
35643             if (RepeatedMask[i] >= 8)
35644               PermuteImm |= 1 << i;
35645           V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
35646           V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
35647           Shuffle = X86ISD::BLENDI;
35648           ShuffleVT = MaskVT;
35649           return true;
35650         }
35651       } else {
35652         V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
35653         V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
35654         PermuteImm = (unsigned)BlendMask;
35655         Shuffle = X86ISD::BLENDI;
35656         ShuffleVT = MaskVT;
35657         return true;
35658       }
35659     }
35660   }
35661 
35662   // Attempt to combine to INSERTPS, but only if it has elements that need to
35663   // be set to zero.
35664   if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
35665       MaskVT.is128BitVector() && isAnyZero(Mask) &&
35666       matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
35667     Shuffle = X86ISD::INSERTPS;
35668     ShuffleVT = MVT::v4f32;
35669     return true;
35670   }
35671 
35672   // Attempt to combine to SHUFPD.
35673   if (AllowFloatDomain && EltSizeInBits == 64 &&
35674       ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
35675        (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
35676        (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
35677     bool ForceV1Zero = false, ForceV2Zero = false;
35678     if (matchShuffleWithSHUFPD(MaskVT, V1, V2, ForceV1Zero, ForceV2Zero,
35679                                PermuteImm, Mask, Zeroable)) {
35680       V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
35681       V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
35682       Shuffle = X86ISD::SHUFP;
35683       ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
35684       return true;
35685     }
35686   }
35687 
35688   // Attempt to combine to SHUFPS.
35689   if (AllowFloatDomain && EltSizeInBits == 32 &&
35690       ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
35691        (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
35692        (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
35693     SmallVector<int, 4> RepeatedMask;
35694     if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
35695       // Match each half of the repeated mask to determine if it's just
35696       // referencing one of the vectors, is zeroable, or is entirely undef.
35697       auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
35698         int M0 = RepeatedMask[Offset];
35699         int M1 = RepeatedMask[Offset + 1];
35700 
35701         if (isUndefInRange(RepeatedMask, Offset, 2)) {
35702           return DAG.getUNDEF(MaskVT);
35703         } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
35704           S0 = (SM_SentinelUndef == M0 ? -1 : 0);
35705           S1 = (SM_SentinelUndef == M1 ? -1 : 1);
35706           return getZeroVector(MaskVT, Subtarget, DAG, DL);
35707         } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
35708           S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
35709           S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
35710           return V1;
35711         } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
35712           S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
35713           S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
35714           return V2;
35715         }
35716 
35717         return SDValue();
35718       };
35719 
35720       int ShufMask[4] = {-1, -1, -1, -1};
35721       SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
35722       SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
35723 
35724       if (Lo && Hi) {
35725         V1 = Lo;
35726         V2 = Hi;
35727         Shuffle = X86ISD::SHUFP;
35728         ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
35729         PermuteImm = getV4X86ShuffleImm(ShufMask);
35730         return true;
35731       }
35732     }
35733   }
35734 
35735   // Attempt to combine to INSERTPS more generally if X86ISD::SHUFP failed.
35736   if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
35737       MaskVT.is128BitVector() &&
35738       matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
35739     Shuffle = X86ISD::INSERTPS;
35740     ShuffleVT = MVT::v4f32;
35741     return true;
35742   }
35743 
35744   return false;
35745 }
35746 
35747 static SDValue combineX86ShuffleChainWithExtract(
35748     ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
35749     bool HasVariableMask, bool AllowVariableCrossLaneMask,
35750     bool AllowVariablePerLaneMask, SelectionDAG &DAG,
35751     const X86Subtarget &Subtarget);
35752 
35753 /// Combine an arbitrary chain of shuffles into a single instruction if
35754 /// possible.
35755 ///
35756 /// This is the leaf of the recursive combine below. When we have found some
35757 /// chain of single-use x86 shuffle instructions and accumulated the combined
35758 /// shuffle mask represented by them, this will try to pattern match that mask
35759 /// into either a single instruction if there is a special purpose instruction
35760 /// for this operation, or into a PSHUFB instruction which is a fully general
35761 /// instruction but should only be used to replace chains over a certain depth.
35762 static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
35763                                       ArrayRef<int> BaseMask, int Depth,
35764                                       bool HasVariableMask,
35765                                       bool AllowVariableCrossLaneMask,
35766                                       bool AllowVariablePerLaneMask,
35767                                       SelectionDAG &DAG,
35768                                       const X86Subtarget &Subtarget) {
35769   assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
35770   assert((Inputs.size() == 1 || Inputs.size() == 2) &&
35771          "Unexpected number of shuffle inputs!");
35772 
35773   MVT RootVT = Root.getSimpleValueType();
35774   unsigned RootSizeInBits = RootVT.getSizeInBits();
35775   unsigned NumRootElts = RootVT.getVectorNumElements();
35776 
35777   // Canonicalize shuffle input op to the requested type.
35778   // TODO: Support cases where Op is smaller than VT.
35779   auto CanonicalizeShuffleInput = [&](MVT VT, SDValue Op) {
35780     return DAG.getBitcast(VT, Op);
35781   };
35782 
35783   // Find the inputs that enter the chain. Note that multiple uses are OK
35784   // here, we're not going to remove the operands we find.
35785   bool UnaryShuffle = (Inputs.size() == 1);
35786   SDValue V1 = peekThroughBitcasts(Inputs[0]);
35787   SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
35788                              : peekThroughBitcasts(Inputs[1]));
35789 
35790   MVT VT1 = V1.getSimpleValueType();
35791   MVT VT2 = V2.getSimpleValueType();
35792   assert(VT1.getSizeInBits() == RootSizeInBits &&
35793          VT2.getSizeInBits() == RootSizeInBits && "Vector size mismatch");
35794 
35795   SDLoc DL(Root);
35796   SDValue Res;
35797 
35798   unsigned NumBaseMaskElts = BaseMask.size();
35799   if (NumBaseMaskElts == 1) {
35800     assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
35801     return CanonicalizeShuffleInput(RootVT, V1);
35802   }
35803 
35804   bool OptForSize = DAG.shouldOptForSize();
35805   unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
35806   bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
35807                      (RootVT.isFloatingPoint() && Depth >= 1) ||
35808                      (RootVT.is256BitVector() && !Subtarget.hasAVX2());
35809 
35810   // Don't combine if we are an AVX512/EVEX target and the mask element size
35811   // is different from the root element size - this would prevent writemasks
35812   // from being reused.
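  // A shuffle is treated as masked when its only use is a VSELECT with a vXi1
  // condition, i.e. the DAG pattern of an AVX512 write-masked operation.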
35813   bool IsMaskedShuffle = false;
35814   if (RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128)) {
35815     if (Root.hasOneUse() && Root->use_begin()->getOpcode() == ISD::VSELECT &&
35816         Root->use_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
35817       IsMaskedShuffle = true;
35818     }
35819   }
35820 
35821   // If we are shuffling a broadcast (and not introducing zeros) then
35822   // we can just use the broadcast directly. This works for smaller broadcast
35823   // elements as well, as they already repeat across each mask element.
35824   if (UnaryShuffle && isTargetShuffleSplat(V1) && !isAnyZero(BaseMask) &&
35825       (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
35826       V1.getValueSizeInBits() >= RootSizeInBits) {
35827     return CanonicalizeShuffleInput(RootVT, V1);
35828   }
35829 
35830   // See if the shuffle is a hidden identity shuffle - repeated args in HOPs
35831   // etc. can be simplified.
35832   if (VT1 == VT2 && VT1.getSizeInBits() == RootSizeInBits && VT1.isVector()) {
35833     SmallVector<int> ScaledMask, IdentityMask;
35834     unsigned NumElts = VT1.getVectorNumElements();
35835     if (BaseMask.size() <= NumElts &&
35836         scaleShuffleElements(BaseMask, NumElts, ScaledMask)) {
35837       for (unsigned i = 0; i != NumElts; ++i)
35838         IdentityMask.push_back(i);
35839       if (isTargetShuffleEquivalent(RootVT, ScaledMask, IdentityMask, V1, V2))
35840         return CanonicalizeShuffleInput(RootVT, V1);
35841     }
35842   }
35843 
35844   // Handle 128/256-bit lane shuffles of 512-bit vectors.
35845   if (RootVT.is512BitVector() &&
35846       (NumBaseMaskElts == 2 || NumBaseMaskElts == 4)) {
35847     // If the upper subvectors are zeroable, then an extract+insert is more
35848     // optimal than using X86ISD::SHUF128. The insertion is free, even if it has
35849     // to zero the upper subvectors.
35850     if (isUndefOrZeroInRange(BaseMask, 1, NumBaseMaskElts - 1)) {
35851       if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
35852         return SDValue(); // Nothing to do!
35853       assert(isInRange(BaseMask[0], 0, NumBaseMaskElts) &&
35854              "Unexpected lane shuffle");
35855       Res = CanonicalizeShuffleInput(RootVT, V1);
35856       unsigned SubIdx = BaseMask[0] * (NumRootElts / NumBaseMaskElts);
35857       bool UseZero = isAnyZero(BaseMask);
35858       Res = extractSubVector(Res, SubIdx, DAG, DL, BaseMaskEltSizeInBits);
35859       return widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits);
35860     }
35861 
35862     // Narrow shuffle mask to v4x128.
35863     SmallVector<int, 4> Mask;
35864     assert((BaseMaskEltSizeInBits % 128) == 0 && "Illegal mask size");
35865     narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, BaseMask, Mask);
35866 
35867     // Try to lower to vshuf64x2/vshuf32x4.
35868     auto MatchSHUF128 = [&](MVT ShuffleVT, const SDLoc &DL, ArrayRef<int> Mask,
35869                             SDValue V1, SDValue V2, SelectionDAG &DAG) {
35870       unsigned PermMask = 0;
35871       // Ensure elements came from the same Op.
35872       SDValue Ops[2] = {DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT)};
35873       for (int i = 0; i < 4; ++i) {
35874         assert(Mask[i] >= -1 && "Illegal shuffle sentinel value");
35875         if (Mask[i] < 0)
35876           continue;
35877 
35878         SDValue Op = Mask[i] >= 4 ? V2 : V1;
35879         unsigned OpIndex = i / 2;
35880         if (Ops[OpIndex].isUndef())
35881           Ops[OpIndex] = Op;
35882         else if (Ops[OpIndex] != Op)
35883           return SDValue();
35884 
35885         // Convert the 128-bit shuffle mask selection values into 128-bit
35886         // selection bits defined by a vshuf64x2 instruction's immediate control
35887         // byte.
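        // For example, a v4x128 mask of {0,1,4,5} (low 256 bits of V1 followed
        // by the low 256 bits of V2) selects Ops = {V1, V2} and PermMask = 0x44.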
35888         PermMask |= (Mask[i] % 4) << (i * 2);
35889       }
35890 
35891       return DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
35892                          CanonicalizeShuffleInput(ShuffleVT, Ops[0]),
35893                          CanonicalizeShuffleInput(ShuffleVT, Ops[1]),
35894                          DAG.getTargetConstant(PermMask, DL, MVT::i8));
35895     };
35896 
35897     // FIXME: Is there a better way to do this? is256BitLaneRepeatedShuffleMask
35898     // doesn't work because our mask is for 128 bits and we don't have an MVT
35899     // to match that.
35900     bool PreferPERMQ =
35901         UnaryShuffle && isUndefOrInRange(Mask[0], 0, 2) &&
35902         isUndefOrInRange(Mask[1], 0, 2) && isUndefOrInRange(Mask[2], 2, 4) &&
35903         isUndefOrInRange(Mask[3], 2, 4) &&
35904         (Mask[0] < 0 || Mask[2] < 0 || Mask[0] == (Mask[2] % 2)) &&
35905         (Mask[1] < 0 || Mask[3] < 0 || Mask[1] == (Mask[3] % 2));
35906 
35907     if (!isAnyZero(Mask) && !PreferPERMQ) {
35908       if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
35909         return SDValue(); // Nothing to do!
35910       MVT ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
35911       if (SDValue V = MatchSHUF128(ShuffleVT, DL, Mask, V1, V2, DAG))
35912         return DAG.getBitcast(RootVT, V);
35913     }
35914   }
35915 
35916   // Handle 128-bit lane shuffles of 256-bit vectors.
35917   if (RootVT.is256BitVector() && NumBaseMaskElts == 2) {
35918     // If the upper half is zeroable, then an extract+insert is more optimal
35919     // than using X86ISD::VPERM2X128. The insertion is free, even if it has to
35920     // zero the upper half.
35921     if (isUndefOrZero(BaseMask[1])) {
35922       if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
35923         return SDValue(); // Nothing to do!
35924       assert(isInRange(BaseMask[0], 0, 2) && "Unexpected lane shuffle");
35925       Res = CanonicalizeShuffleInput(RootVT, V1);
35926       Res = extract128BitVector(Res, BaseMask[0] * (NumRootElts / 2), DAG, DL);
35927       return widenSubVector(Res, BaseMask[1] == SM_SentinelZero, Subtarget, DAG,
35928                             DL, 256);
35929     }
35930 
35931     // If we're splatting the low subvector, an insert-subvector 'concat'
35932     // pattern is quicker than VPERM2X128.
35933     // TODO: Add AVX2 support instead of VPERMQ/VPERMPD.
35934     if (BaseMask[0] == 0 && BaseMask[1] == 0 && !Subtarget.hasAVX2()) {
35935       if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
35936         return SDValue(); // Nothing to do!
35937       Res = CanonicalizeShuffleInput(RootVT, V1);
35938       Res = extractSubVector(Res, 0, DAG, DL, 128);
35939       return concatSubVectors(Res, Res, DAG, DL);
35940     }
35941 
35942     if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128)
35943       return SDValue(); // Nothing to do!
35944 
35945     // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
35946     // we need to use the zeroing feature.
35947     // Prefer blends for sequential shuffles unless we are optimizing for size.
35948     if (UnaryShuffle &&
35949         !(Subtarget.hasAVX2() && isUndefOrInRange(BaseMask, 0, 2)) &&
35950         (OptForSize || !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0))) {
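      // VPERM2F128/VPERM2I128 imm8: bits [1:0] pick the 128-bit source lane for
      // the lower half of the result and bits [5:4] for the upper half; setting
      // bit 3 (resp. bit 7) zeroes that half instead, which is why undef mask
      // elements are encoded as 0x8 here.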
35951       unsigned PermMask = 0;
35952       PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
35953       PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
35954       return DAG.getNode(
35955           X86ISD::VPERM2X128, DL, RootVT, CanonicalizeShuffleInput(RootVT, V1),
35956           DAG.getUNDEF(RootVT), DAG.getTargetConstant(PermMask, DL, MVT::i8));
35957     }
35958 
35959     if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
35960       return SDValue(); // Nothing to do!
35961 
35962     // TODO - handle AVX512VL cases with X86ISD::SHUF128.
35963     if (!UnaryShuffle && !IsMaskedShuffle) {
35964       assert(llvm::all_of(BaseMask, [](int M) { return 0 <= M && M < 4; }) &&
35965              "Unexpected shuffle sentinel value");
35966       // Prefer blends to X86ISD::VPERM2X128.
35967       if (!((BaseMask[0] == 0 && BaseMask[1] == 3) ||
35968             (BaseMask[0] == 2 && BaseMask[1] == 1))) {
35969         unsigned PermMask = 0;
35970         PermMask |= ((BaseMask[0] & 3) << 0);
35971         PermMask |= ((BaseMask[1] & 3) << 4);
35972         SDValue LHS = isInRange(BaseMask[0], 0, 2) ? V1 : V2;
35973         SDValue RHS = isInRange(BaseMask[1], 0, 2) ? V1 : V2;
35974         return DAG.getNode(X86ISD::VPERM2X128, DL, RootVT,
35975                           CanonicalizeShuffleInput(RootVT, LHS),
35976                           CanonicalizeShuffleInput(RootVT, RHS),
35977                           DAG.getTargetConstant(PermMask, DL, MVT::i8));
35978       }
35979     }
35980   }
35981 
35982   // For masks that have been widened to 128-bit elements or more,
35983   // narrow back down to 64-bit elements.
35984   SmallVector<int, 64> Mask;
35985   if (BaseMaskEltSizeInBits > 64) {
35986     assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
35987     int MaskScale = BaseMaskEltSizeInBits / 64;
35988     narrowShuffleMaskElts(MaskScale, BaseMask, Mask);
35989   } else {
35990     Mask.assign(BaseMask.begin(), BaseMask.end());
35991   }
35992 
35993   // For masked shuffles, we're trying to match the root width for better
35994   // writemask folding; attempt to scale the mask.
35995   // TODO - variable shuffles might need this to be widened again.
35996   if (IsMaskedShuffle && NumRootElts > Mask.size()) {
35997     assert((NumRootElts % Mask.size()) == 0 && "Illegal mask size");
35998     int MaskScale = NumRootElts / Mask.size();
35999     SmallVector<int, 64> ScaledMask;
36000     narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
36001     Mask = std::move(ScaledMask);
36002   }
36003 
36004   unsigned NumMaskElts = Mask.size();
36005   unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
36006 
36007   // Determine the effective mask value type.
36008   FloatDomain &= (32 <= MaskEltSizeInBits);
36009   MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
36010                            : MVT::getIntegerVT(MaskEltSizeInBits);
36011   MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
36012 
36013   // Only allow legal mask types.
36014   if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
36015     return SDValue();
36016 
36017   // Attempt to match the mask against known shuffle patterns.
36018   MVT ShuffleSrcVT, ShuffleVT;
36019   unsigned Shuffle, PermuteImm;
36020 
36021   // Which shuffle domains are permitted?
36022   // Permit domain crossing at higher combine depths.
36023   // TODO: Should we indicate which domain is preferred if both are allowed?
36024   bool AllowFloatDomain = FloatDomain || (Depth >= 3);
36025   bool AllowIntDomain = (!FloatDomain || (Depth >= 3)) && Subtarget.hasSSE2() &&
36026                         (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
36027 
36028   // Determine zeroable mask elements.
36029   APInt KnownUndef, KnownZero;
36030   resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
36031   APInt Zeroable = KnownUndef | KnownZero;
36032 
36033   if (UnaryShuffle) {
36034     // Attempt to match against broadcast-from-vector.
36035     // Limit AVX1 to cases where we're loading+broadcasting a scalar element.
36036     if ((Subtarget.hasAVX2() ||
36037          (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) &&
36038         (!IsMaskedShuffle || NumRootElts == NumMaskElts)) {
36039       if (isUndefOrEqual(Mask, 0)) {
36040         if (V1.getValueType() == MaskVT &&
36041             V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
36042             MayFoldLoad(V1.getOperand(0))) {
36043           if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
36044             return SDValue(); // Nothing to do!
36045           Res = V1.getOperand(0);
36046           Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
36047           return DAG.getBitcast(RootVT, Res);
36048         }
36049         if (Subtarget.hasAVX2()) {
36050           if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
36051             return SDValue(); // Nothing to do!
36052           Res = CanonicalizeShuffleInput(MaskVT, V1);
36053           Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
36054           return DAG.getBitcast(RootVT, Res);
36055         }
36056       }
36057     }
36058 
36059     SDValue NewV1 = V1; // Save operand in case early exit happens.
36060     if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
36061                           DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
36062                           ShuffleVT) &&
36063         (!IsMaskedShuffle ||
36064          (NumRootElts == ShuffleVT.getVectorNumElements()))) {
36065       if (Depth == 0 && Root.getOpcode() == Shuffle)
36066         return SDValue(); // Nothing to do!
36067       Res = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1);
36068       Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
36069       return DAG.getBitcast(RootVT, Res);
36070     }
36071 
36072     if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
36073                                  AllowIntDomain, Subtarget, Shuffle, ShuffleVT,
36074                                  PermuteImm) &&
36075         (!IsMaskedShuffle ||
36076          (NumRootElts == ShuffleVT.getVectorNumElements()))) {
36077       if (Depth == 0 && Root.getOpcode() == Shuffle)
36078         return SDValue(); // Nothing to do!
36079       Res = CanonicalizeShuffleInput(ShuffleVT, V1);
36080       Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
36081                         DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
36082       return DAG.getBitcast(RootVT, Res);
36083     }
36084   }
36085 
36086   // Attempt to combine to INSERTPS, but only if the inserted element has come
36087   // from a scalar.
36088   // TODO: Handle other insertions here as well?
36089   if (!UnaryShuffle && AllowFloatDomain && RootSizeInBits == 128 &&
36090       Subtarget.hasSSE41() &&
36091       !isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3})) {
36092     if (MaskEltSizeInBits == 32) {
36093       SDValue SrcV1 = V1, SrcV2 = V2;
36094       if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask,
36095                                  DAG) &&
36096           SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) {
36097         if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
36098           return SDValue(); // Nothing to do!
36099         Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
36100                           CanonicalizeShuffleInput(MVT::v4f32, SrcV1),
36101                           CanonicalizeShuffleInput(MVT::v4f32, SrcV2),
36102                           DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
36103         return DAG.getBitcast(RootVT, Res);
36104       }
36105     }
36106     if (MaskEltSizeInBits == 64 &&
36107         isTargetShuffleEquivalent(MaskVT, Mask, {0, 2}) &&
36108         V2.getOpcode() == ISD::SCALAR_TO_VECTOR &&
36109         V2.getScalarValueSizeInBits() <= 32) {
36110       if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
36111         return SDValue(); // Nothing to do!
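      // INSERTPS imm8: bits [7:6] select the source element, bits [5:4] the
      // destination slot, and bits [3:0] zero-mask individual result elements.
      // The scalar always lives in element 0 of the SCALAR_TO_VECTOR source, so
      // only the destination index (2) needs to be encoded.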
36112       PermuteImm = (/*DstIdx*/2 << 4) | (/*SrcIdx*/0 << 0);
36113       Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
36114                         CanonicalizeShuffleInput(MVT::v4f32, V1),
36115                         CanonicalizeShuffleInput(MVT::v4f32, V2),
36116                         DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
36117       return DAG.getBitcast(RootVT, Res);
36118     }
36119   }
36120 
36121   SDValue NewV1 = V1; // Save operands in case early exit happens.
36122   SDValue NewV2 = V2;
36123   if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
36124                          NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
36125                          ShuffleVT, UnaryShuffle) &&
36126       (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
36127     if (Depth == 0 && Root.getOpcode() == Shuffle)
36128       return SDValue(); // Nothing to do!
36129     NewV1 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1);
36130     NewV2 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV2);
36131     Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
36132     return DAG.getBitcast(RootVT, Res);
36133   }
36134 
36135   NewV1 = V1; // Save operands in case early exit happens.
36136   NewV2 = V2;
36137   if (matchBinaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
36138                                 AllowIntDomain, NewV1, NewV2, DL, DAG,
36139                                 Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
36140       (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
36141     if (Depth == 0 && Root.getOpcode() == Shuffle)
36142       return SDValue(); // Nothing to do!
36143     NewV1 = CanonicalizeShuffleInput(ShuffleVT, NewV1);
36144     NewV2 = CanonicalizeShuffleInput(ShuffleVT, NewV2);
36145     Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
36146                       DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
36147     return DAG.getBitcast(RootVT, Res);
36148   }
36149 
36150   // Typically from here on, we need an integer version of MaskVT.
36151   MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
36152   IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
36153 
36154   // Annoyingly, SSE4A instructions don't map into the above match helpers.
36155   if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
36156     uint64_t BitLen, BitIdx;
36157     if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
36158                             Zeroable)) {
36159       if (Depth == 0 && Root.getOpcode() == X86ISD::EXTRQI)
36160         return SDValue(); // Nothing to do!
36161       V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
36162       Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
36163                         DAG.getTargetConstant(BitLen, DL, MVT::i8),
36164                         DAG.getTargetConstant(BitIdx, DL, MVT::i8));
36165       return DAG.getBitcast(RootVT, Res);
36166     }
36167 
36168     if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
36169       if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTQI)
36170         return SDValue(); // Nothing to do!
36171       V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
36172       V2 = CanonicalizeShuffleInput(IntMaskVT, V2);
36173       Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
36174                         DAG.getTargetConstant(BitLen, DL, MVT::i8),
36175                         DAG.getTargetConstant(BitIdx, DL, MVT::i8));
36176       return DAG.getBitcast(RootVT, Res);
36177     }
36178   }
36179 
36180   // Match shuffle against TRUNCATE patterns.
36181   if (AllowIntDomain && MaskEltSizeInBits < 64 && Subtarget.hasAVX512()) {
36182     // Match against a VTRUNC instruction, accounting for src/dst sizes.
36183     if (matchShuffleAsVTRUNC(ShuffleSrcVT, ShuffleVT, IntMaskVT, Mask, Zeroable,
36184                              Subtarget)) {
36185       bool IsTRUNCATE = ShuffleVT.getVectorNumElements() ==
36186                         ShuffleSrcVT.getVectorNumElements();
36187       unsigned Opc =
36188           IsTRUNCATE ? (unsigned)ISD::TRUNCATE : (unsigned)X86ISD::VTRUNC;
36189       if (Depth == 0 && Root.getOpcode() == Opc)
36190         return SDValue(); // Nothing to do!
36191       V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
36192       Res = DAG.getNode(Opc, DL, ShuffleVT, V1);
36193       if (ShuffleVT.getSizeInBits() < RootSizeInBits)
36194         Res = widenSubVector(Res, true, Subtarget, DAG, DL, RootSizeInBits);
36195       return DAG.getBitcast(RootVT, Res);
36196     }
36197 
36198     // Do we need a more general binary truncation pattern?
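    // A mask of {0,2,4,...} over the concatenation of V1 and V2 takes the low
    // half of every double-width element, which is exactly a truncate of
    // concat(V1,V2) reinterpreted at twice the element width.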
36199     if (RootSizeInBits < 512 &&
36200         ((RootVT.is256BitVector() && Subtarget.useAVX512Regs()) ||
36201          (RootVT.is128BitVector() && Subtarget.hasVLX())) &&
36202         (MaskEltSizeInBits > 8 || Subtarget.hasBWI()) &&
36203         isSequentialOrUndefInRange(Mask, 0, NumMaskElts, 0, 2)) {
36204       if (Depth == 0 && Root.getOpcode() == ISD::TRUNCATE)
36205         return SDValue(); // Nothing to do!
36206       ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
36207       ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2);
36208       V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
36209       V2 = CanonicalizeShuffleInput(ShuffleSrcVT, V2);
36210       ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
36211       ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts);
36212       Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShuffleSrcVT, V1, V2);
36213       Res = DAG.getNode(ISD::TRUNCATE, DL, IntMaskVT, Res);
36214       return DAG.getBitcast(RootVT, Res);
36215     }
36216   }
36217 
36218   // Don't try to re-form single instruction chains under any circumstances now
36219   // that we've done encoding canonicalization for them.
36220   if (Depth < 1)
36221     return SDValue();
36222 
36223   // Depth threshold above which we can efficiently use variable mask shuffles.
36224   int VariableCrossLaneShuffleDepth =
36225       Subtarget.hasFastVariableCrossLaneShuffle() ? 1 : 2;
36226   int VariablePerLaneShuffleDepth =
36227       Subtarget.hasFastVariablePerLaneShuffle() ? 1 : 2;
36228   AllowVariableCrossLaneMask &=
36229       (Depth >= VariableCrossLaneShuffleDepth) || HasVariableMask;
36230   AllowVariablePerLaneMask &=
36231       (Depth >= VariablePerLaneShuffleDepth) || HasVariableMask;
36232   // VPERMI2W/VPERMI2B are 3 uops on Skylake and Icelake so we require a
36233   // higher depth before combining them.
36234   bool AllowBWIVPERMV3 =
36235       (Depth >= (VariableCrossLaneShuffleDepth + 2) || HasVariableMask);
36236 
36237   bool MaskContainsZeros = isAnyZero(Mask);
36238 
36239   if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
36240     // If we have a single input lane-crossing shuffle then lower to VPERMV.
36241     if (UnaryShuffle && AllowVariableCrossLaneMask && !MaskContainsZeros) {
36242       if (Subtarget.hasAVX2() &&
36243           (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) {
36244         SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
36245         Res = CanonicalizeShuffleInput(MaskVT, V1);
36246         Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
36247         return DAG.getBitcast(RootVT, Res);
36248       }
36249       // AVX512 variants (non-VLX will pad to 512-bit shuffles).
36250       if ((Subtarget.hasAVX512() &&
36251            (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
36252             MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
36253           (Subtarget.hasBWI() &&
36254            (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
36255           (Subtarget.hasVBMI() &&
36256            (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8))) {
36257         V1 = CanonicalizeShuffleInput(MaskVT, V1);
36258         V2 = DAG.getUNDEF(MaskVT);
36259         Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
36260         return DAG.getBitcast(RootVT, Res);
36261       }
36262     }
36263 
36264     // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
36265     // vector as the second source (non-VLX will pad to 512-bit shuffles).
36266     if (UnaryShuffle && AllowVariableCrossLaneMask &&
36267         ((Subtarget.hasAVX512() &&
36268           (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
36269            MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
36270            MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32 ||
36271            MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
36272          (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
36273           (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
36274          (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
36275           (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
36276       // Adjust shuffle mask - replace SM_SentinelZero with second source index.
36277       for (unsigned i = 0; i != NumMaskElts; ++i)
36278         if (Mask[i] == SM_SentinelZero)
36279           Mask[i] = NumMaskElts + i;
36280       V1 = CanonicalizeShuffleInput(MaskVT, V1);
36281       V2 = getZeroVector(MaskVT, Subtarget, DAG, DL);
36282       Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
36283       return DAG.getBitcast(RootVT, Res);
36284     }
36285 
36286     // If that failed and either input is extracted then try to combine as a
36287     // shuffle with the larger type.
36288     if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
36289             Inputs, Root, BaseMask, Depth, HasVariableMask,
36290             AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG,
36291             Subtarget))
36292       return WideShuffle;
36293 
36294     // If we have a dual input lane-crossing shuffle then lower to VPERMV3,
36295     // (non-VLX will pad to 512-bit shuffles).
36296     if (AllowVariableCrossLaneMask && !MaskContainsZeros &&
36297         ((Subtarget.hasAVX512() &&
36298           (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
36299            MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
36300            MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32 ||
36301            MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
36302          (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
36303           (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
36304          (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
36305           (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
36306       V1 = CanonicalizeShuffleInput(MaskVT, V1);
36307       V2 = CanonicalizeShuffleInput(MaskVT, V2);
36308       Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
36309       return DAG.getBitcast(RootVT, Res);
36310     }
36311     return SDValue();
36312   }
36313 
36314   // See if we can combine a single input shuffle with zeros to a bit-mask,
36315   // which is much simpler than any shuffle.
36316   if (UnaryShuffle && MaskContainsZeros && AllowVariablePerLaneMask &&
36317       isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
36318       DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
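    // For example, Mask = {0, SM_SentinelZero, 2, SM_SentinelUndef} folds to an
    // AND with the constant vector {all-ones, 0, all-ones, undef}.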
36319     APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
36320     APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits);
36321     APInt UndefElts(NumMaskElts, 0);
36322     SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
36323     for (unsigned i = 0; i != NumMaskElts; ++i) {
36324       int M = Mask[i];
36325       if (M == SM_SentinelUndef) {
36326         UndefElts.setBit(i);
36327         continue;
36328       }
36329       if (M == SM_SentinelZero)
36330         continue;
36331       EltBits[i] = AllOnes;
36332     }
36333     SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
36334     Res = CanonicalizeShuffleInput(MaskVT, V1);
36335     unsigned AndOpcode =
36336         MaskVT.isFloatingPoint() ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
36337     Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
36338     return DAG.getBitcast(RootVT, Res);
36339   }
36340 
36341   // If we have a single input shuffle with different shuffle patterns in the
36342   // 128-bit lanes, use a variable mask with VPERMILPS.
36343   // TODO: Combine other mask types at higher depths.
36344   if (UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
36345       ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
36346        (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
36347     SmallVector<SDValue, 16> VPermIdx;
36348     for (int M : Mask) {
36349       SDValue Idx =
36350           M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
36351       VPermIdx.push_back(Idx);
36352     }
36353     SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
36354     Res = CanonicalizeShuffleInput(MaskVT, V1);
36355     Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
36356     return DAG.getBitcast(RootVT, Res);
36357   }
36358 
36359   // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
36360   // to VPERMIL2PD/VPERMIL2PS.
36361   if (AllowVariablePerLaneMask && Subtarget.hasXOP() &&
36362       (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
36363        MaskVT == MVT::v8f32)) {
36364     // VPERMIL2 Operation.
36365     // Bits[3] - Match Bit.
36366     // Bits[2:1] - (Per Lane) PD Shuffle Mask.
36367     // Bits[2:0] - (Per Lane) PS Shuffle Mask.
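    // Each VPerm2Idx entry selects within its 128-bit lane: values in
    // [0, NumEltsPerLane) come from V1 and values in [NumEltsPerLane,
    // 2*NumEltsPerLane) from V2. For the 64-bit PD form the index is shifted
    // left by one since its selector lives in bits [2:1].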
36368     unsigned NumLanes = MaskVT.getSizeInBits() / 128;
36369     unsigned NumEltsPerLane = NumMaskElts / NumLanes;
36370     SmallVector<int, 8> VPerm2Idx;
36371     unsigned M2ZImm = 0;
36372     for (int M : Mask) {
36373       if (M == SM_SentinelUndef) {
36374         VPerm2Idx.push_back(-1);
36375         continue;
36376       }
36377       if (M == SM_SentinelZero) {
36378         M2ZImm = 2;
36379         VPerm2Idx.push_back(8);
36380         continue;
36381       }
36382       int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
36383       Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
36384       VPerm2Idx.push_back(Index);
36385     }
36386     V1 = CanonicalizeShuffleInput(MaskVT, V1);
36387     V2 = CanonicalizeShuffleInput(MaskVT, V2);
36388     SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
36389     Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
36390                       DAG.getTargetConstant(M2ZImm, DL, MVT::i8));
36391     return DAG.getBitcast(RootVT, Res);
36392   }
36393 
36394   // If we have 3 or more shuffle instructions or a chain involving a variable
36395   // mask, we can replace them with a single PSHUFB instruction profitably.
36396   // Intel's manuals suggest only using PSHUFB if doing so replaces 5
36397   // instructions, but in practice PSHUFB tends to be *very* fast so we're
36398   // more aggressive.
36399   if (UnaryShuffle && AllowVariablePerLaneMask &&
36400       ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
36401        (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
36402        (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
36403     SmallVector<SDValue, 16> PSHUFBMask;
36404     int NumBytes = RootVT.getSizeInBits() / 8;
36405     int Ratio = NumBytes / NumMaskElts;
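    // PSHUFB zeroes a destination byte whenever bit 7 of its mask byte is set
    // (hence 0x80 for zeroable elements); otherwise the low 4 bits index a byte
    // within the same 128-bit lane.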
36406     for (int i = 0; i < NumBytes; ++i) {
36407       int M = Mask[i / Ratio];
36408       if (M == SM_SentinelUndef) {
36409         PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
36410         continue;
36411       }
36412       if (M == SM_SentinelZero) {
36413         PSHUFBMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
36414         continue;
36415       }
36416       M = Ratio * M + i % Ratio;
36417       assert((M / 16) == (i / 16) && "Lane crossing detected");
36418       PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
36419     }
36420     MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
36421     Res = CanonicalizeShuffleInput(ByteVT, V1);
36422     SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
36423     Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
36424     return DAG.getBitcast(RootVT, Res);
36425   }
36426 
36427   // With XOP, if we have a 128-bit binary input shuffle we can always combine
36428   // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
36429   // slower than PSHUFB on targets that support both.
36430   if (AllowVariablePerLaneMask && RootVT.is128BitVector() &&
36431       Subtarget.hasXOP()) {
36432     // VPPERM Mask Operation
36433     // Bits[4:0] - Byte Index (0 - 31)
36434     // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
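    // A mask byte of 0x80 therefore encodes operation 4, forcing that result
    // byte to zero.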
36435     SmallVector<SDValue, 16> VPPERMMask;
36436     int NumBytes = 16;
36437     int Ratio = NumBytes / NumMaskElts;
36438     for (int i = 0; i < NumBytes; ++i) {
36439       int M = Mask[i / Ratio];
36440       if (M == SM_SentinelUndef) {
36441         VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
36442         continue;
36443       }
36444       if (M == SM_SentinelZero) {
36445         VPPERMMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
36446         continue;
36447       }
36448       M = Ratio * M + i % Ratio;
36449       VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
36450     }
36451     MVT ByteVT = MVT::v16i8;
36452     V1 = CanonicalizeShuffleInput(ByteVT, V1);
36453     V2 = CanonicalizeShuffleInput(ByteVT, V2);
36454     SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
36455     Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
36456     return DAG.getBitcast(RootVT, Res);
36457   }
36458 
36459   // If that failed and either input is extracted then try to combine as a
36460   // shuffle with the larger type.
36461   if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
36462           Inputs, Root, BaseMask, Depth, HasVariableMask,
36463           AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG, Subtarget))
36464     return WideShuffle;
36465 
36466   // If we have a dual input shuffle then lower to VPERMV3,
36467   // (non-VLX will pad to 512-bit shuffles)
36468   if (!UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
36469       ((Subtarget.hasAVX512() &&
36470         (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v8f64 ||
36471          MaskVT == MVT::v2i64 || MaskVT == MVT::v4i64 || MaskVT == MVT::v8i64 ||
36472          MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 || MaskVT == MVT::v8f32 ||
36473          MaskVT == MVT::v8i32 || MaskVT == MVT::v16f32 ||
36474          MaskVT == MVT::v16i32)) ||
36475        (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
36476         (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16 ||
36477          MaskVT == MVT::v32i16)) ||
36478        (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
36479         (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8 ||
36480          MaskVT == MVT::v64i8)))) {
36481     V1 = CanonicalizeShuffleInput(MaskVT, V1);
36482     V2 = CanonicalizeShuffleInput(MaskVT, V2);
36483     Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
36484     return DAG.getBitcast(RootVT, Res);
36485   }
36486 
36487   // Failed to find any combines.
36488   return SDValue();
36489 }
36490 
36491 // Combine an arbitrary chain of shuffles + extract_subvectors into a single
36492 // instruction if possible.
36493 //
36494 // Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger
36495 // type size to attempt to combine:
36496 // shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)
36497 // -->
36498 // extract_subvector(shuffle(x,y,m2),0)
36499 static SDValue combineX86ShuffleChainWithExtract(
36500     ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
36501     bool HasVariableMask, bool AllowVariableCrossLaneMask,
36502     bool AllowVariablePerLaneMask, SelectionDAG &DAG,
36503     const X86Subtarget &Subtarget) {
36504   unsigned NumMaskElts = BaseMask.size();
36505   unsigned NumInputs = Inputs.size();
36506   if (NumInputs == 0)
36507     return SDValue();
36508 
36509   EVT RootVT = Root.getValueType();
36510   unsigned RootSizeInBits = RootVT.getSizeInBits();
36511   assert((RootSizeInBits % NumMaskElts) == 0 && "Unexpected root shuffle mask");
36512 
36513   SmallVector<SDValue, 4> WideInputs(Inputs.begin(), Inputs.end());
36514   SmallVector<unsigned, 4> Offsets(NumInputs, 0);
36515 
36516   // Peek through subvectors.
36517   // TODO: Support inter-mixed EXTRACT_SUBVECTORs + BITCASTs?
36518   unsigned WideSizeInBits = RootSizeInBits;
36519   for (unsigned i = 0; i != NumInputs; ++i) {
36520     SDValue &Src = WideInputs[i];
36521     unsigned &Offset = Offsets[i];
36522     Src = peekThroughBitcasts(Src);
36523     EVT BaseVT = Src.getValueType();
36524     while (Src.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
36525       Offset += Src.getConstantOperandVal(1);
36526       Src = Src.getOperand(0);
36527     }
36528     WideSizeInBits = std::max(WideSizeInBits,
36529                               (unsigned)Src.getValueSizeInBits());
36530     assert((Offset % BaseVT.getVectorNumElements()) == 0 &&
36531            "Unexpected subvector extraction");
36532     Offset /= BaseVT.getVectorNumElements();
36533     Offset *= NumMaskElts;
36534   }
36535 
36536   // Bail if we're always extracting from the lowest subvectors;
36537   // combineX86ShuffleChain should match this for the current width.
36538   if (llvm::all_of(Offsets, [](unsigned Offset) { return Offset == 0; }))
36539     return SDValue();
36540 
36541   unsigned Scale = WideSizeInBits / RootSizeInBits;
36542   assert((WideSizeInBits % RootSizeInBits) == 0 &&
36543          "Unexpected subvector extraction");
36544 
36545   // If the src vector types aren't the same, see if we can extend
36546   // them to match each other.
36547   // TODO: Support different scalar types?
36548   EVT WideSVT = WideInputs[0].getValueType().getScalarType();
36549   if (llvm::any_of(WideInputs, [&WideSVT, &DAG](SDValue Op) {
36550         return !DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()) ||
36551                Op.getValueType().getScalarType() != WideSVT;
36552       }))
36553     return SDValue();
36554 
36555   for (SDValue &NewInput : WideInputs) {
36556     assert((WideSizeInBits % NewInput.getValueSizeInBits()) == 0 &&
36557            "Shuffle vector size mismatch");
36558     if (WideSizeInBits > NewInput.getValueSizeInBits())
36559       NewInput = widenSubVector(NewInput, false, Subtarget, DAG,
36560                                 SDLoc(NewInput), WideSizeInBits);
36561     assert(WideSizeInBits == NewInput.getValueSizeInBits() &&
36562            "Unexpected subvector extraction");
36563   }
36564 
36565   // Create new mask for larger type.
36566   for (unsigned i = 1; i != NumInputs; ++i)
36567     Offsets[i] += i * Scale * NumMaskElts;
36568 
36569   SmallVector<int, 64> WideMask(BaseMask.begin(), BaseMask.end());
36570   for (int &M : WideMask) {
36571     if (M < 0)
36572       continue;
36573     M = (M % NumMaskElts) + Offsets[M / NumMaskElts];
36574   }
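  // Only the low RootSizeInBits of the wide shuffle are used (the result is
  // extracted at index 0 below), so the appended tail of the mask can stay
  // undef.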
36575   WideMask.append((Scale - 1) * NumMaskElts, SM_SentinelUndef);
36576 
36577   // Remove unused/repeated shuffle source ops.
36578   resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
36579   assert(!WideInputs.empty() && "Shuffle with no inputs detected");
36580 
36581   if (WideInputs.size() > 2)
36582     return SDValue();
36583 
36584   // Increase depth for every upper subvector we've peeked through.
36585   Depth += count_if(Offsets, [](unsigned Offset) { return Offset > 0; });
36586 
36587   // Attempt to combine wider chain.
36588   // TODO: Can we use a better Root?
36589   SDValue WideRoot = WideInputs[0];
36590   if (SDValue WideShuffle =
36591           combineX86ShuffleChain(WideInputs, WideRoot, WideMask, Depth,
36592                                  HasVariableMask, AllowVariableCrossLaneMask,
36593                                  AllowVariablePerLaneMask, DAG, Subtarget)) {
36594     WideShuffle =
36595         extractSubVector(WideShuffle, 0, DAG, SDLoc(Root), RootSizeInBits);
36596     return DAG.getBitcast(RootVT, WideShuffle);
36597   }
36598   return SDValue();
36599 }
36600 
36601 // Canonicalize the combined shuffle mask chain with horizontal ops.
36602 // NOTE: This may update the Ops and Mask.
36603 static SDValue canonicalizeShuffleMaskWithHorizOp(
36604     MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
36605     unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
36606     const X86Subtarget &Subtarget) {
36607   if (Mask.empty() || Ops.empty())
36608     return SDValue();
36609 
36610   SmallVector<SDValue> BC;
36611   for (SDValue Op : Ops)
36612     BC.push_back(peekThroughBitcasts(Op));
36613 
36614   // All ops must be the same horizop + type.
36615   SDValue BC0 = BC[0];
36616   EVT VT0 = BC0.getValueType();
36617   unsigned Opcode0 = BC0.getOpcode();
36618   if (VT0.getSizeInBits() != RootSizeInBits || llvm::any_of(BC, [&](SDValue V) {
36619         return V.getOpcode() != Opcode0 || V.getValueType() != VT0;
36620       }))
36621     return SDValue();
36622 
36623   bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
36624                   Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB);
36625   bool isPack = (Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS);
36626   if (!isHoriz && !isPack)
36627     return SDValue();
36628 
36629   // Do all ops have a single use?
36630   bool OneUseOps = llvm::all_of(Ops, [](SDValue Op) {
36631     return Op.hasOneUse() &&
36632            peekThroughBitcasts(Op) == peekThroughOneUseBitcasts(Op);
36633   });
36634 
36635   int NumElts = VT0.getVectorNumElements();
36636   int NumLanes = VT0.getSizeInBits() / 128;
36637   int NumEltsPerLane = NumElts / NumLanes;
36638   int NumHalfEltsPerLane = NumEltsPerLane / 2;
36639   MVT SrcVT = BC0.getOperand(0).getSimpleValueType();
36640   unsigned EltSizeInBits = RootSizeInBits / Mask.size();
36641 
36642   if (NumEltsPerLane >= 4 &&
36643       (isPack || shouldUseHorizontalOp(Ops.size() == 1, DAG, Subtarget))) {
36644     SmallVector<int> LaneMask, ScaledMask;
36645     if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, LaneMask) &&
36646         scaleShuffleElements(LaneMask, 4, ScaledMask)) {
36647       // See if we can remove the shuffle by reordering the HOP chain so that
36648       // the HOP args are pre-shuffled.
36649       // TODO: Generalize to any sized/depth chain.
36650       // TODO: Add support for PACKSS/PACKUS.
36651       if (isHoriz) {
36652         // Attempt to find a HOP(HOP(X,Y),HOP(Z,W)) source operand.
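        // For a scaled 4-element mask, element M addresses shuffle input BC[M/4]
        // (itself a HOP), then that HOP's operand (M%4) >= 2, which must also be
        // a HOP of the same kind, and finally operand M%2 of that inner HOP.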
36653         auto GetHOpSrc = [&](int M) {
36654           if (M == SM_SentinelUndef)
36655             return DAG.getUNDEF(VT0);
36656           if (M == SM_SentinelZero)
36657             return getZeroVector(VT0.getSimpleVT(), Subtarget, DAG, DL);
36658           SDValue Src0 = BC[M / 4];
36659           SDValue Src1 = Src0.getOperand((M % 4) >= 2);
36660           if (Src1.getOpcode() == Opcode0 && Src0->isOnlyUserOf(Src1.getNode()))
36661             return Src1.getOperand(M % 2);
36662           return SDValue();
36663         };
36664         SDValue M0 = GetHOpSrc(ScaledMask[0]);
36665         SDValue M1 = GetHOpSrc(ScaledMask[1]);
36666         SDValue M2 = GetHOpSrc(ScaledMask[2]);
36667         SDValue M3 = GetHOpSrc(ScaledMask[3]);
36668         if (M0 && M1 && M2 && M3) {
36669           SDValue LHS = DAG.getNode(Opcode0, DL, SrcVT, M0, M1);
36670           SDValue RHS = DAG.getNode(Opcode0, DL, SrcVT, M2, M3);
36671           return DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
36672         }
36673       }
36674       // shuffle(hop(x,y),hop(z,w)) -> permute(hop(x,z)) etc.
36675       if (Ops.size() >= 2) {
36676         SDValue LHS, RHS;
36677         auto GetHOpSrc = [&](int M, int &OutM) {
36678           // TODO: Support SM_SentinelZero
36679           if (M < 0)
36680             return M == SM_SentinelUndef;
36681           SDValue Src = BC[M / 4].getOperand((M % 4) >= 2);
36682           if (!LHS || LHS == Src) {
36683             LHS = Src;
36684             OutM = (M % 2);
36685             return true;
36686           }
36687           if (!RHS || RHS == Src) {
36688             RHS = Src;
36689             OutM = (M % 2) + 2;
36690             return true;
36691           }
36692           return false;
36693         };
36694         int PostMask[4] = {-1, -1, -1, -1};
36695         if (GetHOpSrc(ScaledMask[0], PostMask[0]) &&
36696             GetHOpSrc(ScaledMask[1], PostMask[1]) &&
36697             GetHOpSrc(ScaledMask[2], PostMask[2]) &&
36698             GetHOpSrc(ScaledMask[3], PostMask[3])) {
36699           LHS = DAG.getBitcast(SrcVT, LHS);
36700           RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
36701           SDValue Res = DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
36702           // Use SHUFPS for the permute so this will work on SSE3 targets;
36703           // shuffle combining and domain handling will simplify this later on.
36704           MVT ShuffleVT = MVT::getVectorVT(MVT::f32, RootSizeInBits / 32);
36705           Res = DAG.getBitcast(ShuffleVT, Res);
36706           return DAG.getNode(X86ISD::SHUFP, DL, ShuffleVT, Res, Res,
36707                              getV4X86ShuffleImm8ForMask(PostMask, DL, DAG));
36708         }
36709       }
36710     }
36711   }
36712 
36713   if (2 < Ops.size())
36714     return SDValue();
36715 
36716   SDValue BC1 = BC[BC.size() - 1];
36717   if (Mask.size() == VT0.getVectorNumElements()) {
36718     // Canonicalize binary shuffles of horizontal ops that use the
36719     // same sources to a unary shuffle.
36720     // TODO: Try to perform this fold even if the shuffle remains.
36721     if (Ops.size() == 2) {
36722       auto ContainsOps = [](SDValue HOp, SDValue Op) {
36723         return Op == HOp.getOperand(0) || Op == HOp.getOperand(1);
36724       };
36725       // Commute if all BC0's ops are contained in BC1.
36726       if (ContainsOps(BC1, BC0.getOperand(0)) &&
36727           ContainsOps(BC1, BC0.getOperand(1))) {
36728         ShuffleVectorSDNode::commuteMask(Mask);
36729         std::swap(Ops[0], Ops[1]);
36730         std::swap(BC0, BC1);
36731       }
36732 
36733       // If BC1 can be represented by BC0, then convert to unary shuffle.
36734       if (ContainsOps(BC0, BC1.getOperand(0)) &&
36735           ContainsOps(BC0, BC1.getOperand(1))) {
36736         for (int &M : Mask) {
36737           if (M < NumElts) // BC0 element or UNDEF/Zero sentinel.
36738             continue;
36739           int SubLane = ((M % NumEltsPerLane) >= NumHalfEltsPerLane) ? 1 : 0;
36740           M -= NumElts + (SubLane * NumHalfEltsPerLane);
36741           if (BC1.getOperand(SubLane) != BC0.getOperand(0))
36742             M += NumHalfEltsPerLane;
36743         }
36744       }
36745     }
36746 
36747     // Canonicalize unary horizontal ops to only refer to lower halves.
36748     for (int i = 0; i != NumElts; ++i) {
36749       int &M = Mask[i];
36750       if (isUndefOrZero(M))
36751         continue;
36752       if (M < NumElts && BC0.getOperand(0) == BC0.getOperand(1) &&
36753           (M % NumEltsPerLane) >= NumHalfEltsPerLane)
36754         M -= NumHalfEltsPerLane;
36755       if (NumElts <= M && BC1.getOperand(0) == BC1.getOperand(1) &&
36756           (M % NumEltsPerLane) >= NumHalfEltsPerLane)
36757         M -= NumHalfEltsPerLane;
36758     }
36759   }
36760 
36761   // Combine a binary shuffle of 2 similar 'Horizontal' instructions into a
36762   // single instruction. Attempt to match a v2X64 repeating shuffle pattern that
36763   // represents the LHS/RHS inputs for the lower/upper halves.
36764   SmallVector<int, 16> TargetMask128, WideMask128;
36765   if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, TargetMask128) &&
36766       scaleShuffleElements(TargetMask128, 2, WideMask128)) {
36767     assert(isUndefOrZeroOrInRange(WideMask128, 0, 4) && "Illegal shuffle");
36768     bool SingleOp = (Ops.size() == 1);
36769     if (isPack || OneUseOps ||
36770         shouldUseHorizontalOp(SingleOp, DAG, Subtarget)) {
36771       SDValue Lo = isInRange(WideMask128[0], 0, 2) ? BC0 : BC1;
36772       SDValue Hi = isInRange(WideMask128[1], 0, 2) ? BC0 : BC1;
36773       Lo = Lo.getOperand(WideMask128[0] & 1);
36774       Hi = Hi.getOperand(WideMask128[1] & 1);
36775       if (SingleOp) {
36776         SDValue Undef = DAG.getUNDEF(SrcVT);
36777         SDValue Zero = getZeroVector(SrcVT, Subtarget, DAG, DL);
36778         Lo = (WideMask128[0] == SM_SentinelZero ? Zero : Lo);
36779         Hi = (WideMask128[1] == SM_SentinelZero ? Zero : Hi);
36780         Lo = (WideMask128[0] == SM_SentinelUndef ? Undef : Lo);
36781         Hi = (WideMask128[1] == SM_SentinelUndef ? Undef : Hi);
36782       }
36783       return DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
36784     }
36785   }
36786 
36787   return SDValue();
36788 }
36789 
36790 // Attempt to constant fold all of the constant source ops.
36791 // Returns the folded constant vector if the entire shuffle folds to a constant.
36792 // TODO: Extend this to merge multiple constant Ops and update the mask.
36793 static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,
36794                                            ArrayRef<int> Mask, SDValue Root,
36795                                            bool HasVariableMask,
36796                                            SelectionDAG &DAG,
36797                                            const X86Subtarget &Subtarget) {
36798   MVT VT = Root.getSimpleValueType();
36799 
36800   unsigned SizeInBits = VT.getSizeInBits();
36801   unsigned NumMaskElts = Mask.size();
36802   unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
36803   unsigned NumOps = Ops.size();
36804 
36805   // Extract constant bits from each source op.
36806   bool OneUseConstantOp = false;
36807   SmallVector<APInt, 16> UndefEltsOps(NumOps);
36808   SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
36809   for (unsigned i = 0; i != NumOps; ++i) {
36810     SDValue SrcOp = Ops[i];
36811     OneUseConstantOp |= SrcOp.hasOneUse();
36812     if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],
36813                                        RawBitsOps[i]))
36814       return SDValue();
36815   }
36816 
36817   // Only fold if at least one of the constants is only used once or
36818   // the combined shuffle has included a variable mask shuffle; this
36819   // is to avoid constant pool bloat.
36820   if (!OneUseConstantOp && !HasVariableMask)
36821     return SDValue();
36822 
36823   // Shuffle the constant bits according to the mask.
36824   SDLoc DL(Root);
36825   APInt UndefElts(NumMaskElts, 0);
36826   APInt ZeroElts(NumMaskElts, 0);
36827   APInt ConstantElts(NumMaskElts, 0);
36828   SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
36829                                         APInt::getNullValue(MaskSizeInBits));
36830   for (unsigned i = 0; i != NumMaskElts; ++i) {
36831     int M = Mask[i];
36832     if (M == SM_SentinelUndef) {
36833       UndefElts.setBit(i);
36834       continue;
36835     } else if (M == SM_SentinelZero) {
36836       ZeroElts.setBit(i);
36837       continue;
36838     }
36839     assert(0 <= M && M < (int)(NumMaskElts * NumOps));
36840 
36841     unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
36842     unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
36843 
36844     auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
36845     if (SrcUndefElts[SrcMaskIdx]) {
36846       UndefElts.setBit(i);
36847       continue;
36848     }
36849 
36850     auto &SrcEltBits = RawBitsOps[SrcOpIdx];
36851     APInt &Bits = SrcEltBits[SrcMaskIdx];
36852     if (!Bits) {
36853       ZeroElts.setBit(i);
36854       continue;
36855     }
36856 
36857     ConstantElts.setBit(i);
36858     ConstantBitData[i] = Bits;
36859   }
36860   assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue());
36861 
36862   // Attempt to create a zero vector.
36863   if ((UndefElts | ZeroElts).isAllOnesValue())
36864     return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG, DL);
36865 
36866   // Create the constant data.
36867   MVT MaskSVT;
36868   if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
36869     MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
36870   else
36871     MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
36872 
36873   MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
36874   if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
36875     return SDValue();
36876 
36877   SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
36878   return DAG.getBitcast(VT, CstOp);
36879 }
36880 
36881 namespace llvm {
36882   namespace X86 {
36883     enum {
36884       MaxShuffleCombineDepth = 8
36885     };
36886   }
36887 } // namespace llvm
36888 
36889 /// Fully generic combining of x86 shuffle instructions.
36890 ///
36891 /// This should be the last combine run over the x86 shuffle instructions. Once
36892 /// they have been fully optimized, this will recursively consider all chains
36893 /// of single-use shuffle instructions, build a generic model of the cumulative
36894 /// shuffle operation, and check for simpler instructions which implement this
36895 /// operation. We use this primarily for two purposes:
36896 ///
36897 /// 1) Collapse generic shuffles to specialized single instructions when
36898 ///    equivalent. In most cases, this is just an encoding size win, but
36899 ///    sometimes we will collapse multiple generic shuffles into a single
36900 ///    special-purpose shuffle.
36901 /// 2) Look for sequences of shuffle instructions with 3 or more total
36902 ///    instructions, and replace them with the slightly more expensive SSSE3
36903 ///    PSHUFB instruction if available. We do this as the last combining step
36904 ///    to ensure we avoid using PSHUFB if we can implement the shuffle with
36905 ///    a suitable short sequence of other instructions. The PSHUFB will either
36906 ///    use a register or have to read from memory and so is slightly (but only
36907 ///    slightly) more expensive than the other shuffle instructions.
36908 ///
36909 /// Because this is inherently a quadratic operation (for each shuffle in
36910 /// a chain, we recurse up the chain), the depth is limited to 8 instructions.
36911 /// This should never be an issue in practice as the shuffle lowering doesn't
36912 /// produce sequences of more than 8 instructions.
36913 ///
36914 /// FIXME: We will currently miss some cases where the redundant shuffling
36915 /// would simplify under the threshold for PSHUFB formation because of
36916 /// combine-ordering. To fix this, we should do the redundant instruction
36917 /// combining in this recursive walk.
36918 static SDValue combineX86ShufflesRecursively(
36919     ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
36920     ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
36921     unsigned MaxDepth, bool HasVariableMask, bool AllowVariableCrossLaneMask,
36922     bool AllowVariablePerLaneMask, SelectionDAG &DAG,
36923     const X86Subtarget &Subtarget) {
36924   assert(RootMask.size() > 0 &&
36925          (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) &&
36926          "Illegal shuffle root mask");
36927   assert(Root.getSimpleValueType().isVector() &&
36928          "Shuffles operate on vector types!");
36929   unsigned RootSizeInBits = Root.getSimpleValueType().getSizeInBits();
36930 
36931   // Bound the depth of our recursive combine because this is ultimately
36932   // quadratic in nature.
36933   if (Depth >= MaxDepth)
36934     return SDValue();
36935 
36936   // Directly rip through bitcasts to find the underlying operand.
36937   SDValue Op = SrcOps[SrcOpIndex];
36938   Op = peekThroughOneUseBitcasts(Op);
36939 
36940   EVT VT = Op.getValueType();
36941   if (!VT.isVector() || !VT.isSimple())
36942     return SDValue(); // Bail if we hit a non-simple non-vector.
36943 
36944   assert((RootSizeInBits % VT.getSizeInBits()) == 0 &&
36945          "Can only combine shuffles upto size of the root op.");
36946 
36947   // Extract target shuffle mask and resolve sentinels and inputs.
36948   // TODO - determine Op's demanded elts from RootMask.
36949   SmallVector<int, 64> OpMask;
36950   SmallVector<SDValue, 2> OpInputs;
36951   APInt OpUndef, OpZero;
36952   APInt OpDemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
36953   bool IsOpVariableMask = isTargetShuffleVariableMask(Op.getOpcode());
36954   if (!getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef,
36955                               OpZero, DAG, Depth, false))
36956     return SDValue();
36957 
36958   // Shuffle inputs must not be larger than the shuffle result.
36959   // TODO: Relax this for single input faux shuffles (trunc/extract_subvector).
36960   if (llvm::any_of(OpInputs, [VT](SDValue OpInput) {
36961         return OpInput.getValueSizeInBits() > VT.getSizeInBits();
36962       }))
36963     return SDValue();
36964 
36965   // If the shuffle result was smaller than the root, we need to adjust the
36966   // mask indices and pad the mask with undefs.
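  // For example, a 128-bit shuffle op feeding a 256-bit root gives
  // NumSubVecs == 2: a two-input v4i32 mask such as [4,1,6,3] is rescaled to
  // [8,1,10,3] (each input is now treated as spanning 8 elements) and then
  // padded with 4 undef sentinels for the upper 128 bits.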
36967   if (RootSizeInBits > VT.getSizeInBits()) {
36968     unsigned NumSubVecs = RootSizeInBits / VT.getSizeInBits();
36969     unsigned OpMaskSize = OpMask.size();
36970     if (OpInputs.size() > 1) {
36971       unsigned PaddedMaskSize = NumSubVecs * OpMaskSize;
36972       for (int &M : OpMask) {
36973         if (M < 0)
36974           continue;
36975         int EltIdx = M % OpMaskSize;
36976         int OpIdx = M / OpMaskSize;
36977         M = (PaddedMaskSize * OpIdx) + EltIdx;
36978       }
36979     }
36980     OpZero = OpZero.zext(NumSubVecs * OpMaskSize);
36981     OpUndef = OpUndef.zext(NumSubVecs * OpMaskSize);
36982     OpMask.append((NumSubVecs - 1) * OpMaskSize, SM_SentinelUndef);
36983   }
36984 
36985   SmallVector<int, 64> Mask;
36986   SmallVector<SDValue, 16> Ops;
36987 
36988   // We don't need to merge masks if the root is empty.
36989   bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1);
36990   if (EmptyRoot) {
36991     // Only resolve zeros if it will remove an input, otherwise we might end
36992     // up in an infinite loop.
36993     bool ResolveKnownZeros = true;
36994     if (!OpZero.isNullValue()) {
36995       APInt UsedInputs = APInt::getNullValue(OpInputs.size());
36996       for (int i = 0, e = OpMask.size(); i != e; ++i) {
36997         int M = OpMask[i];
36998         if (OpUndef[i] || OpZero[i] || isUndefOrZero(M))
36999           continue;
37000         UsedInputs.setBit(M / OpMask.size());
37001         if (UsedInputs.isAllOnesValue()) {
37002           ResolveKnownZeros = false;
37003           break;
37004         }
37005       }
37006     }
37007     resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero,
37008                                       ResolveKnownZeros);
37009 
37010     Mask = OpMask;
37011     Ops.append(OpInputs.begin(), OpInputs.end());
37012   } else {
37013     resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
37014 
37015     // Add the inputs to the Ops list, avoiding duplicates.
37016     Ops.append(SrcOps.begin(), SrcOps.end());
37017 
37018     auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
37019       // Attempt to find an existing match.
37020       SDValue InputBC = peekThroughBitcasts(Input);
37021       for (int i = 0, e = Ops.size(); i < e; ++i)
37022         if (InputBC == peekThroughBitcasts(Ops[i]))
37023           return i;
37024       // Match failed - should we replace an existing Op?
37025       if (InsertionPoint >= 0) {
37026         Ops[InsertionPoint] = Input;
37027         return InsertionPoint;
37028       }
37029       // Add to the end of the Ops list.
37030       Ops.push_back(Input);
37031       return Ops.size() - 1;
37032     };
37033 
37034     SmallVector<int, 2> OpInputIdx;
37035     for (SDValue OpInput : OpInputs)
37036       OpInputIdx.push_back(
37037           AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
37038 
37039     assert(((RootMask.size() > OpMask.size() &&
37040              RootMask.size() % OpMask.size() == 0) ||
37041             (OpMask.size() > RootMask.size() &&
37042              OpMask.size() % RootMask.size() == 0) ||
37043             OpMask.size() == RootMask.size()) &&
37044            "The smaller number of elements must divide the larger.");
37045 
37046     // This function can be performance-critical, so we rely on the power-of-2
37047     // knowledge that we have about the mask sizes to replace div/rem ops with
37048     // bit-masks and shifts.
37049     assert(isPowerOf2_32(RootMask.size()) &&
37050            "Non-power-of-2 shuffle mask sizes");
37051     assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes");
37052     unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size());
37053     unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size());
37054 
37055     unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
37056     unsigned RootRatio =
37057         std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
37058     unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
37059     assert((RootRatio == 1 || OpRatio == 1) &&
37060            "Must not have a ratio for both incoming and op masks!");
37061 
37062     assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
37063     assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
37064     assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
37065     unsigned RootRatioLog2 = countTrailingZeros(RootRatio);
37066     unsigned OpRatioLog2 = countTrailingZeros(OpRatio);
37067 
37068     Mask.resize(MaskWidth, SM_SentinelUndef);
37069 
37070     // Merge this shuffle operation's mask into our accumulated mask. Note that
37071     // this shuffle's mask will be the first applied to the input, followed by
37072     // the root mask to get us all the way to the root value arrangement. The
37073     // reason for this order is that we are recursing up the operation chain.
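    // For example, with a 4-element RootMask and an 8-element OpMask we get
    // RootRatio == 2 and OpRatio == 1: mask element i maps to RootIdx = i / 2
    // and is rescaled to RootMask[RootIdx] * 2 + (i % 2) before being looked
    // up in OpMask.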
37074     for (unsigned i = 0; i < MaskWidth; ++i) {
37075       unsigned RootIdx = i >> RootRatioLog2;
37076       if (RootMask[RootIdx] < 0) {
37077         // This is a zero or undef lane, we're done.
37078         Mask[i] = RootMask[RootIdx];
37079         continue;
37080       }
37081 
37082       unsigned RootMaskedIdx =
37083           RootRatio == 1
37084               ? RootMask[RootIdx]
37085               : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
37086 
37087       // Just insert the scaled root mask value if it references an input other
37088       // than the SrcOp we're currently inserting.
37089       if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
37090           (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
37091         Mask[i] = RootMaskedIdx;
37092         continue;
37093       }
37094 
37095       RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
37096       unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
37097       if (OpMask[OpIdx] < 0) {
37098         // The incoming lanes are zero or undef; it doesn't matter which ones
37099         // we use.
37100         Mask[i] = OpMask[OpIdx];
37101         continue;
37102       }
37103 
37104       // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
37105       unsigned OpMaskedIdx = OpRatio == 1 ? OpMask[OpIdx]
37106                                           : (OpMask[OpIdx] << OpRatioLog2) +
37107                                                 (RootMaskedIdx & (OpRatio - 1));
37108 
37109       OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
37110       int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
37111       assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
37112       OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;
37113 
37114       Mask[i] = OpMaskedIdx;
37115     }
37116   }
37117 
37118   // Remove unused/repeated shuffle source ops.
37119   resolveTargetShuffleInputsAndMask(Ops, Mask);
37120 
37121   // Handle the all undef/zero/ones cases early.
37122   if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
37123     return DAG.getUNDEF(Root.getValueType());
37124   if (all_of(Mask, [](int Idx) { return Idx < 0; }))
37125     return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG,
37126                          SDLoc(Root));
37127   if (Ops.size() == 1 && ISD::isBuildVectorAllOnes(Ops[0].getNode()) &&
37128       none_of(Mask, [](int M) { return M == SM_SentinelZero; }))
37129     return getOnesVector(Root.getValueType(), DAG, SDLoc(Root));
37130 
37131   assert(!Ops.empty() && "Shuffle with no inputs detected");
37132   HasVariableMask |= IsOpVariableMask;
37133 
37134   // Update the list of shuffle nodes that have been combined so far.
37135   SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
37136                                                 SrcNodes.end());
37137   CombinedNodes.push_back(Op.getNode());
37138 
37139   // See if we can recurse into each shuffle source op (if it's a target
37140   // shuffle). The source op should only be combined if it either has a single
37141   // use (i.e. the current Op) or all of its users have already been combined;
37142   // if not, we can still combine but should prevent generation of variable
37143   // shuffles to avoid constant pool bloat.
37144   // Don't recurse if we already have more source ops than we can combine in
37145   // the remaining recursion depth.
37146   if (Ops.size() < (MaxDepth - Depth)) {
37147     for (int i = 0, e = Ops.size(); i < e; ++i) {
37148       // For empty roots, we need to resolve zeroable elements before combining
37149       // them with other shuffles.
37150       SmallVector<int, 64> ResolvedMask = Mask;
37151       if (EmptyRoot)
37152         resolveTargetShuffleFromZeroables(ResolvedMask, OpUndef, OpZero);
37153       bool AllowCrossLaneVar = false;
37154       bool AllowPerLaneVar = false;
37155       if (Ops[i].getNode()->hasOneUse() ||
37156           SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode())) {
37157         AllowCrossLaneVar = AllowVariableCrossLaneMask;
37158         AllowPerLaneVar = AllowVariablePerLaneMask;
37159       }
37160       if (SDValue Res = combineX86ShufflesRecursively(
37161               Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1, MaxDepth,
37162               HasVariableMask, AllowCrossLaneVar, AllowPerLaneVar, DAG,
37163               Subtarget))
37164         return Res;
37165     }
37166   }
37167 
37168   // Attempt to constant fold all of the constant source ops.
37169   if (SDValue Cst = combineX86ShufflesConstants(
37170           Ops, Mask, Root, HasVariableMask, DAG, Subtarget))
37171     return Cst;
37172 
37173   // If constant folding failed and we only have constants, then we have
37174   // multiple uses by a single non-variable shuffle - just bail.
37175   if (Depth == 0 && llvm::all_of(Ops, [&](SDValue Op) {
37176         APInt UndefElts;
37177         SmallVector<APInt> RawBits;
37178         unsigned EltSizeInBits = RootSizeInBits / Mask.size();
37179         return getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
37180                                              RawBits);
37181       })) {
37182     return SDValue();
37183   }
37184 
37185   // Canonicalize the combined shuffle mask chain with horizontal ops.
37186   // NOTE: This will update the Ops and Mask.
37187   if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
37188           Ops, Mask, RootSizeInBits, SDLoc(Root), DAG, Subtarget))
37189     return DAG.getBitcast(Root.getValueType(), HOp);
37190 
37191   // Widen any subvector shuffle inputs we've collected.
37192   if (any_of(Ops, [RootSizeInBits](SDValue Op) {
37193         return Op.getValueSizeInBits() < RootSizeInBits;
37194       })) {
37195     for (SDValue &Op : Ops)
37196       if (Op.getValueSizeInBits() < RootSizeInBits)
37197         Op = widenSubVector(Op, false, Subtarget, DAG, SDLoc(Op),
37198                             RootSizeInBits);
37199     // Reresolve - we might have repeated subvector sources.
37200     resolveTargetShuffleInputsAndMask(Ops, Mask);
37201   }
37202 
37203   // We can only combine unary and binary shuffle mask cases.
37204   if (Ops.size() <= 2) {
37205     // Minor canonicalization of the accumulated shuffle mask to make it easier
37206     // to match below. All this does is detect masks with sequential pairs of
37207     // elements, and shrink them to the half-width mask. It does this in a loop
37208     // so it will reduce the size of the mask to the minimal width mask which
37209     // performs an equivalent shuffle.
37210     while (Mask.size() > 1) {
37211       SmallVector<int, 64> WidenedMask;
37212       if (!canWidenShuffleElements(Mask, WidenedMask))
37213         break;
37214       Mask = std::move(WidenedMask);
37215     }
37216 
37217     // Canonicalization of binary shuffle masks to improve pattern matching by
37218     // commuting the inputs.
37219     if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
37220       ShuffleVectorSDNode::commuteMask(Mask);
37221       std::swap(Ops[0], Ops[1]);
37222     }
37223 
37224     // Finally, try to combine into a single shuffle instruction.
37225     return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask,
37226                                   AllowVariableCrossLaneMask,
37227                                   AllowVariablePerLaneMask, DAG, Subtarget);
37228   }
37229 
37230   // If that failed and any input is extracted then try to combine as a
37231   // shuffle with the larger type.
37232   return combineX86ShuffleChainWithExtract(
37233       Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask,
37234       AllowVariablePerLaneMask, DAG, Subtarget);
37235 }
37236 
37237 /// Helper entry wrapper to combineX86ShufflesRecursively.
37238 static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,
37239                                              const X86Subtarget &Subtarget) {
37240   return combineX86ShufflesRecursively(
37241       {Op}, 0, Op, {0}, {}, /*Depth*/ 0, X86::MaxShuffleCombineDepth,
37242       /*HasVarMask*/ false,
37243       /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, DAG,
37244       Subtarget);
37245 }
37246 
37247 /// Get the PSHUF-style mask from PSHUF node.
37248 ///
37249 /// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
37250 /// PSHUF-style masks that can be reused with such instructions.
37251 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
37252   MVT VT = N.getSimpleValueType();
37253   SmallVector<int, 4> Mask;
37254   SmallVector<SDValue, 2> Ops;
37255   bool HaveMask =
37256       getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask);
37257   (void)HaveMask;
37258   assert(HaveMask);
37259 
37260   // If we have more than 128-bits, only the low 128-bits of shuffle mask
37261   // matter. Check that the upper masks are repeats and remove them.
37262   if (VT.getSizeInBits() > 128) {
37263     int LaneElts = 128 / VT.getScalarSizeInBits();
37264 #ifndef NDEBUG
37265     for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
37266       for (int j = 0; j < LaneElts; ++j)
37267         assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
37268                "Mask doesn't repeat in high 128-bit lanes!");
37269 #endif
37270     Mask.resize(LaneElts);
37271   }
37272 
37273   switch (N.getOpcode()) {
37274   case X86ISD::PSHUFD:
37275     return Mask;
37276   case X86ISD::PSHUFLW:
37277     Mask.resize(4);
37278     return Mask;
37279   case X86ISD::PSHUFHW:
37280     Mask.erase(Mask.begin(), Mask.begin() + 4);
37281     for (int &M : Mask)
37282       M -= 4;
37283     return Mask;
37284   default:
37285     llvm_unreachable("No valid shuffle instruction found!");
37286   }
37287 }
37288 
37289 /// Search for a combinable shuffle across a chain ending in pshufd.
37290 ///
37291 /// We walk up the chain and look for a combinable shuffle, skipping over
37292 /// shuffles that we could hoist this shuffle's transformation past without
37293 /// altering anything.
37294 static SDValue
37295 combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
37296                              SelectionDAG &DAG) {
37297   assert(N.getOpcode() == X86ISD::PSHUFD &&
37298          "Called with something other than an x86 128-bit half shuffle!");
37299   SDLoc DL(N);
37300 
37301   // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
37302   // of the shuffles in the chain so that we can form a fresh chain to replace
37303   // this one.
37304   SmallVector<SDValue, 8> Chain;
37305   SDValue V = N.getOperand(0);
37306   for (; V.hasOneUse(); V = V.getOperand(0)) {
37307     switch (V.getOpcode()) {
37308     default:
37309       return SDValue(); // Nothing combined!
37310 
37311     case ISD::BITCAST:
37312       // Skip bitcasts as we always know the type for the target specific
37313       // instructions.
37314       continue;
37315 
37316     case X86ISD::PSHUFD:
37317       // Found another dword shuffle.
37318       break;
37319 
37320     case X86ISD::PSHUFLW:
37321       // Check that the low words (being shuffled) are the identity in the
37322       // dword shuffle, and the high words are self-contained.
37323       if (Mask[0] != 0 || Mask[1] != 1 ||
37324           !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
37325         return SDValue();
37326 
37327       Chain.push_back(V);
37328       continue;
37329 
37330     case X86ISD::PSHUFHW:
37331       // Check that the high words (being shuffled) are the identity in the
37332       // dword shuffle, and the low words are self-contained.
37333       if (Mask[2] != 2 || Mask[3] != 3 ||
37334           !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
37335         return SDValue();
37336 
37337       Chain.push_back(V);
37338       continue;
37339 
37340     case X86ISD::UNPCKL:
37341     case X86ISD::UNPCKH:
37342       // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
37343       // shuffle into a preceding word shuffle.
37344       if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
37345           V.getSimpleValueType().getVectorElementType() != MVT::i16)
37346         return SDValue();
37347 
37348       // Search for a half-shuffle which we can combine with.
37349       unsigned CombineOp =
37350           V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
37351       if (V.getOperand(0) != V.getOperand(1) ||
37352           !V->isOnlyUserOf(V.getOperand(0).getNode()))
37353         return SDValue();
37354       Chain.push_back(V);
37355       V = V.getOperand(0);
37356       do {
37357         switch (V.getOpcode()) {
37358         default:
37359           return SDValue(); // Nothing to combine.
37360 
37361         case X86ISD::PSHUFLW:
37362         case X86ISD::PSHUFHW:
37363           if (V.getOpcode() == CombineOp)
37364             break;
37365 
37366           Chain.push_back(V);
37367 
37368           LLVM_FALLTHROUGH;
37369         case ISD::BITCAST:
37370           V = V.getOperand(0);
37371           continue;
37372         }
37373         break;
37374       } while (V.hasOneUse());
37375       break;
37376     }
37377     // Break out of the loop if we break out of the switch.
37378     break;
37379   }
37380 
37381   if (!V.hasOneUse())
37382     // We fell out of the loop without finding a viable combining instruction.
37383     return SDValue();
37384 
37385   // Merge this node's mask and our incoming mask.
37386   SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
37387   for (int &M : Mask)
37388     M = VMask[M];
37389   V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
37390                   getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
37391 
37392   // Rebuild the chain around this new shuffle.
37393   while (!Chain.empty()) {
37394     SDValue W = Chain.pop_back_val();
37395 
37396     if (V.getValueType() != W.getOperand(0).getValueType())
37397       V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
37398 
37399     switch (W.getOpcode()) {
37400     default:
37401       llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
37402 
37403     case X86ISD::UNPCKL:
37404     case X86ISD::UNPCKH:
37405       V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
37406       break;
37407 
37408     case X86ISD::PSHUFD:
37409     case X86ISD::PSHUFLW:
37410     case X86ISD::PSHUFHW:
37411       V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
37412       break;
37413     }
37414   }
37415   if (V.getValueType() != N.getValueType())
37416     V = DAG.getBitcast(N.getValueType(), V);
37417 
37418   // Return the new chain to replace N.
37419   return V;
37420 }
37421 
37422 // Attempt to commute shufps LHS loads:
37423 // permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
37424 static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
37425                                       SelectionDAG &DAG) {
37426   // TODO: Add vXf64 support.
37427   if (VT != MVT::v4f32 && VT != MVT::v8f32 && VT != MVT::v16f32)
37428     return SDValue();
37429 
37430   // SHUFP(LHS, RHS) -> SHUFP(RHS, LHS) iff LHS is foldable + RHS is not.
37431   auto commuteSHUFP = [&VT, &DL, &DAG](SDValue Parent, SDValue V) {
37432     if (V.getOpcode() != X86ISD::SHUFP || !Parent->isOnlyUserOf(V.getNode()))
37433       return SDValue();
37434     SDValue N0 = V.getOperand(0);
37435     SDValue N1 = V.getOperand(1);
37436     unsigned Imm = V.getConstantOperandVal(2);
37437     if (!MayFoldLoad(peekThroughOneUseBitcasts(N0)) ||
37438         MayFoldLoad(peekThroughOneUseBitcasts(N1)))
37439       return SDValue();
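    // Swapping the SHUFP operands exchanges the low and high halves of its
    // result, so swap the two nibbles of the immediate here; the callers below
    // compensate by flipping the affected selector bits (the ^0xAA, ^0x0A and
    // ^0xA0 adjustments).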
37440     Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
37441     return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
37442                        DAG.getTargetConstant(Imm, DL, MVT::i8));
37443   };
37444 
37445   switch (N.getOpcode()) {
37446   case X86ISD::VPERMILPI:
37447     if (SDValue NewSHUFP = commuteSHUFP(N, N.getOperand(0))) {
37448       unsigned Imm = N.getConstantOperandVal(1);
37449       return DAG.getNode(X86ISD::VPERMILPI, DL, VT, NewSHUFP,
37450                          DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
37451     }
37452     break;
37453   case X86ISD::SHUFP: {
37454     SDValue N0 = N.getOperand(0);
37455     SDValue N1 = N.getOperand(1);
37456     unsigned Imm = N.getConstantOperandVal(2);
37457     if (N0 == N1) {
37458       if (SDValue NewSHUFP = commuteSHUFP(N, N0))
37459         return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, NewSHUFP,
37460                            DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
37461     } else if (SDValue NewSHUFP = commuteSHUFP(N, N0)) {
37462       return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, N1,
37463                          DAG.getTargetConstant(Imm ^ 0x0A, DL, MVT::i8));
37464     } else if (SDValue NewSHUFP = commuteSHUFP(N, N1)) {
37465       return DAG.getNode(X86ISD::SHUFP, DL, VT, N0, NewSHUFP,
37466                          DAG.getTargetConstant(Imm ^ 0xA0, DL, MVT::i8));
37467     }
37468     break;
37469   }
37470   }
37471 
37472   return SDValue();
37473 }
37474 
37475 // Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
37476 static SDValue canonicalizeShuffleWithBinOps(SDValue N, SelectionDAG &DAG,
37477                                              const SDLoc &DL) {
37478   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
37479   EVT ShuffleVT = N.getValueType();
37480 
37481   auto IsMergeableWithShuffle = [](SDValue Op) {
37482     // AllZeros/AllOnes constants are freely shuffled and will peek through
37483     // bitcasts. Other constant build vectors do not peek through bitcasts. Only
37484     // merge with target shuffles if it has one use so shuffle combining is
37485     // likely to kick in.
37486     return ISD::isBuildVectorAllOnes(Op.getNode()) ||
37487            ISD::isBuildVectorAllZeros(Op.getNode()) ||
37488            ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) ||
37489            ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode()) ||
37490            (isTargetShuffle(Op.getOpcode()) && Op->hasOneUse());
37491   };
37492   auto IsSafeToMoveShuffle = [ShuffleVT](SDValue Op, unsigned BinOp) {
37493     // Ensure we only shuffle whole vector src elements, unless it's a logical
37494     // binop where we can more aggressively move shuffles from dst to src.
37495     return BinOp == ISD::AND || BinOp == ISD::OR || BinOp == ISD::XOR ||
37496            (Op.getScalarValueSizeInBits() <= ShuffleVT.getScalarSizeInBits());
37497   };
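  // e.g. PSHUFD(ADD(X,C)) -> ADD(PSHUFD(X),PSHUFD(C)) when C is a constant
  // build vector: shuffling the constant is free, so the shuffle moves closer
  // to its sources without increasing the total number of shuffles.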
37498 
37499   unsigned Opc = N.getOpcode();
37500   switch (Opc) {
37501   // Unary and Unary+Permute Shuffles.
37502   case X86ISD::PSHUFB: {
37503     // Don't merge PSHUFB if it contains zero'd elements.
37504     SmallVector<int> Mask;
37505     SmallVector<SDValue> Ops;
37506     if (!getTargetShuffleMask(N.getNode(), ShuffleVT.getSimpleVT(), false, Ops,
37507                               Mask))
37508       break;
37509     LLVM_FALLTHROUGH;
37510   }
37511   case X86ISD::VBROADCAST:
37512   case X86ISD::MOVDDUP:
37513   case X86ISD::PSHUFD:
37514   case X86ISD::VPERMI:
37515   case X86ISD::VPERMILPI: {
37516     if (N.getOperand(0).getValueType() == ShuffleVT &&
37517         N->isOnlyUserOf(N.getOperand(0).getNode())) {
37518       SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
37519       unsigned SrcOpcode = N0.getOpcode();
37520       if (TLI.isBinOp(SrcOpcode) && IsSafeToMoveShuffle(N0, SrcOpcode)) {
37521         SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
37522         SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
37523         if (IsMergeableWithShuffle(Op00) || IsMergeableWithShuffle(Op01)) {
37524           SDValue LHS, RHS;
37525           Op00 = DAG.getBitcast(ShuffleVT, Op00);
37526           Op01 = DAG.getBitcast(ShuffleVT, Op01);
37527           if (N.getNumOperands() == 2) {
37528             LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, N.getOperand(1));
37529             RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, N.getOperand(1));
37530           } else {
37531             LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00);
37532             RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01);
37533           }
37534           EVT OpVT = N0.getValueType();
37535           return DAG.getBitcast(ShuffleVT,
37536                                 DAG.getNode(SrcOpcode, DL, OpVT,
37537                                             DAG.getBitcast(OpVT, LHS),
37538                                             DAG.getBitcast(OpVT, RHS)));
37539         }
37540       }
37541     }
37542     break;
37543   }
37544   // Binary and Binary+Permute Shuffles.
37545   case X86ISD::INSERTPS: {
37546     // Don't merge INSERTPS if it contains zero'd elements.
37547     unsigned InsertPSMask = N.getConstantOperandVal(2);
37548     unsigned ZeroMask = InsertPSMask & 0xF;
37549     if (ZeroMask != 0)
37550       break;
37551     LLVM_FALLTHROUGH;
37552   }
37553   case X86ISD::MOVSD:
37554   case X86ISD::MOVSS:
37555   case X86ISD::BLENDI:
37556   case X86ISD::SHUFP:
37557   case X86ISD::UNPCKH:
37558   case X86ISD::UNPCKL: {
37559     if (N->isOnlyUserOf(N.getOperand(0).getNode()) &&
37560         N->isOnlyUserOf(N.getOperand(1).getNode())) {
37561       SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
37562       SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1));
37563       unsigned SrcOpcode = N0.getOpcode();
37564       if (TLI.isBinOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
37565           IsSafeToMoveShuffle(N0, SrcOpcode) &&
37566           IsSafeToMoveShuffle(N1, SrcOpcode)) {
37567         SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
37568         SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0));
37569         SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
37570         SDValue Op11 = peekThroughOneUseBitcasts(N1.getOperand(1));
37571         // Ensure the total number of shuffles doesn't increase by folding this
37572         // shuffle through to the source ops.
37573         if (((IsMergeableWithShuffle(Op00) && IsMergeableWithShuffle(Op10)) ||
37574              (IsMergeableWithShuffle(Op01) && IsMergeableWithShuffle(Op11))) ||
37575             ((IsMergeableWithShuffle(Op00) || IsMergeableWithShuffle(Op10)) &&
37576              (IsMergeableWithShuffle(Op01) || IsMergeableWithShuffle(Op11)))) {
37577           SDValue LHS, RHS;
37578           Op00 = DAG.getBitcast(ShuffleVT, Op00);
37579           Op10 = DAG.getBitcast(ShuffleVT, Op10);
37580           Op01 = DAG.getBitcast(ShuffleVT, Op01);
37581           Op11 = DAG.getBitcast(ShuffleVT, Op11);
37582           if (N.getNumOperands() == 3) {
37583             LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
37584             RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11, N.getOperand(2));
37585           } else {
37586             LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
37587             RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11);
37588           }
37589           EVT OpVT = N0.getValueType();
37590           return DAG.getBitcast(ShuffleVT,
37591                                 DAG.getNode(SrcOpcode, DL, OpVT,
37592                                             DAG.getBitcast(OpVT, LHS),
37593                                             DAG.getBitcast(OpVT, RHS)));
37594         }
37595       }
37596     }
37597     break;
37598   }
37599   }
37600   return SDValue();
37601 }
37602 
37603 /// Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
37604 static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
37605                                                       SelectionDAG &DAG,
37606                                                       const SDLoc &DL) {
37607   assert(V.getOpcode() == X86ISD::VPERM2X128 && "Unknown lane shuffle");
37608 
37609   MVT VT = V.getSimpleValueType();
37610   SDValue Src0 = peekThroughBitcasts(V.getOperand(0));
37611   SDValue Src1 = peekThroughBitcasts(V.getOperand(1));
37612   unsigned SrcOpc0 = Src0.getOpcode();
37613   unsigned SrcOpc1 = Src1.getOpcode();
37614   EVT SrcVT0 = Src0.getValueType();
37615   EVT SrcVT1 = Src1.getValueType();
37616 
37617   if (!Src1.isUndef() && (SrcVT0 != SrcVT1 || SrcOpc0 != SrcOpc1))
37618     return SDValue();
37619 
37620   switch (SrcOpc0) {
37621   case X86ISD::MOVDDUP: {
37622     SDValue LHS = Src0.getOperand(0);
37623     SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
37624     SDValue Res =
37625         DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS, V.getOperand(2));
37626     Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res);
37627     return DAG.getBitcast(VT, Res);
37628   }
37629   case X86ISD::VPERMILPI:
37630     // TODO: Handle v4f64 permutes with different low/high lane masks.
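    // For v4f64 the VPERMILPI immediate holds one selection bit per element,
    // i.e. a separate 2-bit field per 128-bit lane, so the fold is only safe
    // when both lanes use the same field and the permute commutes with the
    // lane shuffle.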
37631     if (SrcVT0 == MVT::v4f64) {
37632       uint64_t Mask = Src0.getConstantOperandVal(1);
37633       if ((Mask & 0x3) != ((Mask >> 2) & 0x3))
37634         break;
37635     }
37636     LLVM_FALLTHROUGH;
37637   case X86ISD::VSHLI:
37638   case X86ISD::VSRLI:
37639   case X86ISD::VSRAI:
37640   case X86ISD::PSHUFD:
37641     if (Src1.isUndef() || Src0.getOperand(1) == Src1.getOperand(1)) {
37642       SDValue LHS = Src0.getOperand(0);
37643       SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
37644       SDValue Res = DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS,
37645                                 V.getOperand(2));
37646       Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res, Src0.getOperand(1));
37647       return DAG.getBitcast(VT, Res);
37648     }
37649     break;
37650   }
37651 
37652   return SDValue();
37653 }
37654 
37655 /// Try to combine x86 target specific shuffles.
37656 static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
37657                                     TargetLowering::DAGCombinerInfo &DCI,
37658                                     const X86Subtarget &Subtarget) {
37659   SDLoc DL(N);
37660   MVT VT = N.getSimpleValueType();
37661   SmallVector<int, 4> Mask;
37662   unsigned Opcode = N.getOpcode();
37663 
37664   if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
37665     return R;
37666 
37667   if (SDValue R = canonicalizeShuffleWithBinOps(N, DAG, DL))
37668     return R;
37669 
37670   // Handle specific target shuffles.
37671   switch (Opcode) {
37672   case X86ISD::MOVDDUP: {
37673     SDValue Src = N.getOperand(0);
37674     // Turn a 128-bit MOVDDUP of a full vector load into movddup+vzload.
37675     if (VT == MVT::v2f64 && Src.hasOneUse() &&
37676         ISD::isNormalLoad(Src.getNode())) {
37677       LoadSDNode *LN = cast<LoadSDNode>(Src);
37678       if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::f64, MVT::v2f64, DAG)) {
37679         SDValue Movddup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, VZLoad);
37680         DCI.CombineTo(N.getNode(), Movddup);
37681         DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
37682         DCI.recursivelyDeleteUnusedNodes(LN);
37683         return N; // Return N so it doesn't get rechecked!
37684       }
37685     }
37686 
37687     return SDValue();
37688   }
37689   case X86ISD::VBROADCAST: {
37690     SDValue Src = N.getOperand(0);
37691     SDValue BC = peekThroughBitcasts(Src);
37692     EVT SrcVT = Src.getValueType();
37693     EVT BCVT = BC.getValueType();
37694 
37695     // If broadcasting from another shuffle, attempt to simplify it.
37696     // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
37697     if (isTargetShuffle(BC.getOpcode()) &&
37698         VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
37699       unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
37700       SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
37701                                         SM_SentinelUndef);
37702       for (unsigned i = 0; i != Scale; ++i)
37703         DemandedMask[i] = i;
37704       if (SDValue Res = combineX86ShufflesRecursively(
37705               {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 0,
37706               X86::MaxShuffleCombineDepth,
37707               /*HasVarMask*/ false, /*AllowCrossLaneVarMask*/ true,
37708               /*AllowPerLaneVarMask*/ true, DAG, Subtarget))
37709         return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
37710                            DAG.getBitcast(SrcVT, Res));
37711     }
37712 
37713     // broadcast(bitcast(src)) -> bitcast(broadcast(src))
37714     // 32-bit targets have to bitcast i64 to f64, so better to bitcast upward.
37715     if (Src.getOpcode() == ISD::BITCAST &&
37716         SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits() &&
37717         DAG.getTargetLoweringInfo().isTypeLegal(BCVT) &&
37718         FixedVectorType::isValidElementType(
37719             BCVT.getScalarType().getTypeForEVT(*DAG.getContext()))) {
37720       EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(),
37721                                    VT.getVectorNumElements());
37722       return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
37723     }
37724 
37725     // Reduce broadcast source vector to lowest 128-bits.
37726     if (SrcVT.getSizeInBits() > 128)
37727       return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
37728                          extract128BitVector(Src, 0, DAG, DL));
37729 
37730     // broadcast(scalar_to_vector(x)) -> broadcast(x).
37731     if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR)
37732       return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
37733 
37734     // Share broadcast with the longest vector and extract low subvector (free).
37735     // Ensure the same SDValue from the SDNode use is being used.
37736     for (SDNode *User : Src->uses())
37737       if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&
37738           Src == User->getOperand(0) &&
37739           User->getValueSizeInBits(0).getFixedSize() >
37740               VT.getFixedSizeInBits()) {
37741         return extractSubVector(SDValue(User, 0), 0, DAG, DL,
37742                                 VT.getSizeInBits());
37743       }
37744 
37745     // vbroadcast(scalarload X) -> vbroadcast_load X
37746     // For float loads, extract other uses of the scalar from the broadcast.
37747     if (!SrcVT.isVector() && (Src.hasOneUse() || VT.isFloatingPoint()) &&
37748         ISD::isNormalLoad(Src.getNode())) {
37749       LoadSDNode *LN = cast<LoadSDNode>(Src);
37750       SDVTList Tys = DAG.getVTList(VT, MVT::Other);
37751       SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
37752       SDValue BcastLd =
37753           DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
37754                                   LN->getMemoryVT(), LN->getMemOperand());
37755       // If the load value is used only by N, replace it via CombineTo N.
37756       bool NoReplaceExtract = Src.hasOneUse();
37757       DCI.CombineTo(N.getNode(), BcastLd);
37758       if (NoReplaceExtract) {
37759         DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
37760         DCI.recursivelyDeleteUnusedNodes(LN);
37761       } else {
37762         SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcVT, BcastLd,
37763                                   DAG.getIntPtrConstant(0, DL));
37764         DCI.CombineTo(LN, Scl, BcastLd.getValue(1));
37765       }
37766       return N; // Return N so it doesn't get rechecked!
37767     }
37768 
37769     // Due to isTypeDesirableForOp, we won't always shrink a load truncated to
37770     // i16. So shrink it ourselves if we can make a broadcast_load.
37771     if (SrcVT == MVT::i16 && Src.getOpcode() == ISD::TRUNCATE &&
37772         Src.hasOneUse() && Src.getOperand(0).hasOneUse()) {
37773       assert(Subtarget.hasAVX2() && "Expected AVX2");
37774       SDValue TruncIn = Src.getOperand(0);
37775 
37776       // If this is a truncate of a non extending load we can just narrow it to
37777       // use a broadcast_load.
37778       if (ISD::isNormalLoad(TruncIn.getNode())) {
37779         LoadSDNode *LN = cast<LoadSDNode>(TruncIn);
37780         // Unless it's volatile or atomic.
37781         if (LN->isSimple()) {
37782           SDVTList Tys = DAG.getVTList(VT, MVT::Other);
37783           SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
37784           SDValue BcastLd = DAG.getMemIntrinsicNode(
37785               X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
37786               LN->getPointerInfo(), LN->getOriginalAlign(),
37787               LN->getMemOperand()->getFlags());
37788           DCI.CombineTo(N.getNode(), BcastLd);
37789           DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
37790           DCI.recursivelyDeleteUnusedNodes(Src.getNode());
37791           return N; // Return N so it doesn't get rechecked!
37792         }
37793       }
37794 
37795       // If this is a truncate of an i16 extload, we can directly replace it.
37796       if (ISD::isUNINDEXEDLoad(Src.getOperand(0).getNode()) &&
37797           ISD::isEXTLoad(Src.getOperand(0).getNode())) {
37798         LoadSDNode *LN = cast<LoadSDNode>(Src.getOperand(0));
37799         if (LN->getMemoryVT().getSizeInBits() == 16) {
37800           SDVTList Tys = DAG.getVTList(VT, MVT::Other);
37801           SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
37802           SDValue BcastLd =
37803               DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
37804                                       LN->getMemoryVT(), LN->getMemOperand());
37805           DCI.CombineTo(N.getNode(), BcastLd);
37806           DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
37807           DCI.recursivelyDeleteUnusedNodes(Src.getNode());
37808           return N; // Return N so it doesn't get rechecked!
37809         }
37810       }
37811 
37812       // If this is a truncate of a load that has been shifted right, we can
37813       // offset the pointer and use a narrower load.
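      // e.g. (trunc (srl (i32 load p), 16)) can instead load the i16 at byte
      // offset 2, which the broadcast_load below then splats.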
37814       if (TruncIn.getOpcode() == ISD::SRL &&
37815           TruncIn.getOperand(0).hasOneUse() &&
37816           isa<ConstantSDNode>(TruncIn.getOperand(1)) &&
37817           ISD::isNormalLoad(TruncIn.getOperand(0).getNode())) {
37818         LoadSDNode *LN = cast<LoadSDNode>(TruncIn.getOperand(0));
37819         unsigned ShiftAmt = TruncIn.getConstantOperandVal(1);
37820         // Make sure the shift amount and the load size are divisible by 16.
37821         // Don't do this if the load is volatile or atomic.
37822         if (ShiftAmt % 16 == 0 && TruncIn.getValueSizeInBits() % 16 == 0 &&
37823             LN->isSimple()) {
37824           unsigned Offset = ShiftAmt / 8;
37825           SDVTList Tys = DAG.getVTList(VT, MVT::Other);
37826           SDValue Ptr = DAG.getMemBasePlusOffset(LN->getBasePtr(),
37827                                                  TypeSize::Fixed(Offset), DL);
37828           SDValue Ops[] = { LN->getChain(), Ptr };
37829           SDValue BcastLd = DAG.getMemIntrinsicNode(
37830               X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
37831               LN->getPointerInfo().getWithOffset(Offset),
37832               LN->getOriginalAlign(),
37833               LN->getMemOperand()->getFlags());
37834           DCI.CombineTo(N.getNode(), BcastLd);
37835           DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
37836           DCI.recursivelyDeleteUnusedNodes(Src.getNode());
37837           return N; // Return N so it doesn't get rechecked!
37838         }
37839       }
37840     }
37841 
37842     // vbroadcast(vzload X) -> vbroadcast_load X
37843     if (Src.getOpcode() == X86ISD::VZEXT_LOAD && Src.hasOneUse()) {
37844       MemSDNode *LN = cast<MemIntrinsicSDNode>(Src);
37845       if (LN->getMemoryVT().getSizeInBits() == VT.getScalarSizeInBits()) {
37846         SDVTList Tys = DAG.getVTList(VT, MVT::Other);
37847         SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
37848         SDValue BcastLd =
37849             DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
37850                                     LN->getMemoryVT(), LN->getMemOperand());
37851         DCI.CombineTo(N.getNode(), BcastLd);
37852         DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
37853         DCI.recursivelyDeleteUnusedNodes(LN);
37854         return N; // Return N so it doesn't get rechecked!
37855       }
37856     }
37857 
37858     // vbroadcast(vector load X) -> vbroadcast_load
37859     if ((SrcVT == MVT::v2f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v2i64 ||
37860          SrcVT == MVT::v4i32) &&
37861         Src.hasOneUse() && ISD::isNormalLoad(Src.getNode())) {
37862       LoadSDNode *LN = cast<LoadSDNode>(Src);
37863       // Unless the load is volatile or atomic.
37864       if (LN->isSimple()) {
37865         SDVTList Tys = DAG.getVTList(VT, MVT::Other);
37866         SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
37867         SDValue BcastLd = DAG.getMemIntrinsicNode(
37868             X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SrcVT.getScalarType(),
37869             LN->getPointerInfo(), LN->getOriginalAlign(),
37870             LN->getMemOperand()->getFlags());
37871         DCI.CombineTo(N.getNode(), BcastLd);
37872         DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
37873         DCI.recursivelyDeleteUnusedNodes(LN);
37874         return N; // Return N so it doesn't get rechecked!
37875       }
37876     }
37877 
37878     return SDValue();
37879   }
37880   case X86ISD::VZEXT_MOVL: {
37881     SDValue N0 = N.getOperand(0);
37882 
37883     // If this is a vzmovl of a full vector load, replace it with a vzload, unless
37884     // the load is volatile.
37885     if (N0.hasOneUse() && ISD::isNormalLoad(N0.getNode())) {
37886       auto *LN = cast<LoadSDNode>(N0);
37887       if (SDValue VZLoad =
37888               narrowLoadToVZLoad(LN, VT.getVectorElementType(), VT, DAG)) {
37889         DCI.CombineTo(N.getNode(), VZLoad);
37890         DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
37891         DCI.recursivelyDeleteUnusedNodes(LN);
37892         return N;
37893       }
37894     }
37895 
37896     // If this is a VZEXT_MOVL of a VBROADCAST_LOAD, we don't need the broadcast
37897     // and can just use a VZEXT_LOAD.
37898     // FIXME: Is there some way to do this with SimplifyDemandedVectorElts?
37899     if (N0.hasOneUse() && N0.getOpcode() == X86ISD::VBROADCAST_LOAD) {
37900       auto *LN = cast<MemSDNode>(N0);
37901       if (VT.getScalarSizeInBits() == LN->getMemoryVT().getSizeInBits()) {
37902         SDVTList Tys = DAG.getVTList(VT, MVT::Other);
37903         SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
37904         SDValue VZLoad =
37905             DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops,
37906                                     LN->getMemoryVT(), LN->getMemOperand());
37907         DCI.CombineTo(N.getNode(), VZLoad);
37908         DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
37909         DCI.recursivelyDeleteUnusedNodes(LN);
37910         return N;
37911       }
37912     }
37913 
37914     // Turn (v2i64 (vzext_movl (scalar_to_vector (i64 X)))) into
37915     // (v2i64 (bitcast (v4i32 (vzext_movl (scalar_to_vector (i32 (trunc X)))))))
37916     // if the upper bits of the i64 are zero.
37917     if (N0.hasOneUse() && N0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
37918         N0.getOperand(0).hasOneUse() &&
37919         N0.getOperand(0).getValueType() == MVT::i64) {
37920       SDValue In = N0.getOperand(0);
37921       APInt Mask = APInt::getHighBitsSet(64, 32);
37922       if (DAG.MaskedValueIsZero(In, Mask)) {
37923         SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, In);
37924         MVT VecVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
37925         SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Trunc);
37926         SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, SclVec);
37927         return DAG.getBitcast(VT, Movl);
37928       }
37929     }
37930 
37931     // Load a scalar integer constant directly to XMM instead of transferring an
37932     // immediate value from GPR.
37933     // vzext_movl (scalar_to_vector C) --> load [C,0...]
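    // e.g. (v4i32 (vzext_movl (scalar_to_vector (i32 42)))) becomes a 16-byte
    // constant-pool load of <42, 0, 0, 0>.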
37934     if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR) {
37935       if (auto *C = dyn_cast<ConstantSDNode>(N0.getOperand(0))) {
37936         // Create a vector constant - scalar constant followed by zeros.
37937         EVT ScalarVT = N0.getOperand(0).getValueType();
37938         Type *ScalarTy = ScalarVT.getTypeForEVT(*DAG.getContext());
37939         unsigned NumElts = VT.getVectorNumElements();
37940         Constant *Zero = ConstantInt::getNullValue(ScalarTy);
37941         SmallVector<Constant *, 32> ConstantVec(NumElts, Zero);
37942         ConstantVec[0] = const_cast<ConstantInt *>(C->getConstantIntValue());
37943 
37944         // Load the vector constant from constant pool.
37945         MVT PVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
37946         SDValue CP = DAG.getConstantPool(ConstantVector::get(ConstantVec), PVT);
37947         MachinePointerInfo MPI =
37948             MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
37949         Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
37950         return DAG.getLoad(VT, DL, DAG.getEntryNode(), CP, MPI, Alignment,
37951                            MachineMemOperand::MOLoad);
37952       }
37953     }
37954 
37955     // Pull subvector inserts into undef through VZEXT_MOVL by making it an
37956     // insert into a zero vector. This helps get VZEXT_MOVL closer to
37957     // scalar_to_vectors where 256/512 are canonicalized to an insert and a
37958     // 128-bit scalar_to_vector. This reduces the number of isel patterns.
37959     if (!DCI.isBeforeLegalizeOps() && N0.hasOneUse()) {
37960       SDValue V = peekThroughOneUseBitcasts(N0);
37961 
37962       if (V.getOpcode() == ISD::INSERT_SUBVECTOR && V.getOperand(0).isUndef() &&
37963           isNullConstant(V.getOperand(2))) {
37964         SDValue In = V.getOperand(1);
37965         MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
37966                                      In.getValueSizeInBits() /
37967                                          VT.getScalarSizeInBits());
37968         In = DAG.getBitcast(SubVT, In);
37969         SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, SubVT, In);
37970         return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
37971                            getZeroVector(VT, Subtarget, DAG, DL), Movl,
37972                            V.getOperand(2));
37973       }
37974     }
37975 
37976     return SDValue();
37977   }
37978   case X86ISD::BLENDI: {
37979     SDValue N0 = N.getOperand(0);
37980     SDValue N1 = N.getOperand(1);
37981 
37982     // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
37983     // TODO: Handle MVT::v16i16 repeated blend mask.
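    // e.g. a v2f64 blend with mask 0b01 of bitcast v4f32 operands becomes a
    // v4f32 blend with each mask bit repeated per 32-bit half, i.e. 0b0011.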
37984     if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
37985         N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
37986       MVT SrcVT = N0.getOperand(0).getSimpleValueType();
37987       if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0 &&
37988           SrcVT.getScalarSizeInBits() >= 32) {
37989         unsigned BlendMask = N.getConstantOperandVal(2);
37990         unsigned Size = VT.getVectorNumElements();
37991         unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
37992         BlendMask = scaleVectorShuffleBlendMask(BlendMask, Size, Scale);
37993         return DAG.getBitcast(
37994             VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
37995                             N1.getOperand(0),
37996                             DAG.getTargetConstant(BlendMask, DL, MVT::i8)));
37997       }
37998     }
37999     return SDValue();
38000   }
38001   case X86ISD::VPERMI: {
38002     // vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
38003     // TODO: Remove when we have preferred domains in combineX86ShuffleChain.
38004     SDValue N0 = N.getOperand(0);
38005     SDValue N1 = N.getOperand(1);
38006     unsigned EltSizeInBits = VT.getScalarSizeInBits();
38007     if (N0.getOpcode() == ISD::BITCAST &&
38008         N0.getOperand(0).getScalarValueSizeInBits() == EltSizeInBits) {
38009       SDValue Src = N0.getOperand(0);
38010       EVT SrcVT = Src.getValueType();
38011       SDValue Res = DAG.getNode(X86ISD::VPERMI, DL, SrcVT, Src, N1);
38012       return DAG.getBitcast(VT, Res);
38013     }
38014     return SDValue();
38015   }
38016   case X86ISD::VPERM2X128: {
38017     // Fold vperm2x128(bitcast(x),bitcast(y),c) -> bitcast(vperm2x128(x,y,c)).
38018     SDValue LHS = N->getOperand(0);
38019     SDValue RHS = N->getOperand(1);
38020     if (LHS.getOpcode() == ISD::BITCAST &&
38021         (RHS.getOpcode() == ISD::BITCAST || RHS.isUndef())) {
38022       EVT SrcVT = LHS.getOperand(0).getValueType();
38023       if (RHS.isUndef() || SrcVT == RHS.getOperand(0).getValueType()) {
38024         return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT,
38025                                               DAG.getBitcast(SrcVT, LHS),
38026                                               DAG.getBitcast(SrcVT, RHS),
38027                                               N->getOperand(2)));
38028       }
38029     }
38030 
38031     // Fold vperm2x128(op(),op()) -> op(vperm2x128(),vperm2x128()).
38032     if (SDValue Res = canonicalizeLaneShuffleWithRepeatedOps(N, DAG, DL))
38033       return Res;
38034 
38035     // Fold vperm2x128 subvector shuffle with an inner concat pattern.
38036     // vperm2x128(concat(X,Y),concat(Z,W)) --> concat X,Y etc.
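    // Each 4-bit field of the vperm2x128 immediate selects a 128-bit half:
    // values 0/1 pick the low/high half of the first source, 2/3 of the
    // second, and setting bit 3 zeroes the half (hence the Idx > 3 bail-out
    // in FindSubVector128).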
38037     auto FindSubVector128 = [&](unsigned Idx) {
38038       if (Idx > 3)
38039         return SDValue();
38040       SDValue Src = peekThroughBitcasts(N.getOperand(Idx < 2 ? 0 : 1));
38041       SmallVector<SDValue> SubOps;
38042       if (collectConcatOps(Src.getNode(), SubOps) && SubOps.size() == 2)
38043         return SubOps[Idx & 1];
38044       unsigned NumElts = Src.getValueType().getVectorNumElements();
38045       if ((Idx & 1) == 1 && Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
38046           Src.getOperand(1).getValueSizeInBits() == 128 &&
38047           Src.getConstantOperandAPInt(2) == (NumElts / 2)) {
38048         return Src.getOperand(1);
38049       }
38050       return SDValue();
38051     };
38052     unsigned Imm = N.getConstantOperandVal(2);
38053     if (SDValue SubLo = FindSubVector128(Imm & 0x0F)) {
38054       if (SDValue SubHi = FindSubVector128((Imm & 0xF0) >> 4)) {
38055         MVT SubVT = VT.getHalfNumVectorElementsVT();
38056         SubLo = DAG.getBitcast(SubVT, SubLo);
38057         SubHi = DAG.getBitcast(SubVT, SubHi);
38058         return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, SubLo, SubHi);
38059       }
38060     }
38061     return SDValue();
38062   }
38063   case X86ISD::PSHUFD:
38064   case X86ISD::PSHUFLW:
38065   case X86ISD::PSHUFHW:
38066     Mask = getPSHUFShuffleMask(N);
38067     assert(Mask.size() == 4);
38068     break;
38069   case X86ISD::MOVSD:
38070   case X86ISD::MOVSS: {
38071     SDValue N0 = N.getOperand(0);
38072     SDValue N1 = N.getOperand(1);
38073 
38074     // Canonicalize scalar FPOps:
38075     // MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))
38076     // If commutable, allow OP(N1[0], N0[0]).
38077     unsigned Opcode1 = N1.getOpcode();
38078     if (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL || Opcode1 == ISD::FSUB ||
38079         Opcode1 == ISD::FDIV) {
38080       SDValue N10 = N1.getOperand(0);
38081       SDValue N11 = N1.getOperand(1);
38082       if (N10 == N0 ||
38083           (N11 == N0 && (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL))) {
38084         if (N10 != N0)
38085           std::swap(N10, N11);
38086         MVT SVT = VT.getVectorElementType();
38087         SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
38088         N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx);
38089         N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx);
38090         SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11);
38091         SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
38092         return DAG.getNode(Opcode, DL, VT, N0, SclVec);
38093       }
38094     }
38095 
38096     return SDValue();
38097   }
38098   case X86ISD::INSERTPS: {
38099     assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
38100     SDValue Op0 = N.getOperand(0);
38101     SDValue Op1 = N.getOperand(1);
38102     unsigned InsertPSMask = N.getConstantOperandVal(2);
38103     unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
38104     unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
38105     unsigned ZeroMask = InsertPSMask & 0xF;
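          // INSERTPS imm8 layout: bits [7:6] pick the source element of Op1,
          // bits [5:4] pick the destination slot in Op0, and bits [3:0] zero
          // individual elements of the result.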
38106 
38107     // If we zero out all elements from Op0 then we don't need to reference it.
38108     if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
38109       return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
38110                          DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
38111 
38112     // If we zero out the element from Op1 then we don't need to reference it.
38113     if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
38114       return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
38115                          DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
38116 
38117     // Attempt to merge insertps Op1 with an inner target shuffle node.
38118     SmallVector<int, 8> TargetMask1;
38119     SmallVector<SDValue, 2> Ops1;
38120     APInt KnownUndef1, KnownZero1;
38121     if (getTargetShuffleAndZeroables(Op1, TargetMask1, Ops1, KnownUndef1,
38122                                      KnownZero1)) {
38123       if (KnownUndef1[SrcIdx] || KnownZero1[SrcIdx]) {
38124         // Zero/UNDEF insertion - zero out element and remove dependency.
38125         InsertPSMask |= (1u << DstIdx);
38126         return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
38127                            DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
38128       }
38129       // Update insertps mask srcidx and reference the source input directly.
38130       int M = TargetMask1[SrcIdx];
38131       assert(0 <= M && M < 8 && "Shuffle index out of range");
38132       InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
38133       Op1 = Ops1[M < 4 ? 0 : 1];
38134       return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
38135                          DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
38136     }
38137 
38138     // Attempt to merge insertps Op0 with an inner target shuffle node.
38139     SmallVector<int, 8> TargetMask0;
38140     SmallVector<SDValue, 2> Ops0;
38141     APInt KnownUndef0, KnownZero0;
38142     if (getTargetShuffleAndZeroables(Op0, TargetMask0, Ops0, KnownUndef0,
38143                                      KnownZero0)) {
38144       bool Updated = false;
38145       bool UseInput00 = false;
38146       bool UseInput01 = false;
38147       for (int i = 0; i != 4; ++i) {
38148         if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
38149           // No change if element is already zero or the inserted element.
38150           continue;
38151         } else if (KnownUndef0[i] || KnownZero0[i]) {
38152           // If the target mask is undef/zero then we must zero the element.
38153           InsertPSMask |= (1u << i);
38154           Updated = true;
38155           continue;
38156         }
38157 
38158         // The input element must stay at the same index in either source.
38159         int M = TargetMask0[i];
38160         if (M != i && M != (i + 4))
38161           return SDValue();
38162 
38163         // Determine which inputs of the target shuffle we're using.
38164         UseInput00 |= (0 <= M && M < 4);
38165         UseInput01 |= (4 <= M);
38166       }
38167 
38168       // If we're not using both inputs of the target shuffle then use the
38169       // referenced input directly.
38170       if (UseInput00 && !UseInput01) {
38171         Updated = true;
38172         Op0 = Ops0[0];
38173       } else if (!UseInput00 && UseInput01) {
38174         Updated = true;
38175         Op0 = Ops0[1];
38176       }
38177 
38178       if (Updated)
38179         return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
38180                            DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
38181     }
38182 
38183     // If we're inserting an element from a vbroadcast load, fold the
38184     // load into the X86insertps instruction. We need to convert the scalar
38185     // load to a vector and clear the source lane of the INSERTPS control.
38186     if (Op1.getOpcode() == X86ISD::VBROADCAST_LOAD && Op1.hasOneUse()) {
38187       auto *MemIntr = cast<MemIntrinsicSDNode>(Op1);
38188       if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) {
38189         SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(),
38190                                    MemIntr->getBasePtr(),
38191                                    MemIntr->getMemOperand());
38192         SDValue Insert = DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0,
38193                            DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT,
38194                                        Load),
38195                            DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8));
38196         DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
38197         return Insert;
38198       }
38199     }
38200 
38201     return SDValue();
38202   }
38203   default:
38204     return SDValue();
38205   }
38206 
38207   // Nuke no-op shuffles that show up after combining.
38208   if (isNoopShuffleMask(Mask))
38209     return N.getOperand(0);
38210 
38211   // Look for simplifications involving one or two shuffle instructions.
38212   SDValue V = N.getOperand(0);
38213   switch (N.getOpcode()) {
38214   default:
38215     break;
38216   case X86ISD::PSHUFLW:
38217   case X86ISD::PSHUFHW:
38218     assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
38219 
38220     // See if this reduces to a PSHUFD which is no more expensive and can
38221     // combine with more operations. Note that it has to at least flip the
38222     // dwords as otherwise it would have been removed as a no-op.
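          // e.g. PSHUFLW with mask <2,3,0,1> swaps the two dwords of the low
          // half, which is exactly a PSHUFD swapping dwords 0 and 1.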
38223     if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
38224       int DMask[] = {0, 1, 2, 3};
38225       int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
38226       DMask[DOffset + 0] = DOffset + 1;
38227       DMask[DOffset + 1] = DOffset + 0;
38228       MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
38229       V = DAG.getBitcast(DVT, V);
38230       V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
38231                       getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
38232       return DAG.getBitcast(VT, V);
38233     }
38234 
38235     // Look for shuffle patterns which can be implemented as a single unpack.
38236     // FIXME: This doesn't handle the location of the PSHUFD generically, and
38237     // only works when we have a PSHUFD followed by two half-shuffles.
38238     if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
38239         (V.getOpcode() == X86ISD::PSHUFLW ||
38240          V.getOpcode() == X86ISD::PSHUFHW) &&
38241         V.getOpcode() != N.getOpcode() &&
38242         V.hasOneUse() && V.getOperand(0).hasOneUse()) {
38243       SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
38244       if (D.getOpcode() == X86ISD::PSHUFD) {
38245         SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
38246         SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
38247         int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
38248         int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
38249         int WordMask[8];
38250         for (int i = 0; i < 4; ++i) {
38251           WordMask[i + NOffset] = Mask[i] + NOffset;
38252           WordMask[i + VOffset] = VMask[i] + VOffset;
38253         }
38254         // Map the word mask through the DWord mask.
38255         int MappedMask[8];
38256         for (int i = 0; i < 8; ++i)
38257           MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
38258         if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
38259             makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
38260           // We can replace all three shuffles with an unpack.
38261           V = DAG.getBitcast(VT, D.getOperand(0));
38262           return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
38263                                                 : X86ISD::UNPCKH,
38264                              DL, VT, V, V);
38265         }
38266       }
38267     }
38268 
38269     break;
38270 
38271   case X86ISD::PSHUFD:
38272     if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
38273       return NewN;
38274 
38275     break;
38276   }
38277 
38278   return SDValue();
38279 }
38280 
38281 /// Checks if the shuffle mask takes subsequent elements
38282 /// alternately from two vectors.
38283 /// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct.
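      /// On success, \p Op0Even is set to true when operand 0 supplies the
      /// even-numbered elements of the result.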
38284 static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
38285 
38286   int ParitySrc[2] = {-1, -1};
38287   unsigned Size = Mask.size();
38288   for (unsigned i = 0; i != Size; ++i) {
38289     int M = Mask[i];
38290     if (M < 0)
38291       continue;
38292 
38293     // Make sure we are using the matching element from the input.
38294     if ((M % Size) != i)
38295       return false;
38296 
38297     // Make sure we use the same input for all elements of the same parity.
38298     int Src = M / Size;
38299     if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
38300       return false;
38301     ParitySrc[i % 2] = Src;
38302   }
38303 
38304   // Make sure each input is used.
38305   if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
38306     return false;
38307 
38308   Op0Even = ParitySrc[0] == 0;
38309   return true;
38310 }
38311 
38312 /// Returns true iff the shuffle node \p N can be replaced with ADDSUB(SUBADD)
38313 /// operation. If true is returned then the operands of ADDSUB(SUBADD) operation
38314 /// are written to the parameters \p Opnd0 and \p Opnd1.
38315 ///
38316 /// We combine shuffle to ADDSUB(SUBADD) directly on the abstract vector shuffle nodes
38317 /// so it is easier to generically match. We also insert dummy vector shuffle
38318 /// nodes for the operands which explicitly discard the lanes which are unused
38319 /// by this operation to try to flow through the rest of the combiner
38320 /// the fact that they're unused.
38321 static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
38322                              SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
38323                              bool &IsSubAdd) {
38324 
38325   EVT VT = N->getValueType(0);
38326   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
38327   if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) ||
38328       !VT.getSimpleVT().isFloatingPoint())
38329     return false;
38330 
38331   // We only handle target-independent shuffles.
38332   // FIXME: It would be easy and harmless to use the target shuffle mask
38333   // extraction tool to support more.
38334   if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
38335     return false;
38336 
38337   SDValue V1 = N->getOperand(0);
38338   SDValue V2 = N->getOperand(1);
38339 
38340   // Make sure we have an FADD and an FSUB.
38341   if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) ||
38342       (V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) ||
38343       V1.getOpcode() == V2.getOpcode())
38344     return false;
38345 
38346   // If there are other uses of these operations we can't fold them.
38347   if (!V1->hasOneUse() || !V2->hasOneUse())
38348     return false;
38349 
38350   // Ensure that both operations have the same operands. Note that we can
38351   // commute the FADD operands.
38352   SDValue LHS, RHS;
38353   if (V1.getOpcode() == ISD::FSUB) {
38354     LHS = V1->getOperand(0); RHS = V1->getOperand(1);
38355     if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
38356         (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
38357       return false;
38358   } else {
38359     assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode");
38360     LHS = V2->getOperand(0); RHS = V2->getOperand(1);
38361     if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
38362         (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
38363       return false;
38364   }
38365 
38366   ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
38367   bool Op0Even;
38368   if (!isAddSubOrSubAddMask(Mask, Op0Even))
38369     return false;
38370 
38371   // It's a subadd if the vector in the even parity is an FADD.
38372   IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
38373                      : V2->getOpcode() == ISD::FADD;
38374 
38375   Opnd0 = LHS;
38376   Opnd1 = RHS;
38377   return true;
38378 }
38379 
38380 /// Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
38381 static SDValue combineShuffleToFMAddSub(SDNode *N,
38382                                         const X86Subtarget &Subtarget,
38383                                         SelectionDAG &DAG) {
38384   // We only handle target-independent shuffles.
38385   // FIXME: It would be easy and harmless to use the target shuffle mask
38386   // extraction tool to support more.
38387   if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
38388     return SDValue();
38389 
38390   MVT VT = N->getSimpleValueType(0);
38391   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
38392   if (!Subtarget.hasAnyFMA() || !TLI.isTypeLegal(VT))
38393     return SDValue();
38394 
38395   // We're trying to match (shuffle fma(a, b, c), X86Fmsub(a, b, c)).
38396   SDValue Op0 = N->getOperand(0);
38397   SDValue Op1 = N->getOperand(1);
38398   SDValue FMAdd = Op0, FMSub = Op1;
38399   if (FMSub.getOpcode() != X86ISD::FMSUB)
38400     std::swap(FMAdd, FMSub);
38401 
38402   if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB ||
38403       FMAdd.getOperand(0) != FMSub.getOperand(0) || !FMAdd.hasOneUse() ||
38404       FMAdd.getOperand(1) != FMSub.getOperand(1) || !FMSub.hasOneUse() ||
38405       FMAdd.getOperand(2) != FMSub.getOperand(2))
38406     return SDValue();
38407 
38408   // Check for correct shuffle mask.
38409   ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
38410   bool Op0Even;
38411   if (!isAddSubOrSubAddMask(Mask, Op0Even))
38412     return SDValue();
38413 
38414   // FMAdd and FMSub were verified above to share identical operands.
38415   SDLoc DL(N);
38416   bool IsSubAdd = Op0Even ? Op0 == FMAdd : Op1 == FMAdd;
38417   unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
38418   return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1),
38419                      FMAdd.getOperand(2));
38420 }
38421 
38422 /// Try to combine a shuffle into a target-specific add-sub or
38423 /// mul-add-sub node.
38424 static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
38425                                                 const X86Subtarget &Subtarget,
38426                                                 SelectionDAG &DAG) {
38427   if (SDValue V = combineShuffleToFMAddSub(N, Subtarget, DAG))
38428     return V;
38429 
38430   SDValue Opnd0, Opnd1;
38431   bool IsSubAdd;
38432   if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd))
38433     return SDValue();
38434 
38435   MVT VT = N->getSimpleValueType(0);
38436   SDLoc DL(N);
38437 
38438   // Try to generate X86ISD::FMADDSUB node here.
38439   SDValue Opnd2;
38440   if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) {
38441     unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
38442     return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
38443   }
38444 
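        // There is no non-FMA instruction that adds the even lanes and
        // subtracts the odd lanes (ADDSUB does the opposite), so a SUBADD
        // pattern can only be matched via FMSUBADD above.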
38445   if (IsSubAdd)
38446     return SDValue();
38447 
38448   // Do not generate X86ISD::ADDSUB node for 512-bit types even though
38449   // the ADDSUB idiom has been successfully recognized. There are no known
38450   // X86 targets with 512-bit ADDSUB instructions!
38451   if (VT.is512BitVector())
38452     return SDValue();
38453 
38454   return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
38455 }
38456 
38457 // We are looking for a shuffle where both sources are concatenated with undef
38458 // and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
38459 // if we can express this as a single-source shuffle, that's preferable.
38460 static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
38461                                            const X86Subtarget &Subtarget) {
38462   if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
38463     return SDValue();
38464 
38465   EVT VT = N->getValueType(0);
38466 
38467   // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
38468   if (!VT.is128BitVector() && !VT.is256BitVector())
38469     return SDValue();
38470 
38471   if (VT.getVectorElementType() != MVT::i32 &&
38472       VT.getVectorElementType() != MVT::i64 &&
38473       VT.getVectorElementType() != MVT::f32 &&
38474       VT.getVectorElementType() != MVT::f64)
38475     return SDValue();
38476 
38477   SDValue N0 = N->getOperand(0);
38478   SDValue N1 = N->getOperand(1);
38479 
38480   // Check that both sources are concats with undef.
38481   if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
38482       N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
38483       N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
38484       !N1.getOperand(1).isUndef())
38485     return SDValue();
38486 
38487   // Construct the new shuffle mask. Elements from the first source retain their
38488   // index, but elements from the second source no longer need to skip an undef.
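        // e.g. for v8i32, mask index 8 (element 0 of the second concat)
        // becomes 4, its position within the merged concat(t1, t2) source.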
38489   SmallVector<int, 8> Mask;
38490   int NumElts = VT.getVectorNumElements();
38491 
38492   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
38493   for (int Elt : SVOp->getMask())
38494     Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
38495 
38496   SDLoc DL(N);
38497   SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
38498                                N1.getOperand(0));
38499   return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
38500 }
38501 
38502 /// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the
38503 /// low half of each source vector and does not set any high half elements in
38504 /// the destination vector, narrow the shuffle to half its original size.
38505 static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) {
38506   if (!Shuf->getValueType(0).isSimple())
38507     return SDValue();
38508   MVT VT = Shuf->getSimpleValueType(0);
38509   if (!VT.is256BitVector() && !VT.is512BitVector())
38510     return SDValue();
38511 
38512   // See if we can ignore all of the high elements of the shuffle.
38513   ArrayRef<int> Mask = Shuf->getMask();
38514   if (!isUndefUpperHalf(Mask))
38515     return SDValue();
38516 
38517   // Check if the shuffle mask accesses only the low half of each input vector
38518   // (half-index output is 0 or 2).
38519   int HalfIdx1, HalfIdx2;
38520   SmallVector<int, 8> HalfMask(Mask.size() / 2);
38521   if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2) ||
38522       (HalfIdx1 % 2 == 1) || (HalfIdx2 % 2 == 1))
38523     return SDValue();
38524 
38525   // Create a half-width shuffle to replace the unnecessarily wide shuffle.
38526   // The trick is knowing that all of the insert/extract are actually free
38527   // subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle
38528   // of narrow inputs into a narrow output, and that is always cheaper than
38529   // the wide shuffle that we started with.
38530   return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0),
38531                                Shuf->getOperand(1), HalfMask, HalfIdx1,
38532                                HalfIdx2, false, DAG, /*UseConcat*/true);
38533 }
38534 
38535 static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
38536                               TargetLowering::DAGCombinerInfo &DCI,
38537                               const X86Subtarget &Subtarget) {
38538   if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N))
38539     if (SDValue V = narrowShuffle(Shuf, DAG))
38540       return V;
38541 
38542   // If we have legalized the vector types, look for blends of FADD and FSUB
38543   // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
38544   SDLoc dl(N);
38545   EVT VT = N->getValueType(0);
38546   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
38547   if (TLI.isTypeLegal(VT))
38548     if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
38549       return AddSub;
38550 
38551   // Attempt to combine into a vector load/broadcast.
38552   if (SDValue LD = combineToConsecutiveLoads(
38553           VT, SDValue(N, 0), dl, DAG, Subtarget, /*IsAfterLegalize*/ true))
38554     return LD;
38555 
38556   // For AVX2, we sometimes want to combine
38557   // (vector_shuffle <mask> (concat_vectors t1, undef)
38558   //                        (concat_vectors t2, undef))
38559   // Into:
38560   // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
38561   // Since the latter can be efficiently lowered with VPERMD/VPERMQ
38562   if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
38563     return ShufConcat;
38564 
38565   if (isTargetShuffle(N->getOpcode())) {
38566     SDValue Op(N, 0);
38567     if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
38568       return Shuffle;
38569 
38570     // Try recursively combining arbitrary sequences of x86 shuffle
38571     // instructions into higher-order shuffles. We do this after combining
38572     // specific PSHUF instruction sequences into their minimal form so that we
38573     // can evaluate how many specialized shuffle instructions are involved in
38574     // a particular chain.
38575     if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
38576       return Res;
38577 
38578     // Simplify source operands based on shuffle mask.
38579     // TODO - merge this into combineX86ShufflesRecursively.
38580     APInt KnownUndef, KnownZero;
38581     APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
38582     if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, KnownUndef, KnownZero,
38583                                        DCI))
38584       return SDValue(N, 0);
38585   }
38586 
38587   return SDValue();
38588 }
38589 
38590 // Simplify variable target shuffle masks based on the demanded elements.
38591 // TODO: Handle DemandedBits in mask indices as well?
38592 bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetShuffle(
38593     SDValue Op, const APInt &DemandedElts, unsigned MaskIndex,
38594     TargetLowering::TargetLoweringOpt &TLO, unsigned Depth) const {
38595   // If we're demanding all elements don't bother trying to simplify the mask.
38596   unsigned NumElts = DemandedElts.getBitWidth();
38597   if (DemandedElts.isAllOnesValue())
38598     return false;
38599 
38600   SDValue Mask = Op.getOperand(MaskIndex);
38601   if (!Mask.hasOneUse())
38602     return false;
38603 
38604   // Attempt to generically simplify the variable shuffle mask.
38605   APInt MaskUndef, MaskZero;
38606   if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
38607                                  Depth + 1))
38608     return true;
38609 
38610   // Attempt to extract+simplify a (constant pool load) shuffle mask.
38611   // TODO: Support other types from getTargetShuffleMaskIndices?
38612   SDValue BC = peekThroughOneUseBitcasts(Mask);
38613   EVT BCVT = BC.getValueType();
38614   auto *Load = dyn_cast<LoadSDNode>(BC);
38615   if (!Load)
38616     return false;
38617 
38618   const Constant *C = getTargetConstantFromNode(Load);
38619   if (!C)
38620     return false;
38621 
38622   Type *CTy = C->getType();
38623   if (!CTy->isVectorTy() ||
38624       CTy->getPrimitiveSizeInBits() != Mask.getValueSizeInBits())
38625     return false;
38626 
38627   // Handle scaling for i64 elements on 32-bit targets.
38628   unsigned NumCstElts = cast<FixedVectorType>(CTy)->getNumElements();
38629   if (NumCstElts != NumElts && NumCstElts != (NumElts * 2))
38630     return false;
38631   unsigned Scale = NumCstElts / NumElts;
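        // e.g. a v2i64 shuffle mask stored as a v4i32 constant on 32-bit
        // targets gives Scale == 2, so constant elements 2*i and 2*i+1 both
        // correspond to demanded element i.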
38632 
38633   // Simplify mask if we have an undemanded element that is not undef.
38634   bool Simplified = false;
38635   SmallVector<Constant *, 32> ConstVecOps;
38636   for (unsigned i = 0; i != NumCstElts; ++i) {
38637     Constant *Elt = C->getAggregateElement(i);
38638     if (!DemandedElts[i / Scale] && !isa<UndefValue>(Elt)) {
38639       ConstVecOps.push_back(UndefValue::get(Elt->getType()));
38640       Simplified = true;
38641       continue;
38642     }
38643     ConstVecOps.push_back(Elt);
38644   }
38645   if (!Simplified)
38646     return false;
38647 
38648   // Generate new constant pool entry + legalize immediately for the load.
38649   SDLoc DL(Op);
38650   SDValue CV = TLO.DAG.getConstantPool(ConstantVector::get(ConstVecOps), BCVT);
38651   SDValue LegalCV = LowerConstantPool(CV, TLO.DAG);
38652   SDValue NewMask = TLO.DAG.getLoad(
38653       BCVT, DL, TLO.DAG.getEntryNode(), LegalCV,
38654       MachinePointerInfo::getConstantPool(TLO.DAG.getMachineFunction()),
38655       Load->getAlign());
38656   return TLO.CombineTo(Mask, TLO.DAG.getBitcast(Mask.getValueType(), NewMask));
38657 }
38658 
38659 bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
38660     SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
38661     TargetLoweringOpt &TLO, unsigned Depth) const {
38662   int NumElts = DemandedElts.getBitWidth();
38663   unsigned Opc = Op.getOpcode();
38664   EVT VT = Op.getValueType();
38665 
38666   // Handle special case opcodes.
38667   switch (Opc) {
38668   case X86ISD::PMULDQ:
38669   case X86ISD::PMULUDQ: {
38670     APInt LHSUndef, LHSZero;
38671     APInt RHSUndef, RHSZero;
38672     SDValue LHS = Op.getOperand(0);
38673     SDValue RHS = Op.getOperand(1);
38674     if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
38675                                    Depth + 1))
38676       return true;
38677     if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
38678                                    Depth + 1))
38679       return true;
38680     // Multiply by zero.
38681     KnownZero = LHSZero | RHSZero;
38682     break;
38683   }
38684   case X86ISD::VSHL:
38685   case X86ISD::VSRL:
38686   case X86ISD::VSRA: {
38687     // We only need the bottom 64-bits of the (128-bit) shift amount.
38688     SDValue Amt = Op.getOperand(1);
38689     MVT AmtVT = Amt.getSimpleValueType();
38690     assert(AmtVT.is128BitVector() && "Unexpected value type");
38691 
38692     // If the shift amount is reused only as an SSE shift amount then we know
38693     // that only the bottom 64-bits are ever used.
38694     bool AssumeSingleUse = llvm::all_of(Amt->uses(), [&Amt](SDNode *Use) {
38695       unsigned UseOpc = Use->getOpcode();
38696       return (UseOpc == X86ISD::VSHL || UseOpc == X86ISD::VSRL ||
38697               UseOpc == X86ISD::VSRA) &&
38698              Use->getOperand(0) != Amt;
38699     });
38700 
38701     APInt AmtUndef, AmtZero;
38702     unsigned NumAmtElts = AmtVT.getVectorNumElements();
38703     APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2);
38704     if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO,
38705                                    Depth + 1, AssumeSingleUse))
38706       return true;
38707     LLVM_FALLTHROUGH;
38708   }
38709   case X86ISD::VSHLI:
38710   case X86ISD::VSRLI:
38711   case X86ISD::VSRAI: {
38712     SDValue Src = Op.getOperand(0);
38713     APInt SrcUndef;
38714     if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO,
38715                                    Depth + 1))
38716       return true;
38717 
38718     // Aggressively peek through ops to get at the demanded elts.
38719     if (!DemandedElts.isAllOnesValue())
38720       if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
38721               Src, DemandedElts, TLO.DAG, Depth + 1))
38722         return TLO.CombineTo(
38723             Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc, Op.getOperand(1)));
38724     break;
38725   }
38726   case X86ISD::KSHIFTL: {
38727     SDValue Src = Op.getOperand(0);
38728     auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
38729     assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
38730     unsigned ShiftAmt = Amt->getZExtValue();
38731 
38732     if (ShiftAmt == 0)
38733       return TLO.CombineTo(Op, Src);
38734 
38735     // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
38736     // single shift.  We can do this if the bottom bits (which are shifted
38737     // out) are never demanded.
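          // e.g. kshiftl(kshiftr(X, 4), 6) --> kshiftl(X, 2) when the low 6
          // elements of the result are not demanded.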
38738     if (Src.getOpcode() == X86ISD::KSHIFTR) {
38739       if (!DemandedElts.intersects(APInt::getLowBitsSet(NumElts, ShiftAmt))) {
38740         unsigned C1 = Src.getConstantOperandVal(1);
38741         unsigned NewOpc = X86ISD::KSHIFTL;
38742         int Diff = ShiftAmt - C1;
38743         if (Diff < 0) {
38744           Diff = -Diff;
38745           NewOpc = X86ISD::KSHIFTR;
38746         }
38747 
38748         SDLoc dl(Op);
38749         SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
38750         return TLO.CombineTo(
38751             Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
38752       }
38753     }
38754 
38755     APInt DemandedSrc = DemandedElts.lshr(ShiftAmt);
38756     if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
38757                                    Depth + 1))
38758       return true;
38759 
38760     KnownUndef <<= ShiftAmt;
38761     KnownZero <<= ShiftAmt;
38762     KnownZero.setLowBits(ShiftAmt);
38763     break;
38764   }
38765   case X86ISD::KSHIFTR: {
38766     SDValue Src = Op.getOperand(0);
38767     auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
38768     assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
38769     unsigned ShiftAmt = Amt->getZExtValue();
38770 
38771     if (ShiftAmt == 0)
38772       return TLO.CombineTo(Op, Src);
38773 
38774     // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
38775     // single shift.  We can do this if the top bits (which are shifted
38776     // out) are never demanded.
38777     if (Src.getOpcode() == X86ISD::KSHIFTL) {
38778       if (!DemandedElts.intersects(APInt::getHighBitsSet(NumElts, ShiftAmt))) {
38779         unsigned C1 = Src.getConstantOperandVal(1);
38780         unsigned NewOpc = X86ISD::KSHIFTR;
38781         int Diff = ShiftAmt - C1;
38782         if (Diff < 0) {
38783           Diff = -Diff;
38784           NewOpc = X86ISD::KSHIFTL;
38785         }
38786 
38787         SDLoc dl(Op);
38788         SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
38789         return TLO.CombineTo(
38790             Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
38791       }
38792     }
38793 
38794     APInt DemandedSrc = DemandedElts.shl(ShiftAmt);
38795     if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
38796                                    Depth + 1))
38797       return true;
38798 
38799     KnownUndef.lshrInPlace(ShiftAmt);
38800     KnownZero.lshrInPlace(ShiftAmt);
38801     KnownZero.setHighBits(ShiftAmt);
38802     break;
38803   }
38804   case X86ISD::CVTSI2P:
38805   case X86ISD::CVTUI2P: {
38806     SDValue Src = Op.getOperand(0);
38807     MVT SrcVT = Src.getSimpleValueType();
38808     APInt SrcUndef, SrcZero;
38809     APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
38810     if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
38811                                    Depth + 1))
38812       return true;
38813     break;
38814   }
38815   case X86ISD::PACKSS:
38816   case X86ISD::PACKUS: {
38817     SDValue N0 = Op.getOperand(0);
38818     SDValue N1 = Op.getOperand(1);
38819 
38820     APInt DemandedLHS, DemandedRHS;
38821     getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
38822 
38823     APInt LHSUndef, LHSZero;
38824     if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
38825                                    Depth + 1))
38826       return true;
38827     APInt RHSUndef, RHSZero;
38828     if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
38829                                    Depth + 1))
38830       return true;
38831 
38832     // TODO - pass on known zero/undef.
38833 
38834     // Aggressively peek through ops to get at the demanded elts.
38835     // TODO - we should do this for all target/faux shuffles ops.
38836     if (!DemandedElts.isAllOnesValue()) {
38837       SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
38838                                                             TLO.DAG, Depth + 1);
38839       SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
38840                                                             TLO.DAG, Depth + 1);
38841       if (NewN0 || NewN1) {
38842         NewN0 = NewN0 ? NewN0 : N0;
38843         NewN1 = NewN1 ? NewN1 : N1;
38844         return TLO.CombineTo(Op,
38845                              TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
38846       }
38847     }
38848     break;
38849   }
38850   case X86ISD::HADD:
38851   case X86ISD::HSUB:
38852   case X86ISD::FHADD:
38853   case X86ISD::FHSUB: {
38854     SDValue N0 = Op.getOperand(0);
38855     SDValue N1 = Op.getOperand(1);
38856 
38857     APInt DemandedLHS, DemandedRHS;
38858     getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
38859 
38860     APInt LHSUndef, LHSZero;
38861     if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
38862                                    Depth + 1))
38863       return true;
38864     APInt RHSUndef, RHSZero;
38865     if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
38866                                    Depth + 1))
38867       return true;
38868 
38869     // TODO - pass on known zero/undef.
38870 
38871     // Aggressively peek through ops to get at the demanded elts.
38872     // TODO: Handle repeated operands.
38873     if (N0 != N1 && !DemandedElts.isAllOnesValue()) {
38874       SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
38875                                                             TLO.DAG, Depth + 1);
38876       SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
38877                                                             TLO.DAG, Depth + 1);
38878       if (NewN0 || NewN1) {
38879         NewN0 = NewN0 ? NewN0 : N0;
38880         NewN1 = NewN1 ? NewN1 : N1;
38881         return TLO.CombineTo(Op,
38882                              TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
38883       }
38884     }
38885     break;
38886   }
38887   case X86ISD::VTRUNC:
38888   case X86ISD::VTRUNCS:
38889   case X86ISD::VTRUNCUS: {
38890     SDValue Src = Op.getOperand(0);
38891     MVT SrcVT = Src.getSimpleValueType();
38892     APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
38893     APInt SrcUndef, SrcZero;
38894     if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO,
38895                                    Depth + 1))
38896       return true;
38897     KnownZero = SrcZero.zextOrTrunc(NumElts);
38898     KnownUndef = SrcUndef.zextOrTrunc(NumElts);
38899     break;
38900   }
38901   case X86ISD::BLENDV: {
38902     APInt SelUndef, SelZero;
38903     if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef,
38904                                    SelZero, TLO, Depth + 1))
38905       return true;
38906 
38907     // TODO: Use SelZero to adjust LHS/RHS DemandedElts.
38908     APInt LHSUndef, LHSZero;
38909     if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef,
38910                                    LHSZero, TLO, Depth + 1))
38911       return true;
38912 
38913     APInt RHSUndef, RHSZero;
38914     if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef,
38915                                    RHSZero, TLO, Depth + 1))
38916       return true;
38917 
38918     KnownZero = LHSZero & RHSZero;
38919     KnownUndef = LHSUndef & RHSUndef;
38920     break;
38921   }
38922   case X86ISD::VZEXT_MOVL: {
38923     // If upper demanded elements are already zero then we have nothing to do.
38924     SDValue Src = Op.getOperand(0);
38925     APInt DemandedUpperElts = DemandedElts;
38926     DemandedUpperElts.clearLowBits(1);
38927     if (TLO.DAG.computeKnownBits(Src, DemandedUpperElts, Depth + 1).isZero())
38928       return TLO.CombineTo(Op, Src);
38929     break;
38930   }
38931   case X86ISD::VBROADCAST: {
38932     SDValue Src = Op.getOperand(0);
38933     MVT SrcVT = Src.getSimpleValueType();
38934     if (!SrcVT.isVector())
38935       break;
38936     // Don't bother broadcasting if we just need the 0'th element.
38937     if (DemandedElts == 1) {
38938       if (Src.getValueType() != VT)
38939         Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG,
38940                              SDLoc(Op));
38941       return TLO.CombineTo(Op, Src);
38942     }
38943     APInt SrcUndef, SrcZero;
38944     APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);
38945     if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
38946                                    Depth + 1))
38947       return true;
38948     // Aggressively peek through src to get at the demanded elt.
38949     // TODO - we should do this for all target/faux shuffles ops.
38950     if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
38951             Src, SrcElts, TLO.DAG, Depth + 1))
38952       return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
38953     break;
38954   }
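        // Variable shuffles: the operand holding the shuffle mask differs per
        // opcode, so pass its index to the mask simplification helper.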
38955   case X86ISD::VPERMV:
38956     if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 0, TLO,
38957                                                    Depth))
38958       return true;
38959     break;
38960   case X86ISD::PSHUFB:
38961   case X86ISD::VPERMV3:
38962   case X86ISD::VPERMILPV:
38963     if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 1, TLO,
38964                                                    Depth))
38965       return true;
38966     break;
38967   case X86ISD::VPPERM:
38968   case X86ISD::VPERMIL2:
38969     if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 2, TLO,
38970                                                    Depth))
38971       return true;
38972     break;
38973   }
38974 
38975   // For 256/512-bit ops that are 128/256-bit ops glued together, if we do not
38976   // demand any of the high elements, then narrow the op to 128/256-bits: e.g.
38977   // (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0
38978   if ((VT.is256BitVector() || VT.is512BitVector()) &&
38979       DemandedElts.lshr(NumElts / 2) == 0) {
38980     unsigned SizeInBits = VT.getSizeInBits();
38981     unsigned ExtSizeInBits = SizeInBits / 2;
38982 
38983     // See if 512-bit ops only use the bottom 128-bits.
38984     if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0)
38985       ExtSizeInBits = SizeInBits / 4;
38986 
38987     switch (Opc) {
38988       // Scalar broadcast.
38989     case X86ISD::VBROADCAST: {
38990       SDLoc DL(Op);
38991       SDValue Src = Op.getOperand(0);
38992       if (Src.getValueSizeInBits() > ExtSizeInBits)
38993         Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits);
38994       EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
38995                                     ExtSizeInBits / VT.getScalarSizeInBits());
38996       SDValue Bcst = TLO.DAG.getNode(X86ISD::VBROADCAST, DL, BcstVT, Src);
38997       return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
38998                                                TLO.DAG, DL, ExtSizeInBits));
38999     }
39000     case X86ISD::VBROADCAST_LOAD: {
39001       SDLoc DL(Op);
39002       auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
39003       EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
39004                                     ExtSizeInBits / VT.getScalarSizeInBits());
39005       SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other);
39006       SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)};
39007       SDValue Bcst = TLO.DAG.getMemIntrinsicNode(
39008           X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(),
39009           MemIntr->getMemOperand());
39010       TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
39011                                            Bcst.getValue(1));
39012       return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
39013                                                TLO.DAG, DL, ExtSizeInBits));
39014     }
39015       // Subvector broadcast.
39016     case X86ISD::SUBV_BROADCAST_LOAD: {
39017       auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
39018       EVT MemVT = MemIntr->getMemoryVT();
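            // If only one copy of the broadcast subvector is demanded this is
            // just a plain load; otherwise rebroadcast at the narrower width.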
39019       if (ExtSizeInBits == MemVT.getStoreSizeInBits()) {
39020         SDLoc DL(Op);
39021         SDValue Ld =
39022             TLO.DAG.getLoad(MemVT, DL, MemIntr->getChain(),
39023                             MemIntr->getBasePtr(), MemIntr->getMemOperand());
39024         TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
39025                                              Ld.getValue(1));
39026         return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Ld, 0,
39027                                                  TLO.DAG, DL, ExtSizeInBits));
39028       } else if ((ExtSizeInBits % MemVT.getStoreSizeInBits()) == 0) {
39029         SDLoc DL(Op);
39030         EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
39031                                       ExtSizeInBits / VT.getScalarSizeInBits());
39032         SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other);
39033         SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)};
39034         SDValue Bcst =
39035             TLO.DAG.getMemIntrinsicNode(X86ISD::SUBV_BROADCAST_LOAD, DL, Tys,
39036                                         Ops, MemVT, MemIntr->getMemOperand());
39037         TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
39038                                              Bcst.getValue(1));
39039         return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
39040                                                  TLO.DAG, DL, ExtSizeInBits));
39041       }
39042       break;
39043     }
39044       // Byte shifts by immediate.
39045     case X86ISD::VSHLDQ:
39046     case X86ISD::VSRLDQ:
39047       // Shift by uniform.
39048     case X86ISD::VSHL:
39049     case X86ISD::VSRL:
39050     case X86ISD::VSRA:
39051       // Shift by immediate.
39052     case X86ISD::VSHLI:
39053     case X86ISD::VSRLI:
39054     case X86ISD::VSRAI: {
39055       SDLoc DL(Op);
39056       SDValue Ext0 =
39057           extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
39058       SDValue ExtOp =
39059           TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Op.getOperand(1));
39060       SDValue UndefVec = TLO.DAG.getUNDEF(VT);
39061       SDValue Insert =
39062           insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
39063       return TLO.CombineTo(Op, Insert);
39064     }
39065     case X86ISD::VPERMI: {
39066       // Simplify PERMPD/PERMQ to extract_subvector.
39067       // TODO: This should be done in shuffle combining.
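            // e.g. permq(X, <2,3,u,u>) moves the upper 128 bits of X into the
            // low half of the result; with the upper half not demanded this is
            // just an extract_subvector.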
39068       if (VT == MVT::v4f64 || VT == MVT::v4i64) {
39069         SmallVector<int, 4> Mask;
39070         DecodeVPERMMask(NumElts, Op.getConstantOperandVal(1), Mask);
39071         if (isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3)) {
39072           SDLoc DL(Op);
39073           SDValue Ext = extractSubVector(Op.getOperand(0), 2, TLO.DAG, DL, 128);
39074           SDValue UndefVec = TLO.DAG.getUNDEF(VT);
39075           SDValue Insert = insertSubVector(UndefVec, Ext, 0, TLO.DAG, DL, 128);
39076           return TLO.CombineTo(Op, Insert);
39077         }
39078       }
39079       break;
39080     }
39081     case X86ISD::VPERM2X128: {
39082       // Simplify VPERM2F128/VPERM2I128 to extract_subvector.
39083       SDLoc DL(Op);
39084       unsigned LoMask = Op.getConstantOperandVal(2) & 0xF;
39085       if (LoMask & 0x8)
39086         return TLO.CombineTo(
39087             Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, DL));
39088       unsigned EltIdx = (LoMask & 0x1) * (NumElts / 2);
39089       unsigned SrcIdx = (LoMask & 0x2) >> 1;
39090       SDValue ExtOp =
39091           extractSubVector(Op.getOperand(SrcIdx), EltIdx, TLO.DAG, DL, 128);
39092       SDValue UndefVec = TLO.DAG.getUNDEF(VT);
39093       SDValue Insert =
39094           insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
39095       return TLO.CombineTo(Op, Insert);
39096     }
39097       // Zero upper elements.
39098     case X86ISD::VZEXT_MOVL:
39099       // Target unary shuffles by immediate:
39100     case X86ISD::PSHUFD:
39101     case X86ISD::PSHUFLW:
39102     case X86ISD::PSHUFHW:
39103     case X86ISD::VPERMILPI:
39104       // (Non-Lane Crossing) Target Shuffles.
39105     case X86ISD::VPERMILPV:
39106     case X86ISD::VPERMIL2:
39107     case X86ISD::PSHUFB:
39108     case X86ISD::UNPCKL:
39109     case X86ISD::UNPCKH:
39110     case X86ISD::BLENDI:
39111       // Integer ops.
39112     case X86ISD::AVG:
39113     case X86ISD::PACKSS:
39114     case X86ISD::PACKUS:
39115       // Horizontal Ops.
39116     case X86ISD::HADD:
39117     case X86ISD::HSUB:
39118     case X86ISD::FHADD:
39119     case X86ISD::FHSUB: {
39120       SDLoc DL(Op);
39121       SmallVector<SDValue, 4> Ops;
39122       for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
39123         SDValue SrcOp = Op.getOperand(i);
39124         EVT SrcVT = SrcOp.getValueType();
39125         assert((!SrcVT.isVector() || SrcVT.getSizeInBits() == SizeInBits) &&
39126                "Unsupported vector size");
39127         Ops.push_back(SrcVT.isVector() ? extractSubVector(SrcOp, 0, TLO.DAG, DL,
39128                                                           ExtSizeInBits)
39129                                        : SrcOp);
39130       }
39131       MVT ExtVT = VT.getSimpleVT();
39132       ExtVT = MVT::getVectorVT(ExtVT.getScalarType(),
39133                                ExtSizeInBits / ExtVT.getScalarSizeInBits());
39134       SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ops);
39135       SDValue UndefVec = TLO.DAG.getUNDEF(VT);
39136       SDValue Insert =
39137           insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
39138       return TLO.CombineTo(Op, Insert);
39139     }
39140     }
39141   }
39142 
39143   // Get target/faux shuffle mask.
39144   APInt OpUndef, OpZero;
39145   SmallVector<int, 64> OpMask;
39146   SmallVector<SDValue, 2> OpInputs;
39147   if (!getTargetShuffleInputs(Op, DemandedElts, OpInputs, OpMask, OpUndef,
39148                               OpZero, TLO.DAG, Depth, false))
39149     return false;
39150 
39151   // Shuffle inputs must be the same size as the result.
39152   if (OpMask.size() != (unsigned)NumElts ||
39153       llvm::any_of(OpInputs, [VT](SDValue V) {
39154         return VT.getSizeInBits() != V.getValueSizeInBits() ||
39155                !V.getValueType().isVector();
39156       }))
39157     return false;
39158 
39159   KnownZero = OpZero;
39160   KnownUndef = OpUndef;
39161 
39162   // Check if shuffle mask can be simplified to undef/zero/identity.
39163   int NumSrcs = OpInputs.size();
39164   for (int i = 0; i != NumElts; ++i)
39165     if (!DemandedElts[i])
39166       OpMask[i] = SM_SentinelUndef;
39167 
39168   if (isUndefInRange(OpMask, 0, NumElts)) {
39169     KnownUndef.setAllBits();
39170     return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
39171   }
39172   if (isUndefOrZeroInRange(OpMask, 0, NumElts)) {
39173     KnownZero.setAllBits();
39174     return TLO.CombineTo(
39175         Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
39176   }
39177   for (int Src = 0; Src != NumSrcs; ++Src)
39178     if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts))
39179       return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, OpInputs[Src]));
39180 
39181   // Attempt to simplify inputs.
39182   for (int Src = 0; Src != NumSrcs; ++Src) {
39183     // TODO: Support inputs of different types.
39184     if (OpInputs[Src].getValueType() != VT)
39185       continue;
39186 
39187     int Lo = Src * NumElts;
39188     APInt SrcElts = APInt::getNullValue(NumElts);
39189     for (int i = 0; i != NumElts; ++i)
39190       if (DemandedElts[i]) {
39191         int M = OpMask[i] - Lo;
39192         if (0 <= M && M < NumElts)
39193           SrcElts.setBit(M);
39194       }
39195 
39196     // TODO - Propagate input undef/zero elts.
39197     APInt SrcUndef, SrcZero;
39198     if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero,
39199                                    TLO, Depth + 1))
39200       return true;
39201   }
39202 
39203   // If we don't demand all elements, then attempt to combine to a simpler
39204   // shuffle.
39205   // We need to convert the depth to something combineX86ShufflesRecursively
39206   // can handle - so pretend its Depth == 0 again, and reduce the max depth
39207   // to match. This prevents combineX86ShuffleChain from returning a
39208   // combined shuffle that's the same as the original root, causing an
39209   // infinite loop.
39210   if (!DemandedElts.isAllOnesValue()) {
39211     assert(Depth < X86::MaxShuffleCombineDepth && "Depth out of range");
39212 
39213     SmallVector<int, 64> DemandedMask(NumElts, SM_SentinelUndef);
39214     for (int i = 0; i != NumElts; ++i)
39215       if (DemandedElts[i])
39216         DemandedMask[i] = i;
39217 
39218     SDValue NewShuffle = combineX86ShufflesRecursively(
39219         {Op}, 0, Op, DemandedMask, {}, 0, X86::MaxShuffleCombineDepth - Depth,
39220         /*HasVarMask*/ false,
39221         /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, TLO.DAG,
39222         Subtarget);
39223     if (NewShuffle)
39224       return TLO.CombineTo(Op, NewShuffle);
39225   }
39226 
39227   return false;
39228 }
39229 
39230 bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
39231     SDValue Op, const APInt &OriginalDemandedBits,
39232     const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
39233     unsigned Depth) const {
39234   EVT VT = Op.getValueType();
39235   unsigned BitWidth = OriginalDemandedBits.getBitWidth();
39236   unsigned Opc = Op.getOpcode();
39237   switch(Opc) {
39238   case X86ISD::VTRUNC: {
39239     KnownBits KnownOp;
39240     SDValue Src = Op.getOperand(0);
39241     MVT SrcVT = Src.getSimpleValueType();
39242 
39243     // Simplify the input, using demanded bit information.
39244     APInt TruncMask = OriginalDemandedBits.zext(SrcVT.getScalarSizeInBits());
39245     APInt DemandedElts = OriginalDemandedElts.trunc(SrcVT.getVectorNumElements());
39246     if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO, Depth + 1))
39247       return true;
39248     break;
39249   }
39250   case X86ISD::PMULDQ:
39251   case X86ISD::PMULUDQ: {
39252     // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
39253     KnownBits KnownOp;
39254     SDValue LHS = Op.getOperand(0);
39255     SDValue RHS = Op.getOperand(1);
39256     // FIXME: Can we bound this better?
39257     APInt DemandedMask = APInt::getLowBitsSet(64, 32);
39258     if (SimplifyDemandedBits(LHS, DemandedMask, OriginalDemandedElts, KnownOp,
39259                              TLO, Depth + 1))
39260       return true;
39261     if (SimplifyDemandedBits(RHS, DemandedMask, OriginalDemandedElts, KnownOp,
39262                              TLO, Depth + 1))
39263       return true;
39264 
39265     // Aggressively peek through ops to get at the demanded low bits.
39266     SDValue DemandedLHS = SimplifyMultipleUseDemandedBits(
39267         LHS, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
39268     SDValue DemandedRHS = SimplifyMultipleUseDemandedBits(
39269         RHS, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
39270     if (DemandedLHS || DemandedRHS) {
39271       DemandedLHS = DemandedLHS ? DemandedLHS : LHS;
39272       DemandedRHS = DemandedRHS ? DemandedRHS : RHS;
39273       return TLO.CombineTo(
39274           Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, DemandedLHS, DemandedRHS));
39275     }
39276     break;
39277   }
39278   case X86ISD::VSHLI: {
39279     SDValue Op0 = Op.getOperand(0);
39280 
39281     unsigned ShAmt = Op.getConstantOperandVal(1);
39282     if (ShAmt >= BitWidth)
39283       break;
39284 
39285     APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
39286 
39287     // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
39288     // single shift.  We can do this if the bottom bits (which are shifted
39289     // out) are never demanded.
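          // e.g. (X >>u 3) << 5 --> X << 2 when the low 5 bits of the result
          // are never demanded.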
39290     if (Op0.getOpcode() == X86ISD::VSRLI &&
39291         OriginalDemandedBits.countTrailingZeros() >= ShAmt) {
39292       unsigned Shift2Amt = Op0.getConstantOperandVal(1);
39293       if (Shift2Amt < BitWidth) {
39294         int Diff = ShAmt - Shift2Amt;
39295         if (Diff == 0)
39296           return TLO.CombineTo(Op, Op0.getOperand(0));
39297 
39298         unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI;
39299         SDValue NewShift = TLO.DAG.getNode(
39300             NewOpc, SDLoc(Op), VT, Op0.getOperand(0),
39301             TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
39302         return TLO.CombineTo(Op, NewShift);
39303       }
39304     }
39305 
39306     // If we are only demanding sign bits then we can use the shift source directly.
39307     unsigned NumSignBits =
39308         TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1);
39309     unsigned UpperDemandedBits =
39310         BitWidth - OriginalDemandedBits.countTrailingZeros();
39311     if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
39312       return TLO.CombineTo(Op, Op0);
39313 
39314     if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
39315                              TLO, Depth + 1))
39316       return true;
39317 
39318     assert(!Known.hasConflict() && "Bits known to be one AND zero?");
39319     Known.Zero <<= ShAmt;
39320     Known.One <<= ShAmt;
39321 
39322     // Low bits known zero.
39323     Known.Zero.setLowBits(ShAmt);
39324     return false;
39325   }
39326   case X86ISD::VSRLI: {
39327     unsigned ShAmt = Op.getConstantOperandVal(1);
39328     if (ShAmt >= BitWidth)
39329       break;
39330 
39331     APInt DemandedMask = OriginalDemandedBits << ShAmt;
39332 
39333     if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask,
39334                              OriginalDemandedElts, Known, TLO, Depth + 1))
39335       return true;
39336 
39337     assert(!Known.hasConflict() && "Bits known to be one AND zero?");
39338     Known.Zero.lshrInPlace(ShAmt);
39339     Known.One.lshrInPlace(ShAmt);
39340 
39341     // High bits known zero.
39342     Known.Zero.setHighBits(ShAmt);
39343     return false;
39344   }
39345   case X86ISD::VSRAI: {
39346     SDValue Op0 = Op.getOperand(0);
39347     SDValue Op1 = Op.getOperand(1);
39348 
39349     unsigned ShAmt = cast<ConstantSDNode>(Op1)->getZExtValue();
39350     if (ShAmt >= BitWidth)
39351       break;
39352 
39353     APInt DemandedMask = OriginalDemandedBits << ShAmt;
39354 
39355     // If we just want the sign bit then we don't need to shift it.
39356     if (OriginalDemandedBits.isSignMask())
39357       return TLO.CombineTo(Op, Op0);
39358 
39359     // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
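          // e.g. for v8i16, ((X << 8) >>s 8) sign-extends the low byte of each
          // lane, so it is a no-op whenever X already has more than 8 sign
          // bits per lane.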
39360     if (Op0.getOpcode() == X86ISD::VSHLI &&
39361         Op.getOperand(1) == Op0.getOperand(1)) {
39362       SDValue Op00 = Op0.getOperand(0);
39363       unsigned NumSignBits =
39364           TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts);
39365       if (ShAmt < NumSignBits)
39366         return TLO.CombineTo(Op, Op00);
39367     }
39368 
39369     // If any of the demanded bits are produced by the sign extension, we also
39370     // demand the input sign bit.
39371     if (OriginalDemandedBits.countLeadingZeros() < ShAmt)
39372       DemandedMask.setSignBit();
39373 
39374     if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
39375                              TLO, Depth + 1))
39376       return true;
39377 
39378     assert(!Known.hasConflict() && "Bits known to be one AND zero?");
39379     Known.Zero.lshrInPlace(ShAmt);
39380     Known.One.lshrInPlace(ShAmt);
39381 
39382     // If the input sign bit is known to be zero, or if none of the top bits
39383     // are demanded, turn this into an unsigned shift right.
39384     if (Known.Zero[BitWidth - ShAmt - 1] ||
39385         OriginalDemandedBits.countLeadingZeros() >= ShAmt)
39386       return TLO.CombineTo(
39387           Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1));
39388 
39389     // High bits are known one.
39390     if (Known.One[BitWidth - ShAmt - 1])
39391       Known.One.setHighBits(ShAmt);
39392     return false;
39393   }
39394   case X86ISD::PEXTRB:
39395   case X86ISD::PEXTRW: {
39396     SDValue Vec = Op.getOperand(0);
39397     auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
39398     MVT VecVT = Vec.getSimpleValueType();
39399     unsigned NumVecElts = VecVT.getVectorNumElements();
39400 
39401     if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) {
39402       unsigned Idx = CIdx->getZExtValue();
39403       unsigned VecBitWidth = VecVT.getScalarSizeInBits();
39404 
39405       // If we demand no bits from the vector then we must have demanded
39406       // bits from the implicit zext - simplify to zero.
39407       APInt DemandedVecBits = OriginalDemandedBits.trunc(VecBitWidth);
39408       if (DemandedVecBits == 0)
39409         return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
39410 
39411       APInt KnownUndef, KnownZero;
39412       APInt DemandedVecElts = APInt::getOneBitSet(NumVecElts, Idx);
39413       if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef,
39414                                      KnownZero, TLO, Depth + 1))
39415         return true;
39416 
39417       KnownBits KnownVec;
39418       if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts,
39419                                KnownVec, TLO, Depth + 1))
39420         return true;
39421 
39422       if (SDValue V = SimplifyMultipleUseDemandedBits(
39423               Vec, DemandedVecBits, DemandedVecElts, TLO.DAG, Depth + 1))
39424         return TLO.CombineTo(
39425             Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, V, Op.getOperand(1)));
39426 
39427       Known = KnownVec.zext(BitWidth);
39428       return false;
39429     }
39430     break;
39431   }
39432   case X86ISD::PINSRB:
39433   case X86ISD::PINSRW: {
39434     SDValue Vec = Op.getOperand(0);
39435     SDValue Scl = Op.getOperand(1);
39436     auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
39437     MVT VecVT = Vec.getSimpleValueType();
39438 
39439     if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {
39440       unsigned Idx = CIdx->getZExtValue();
39441       if (!OriginalDemandedElts[Idx])
39442         return TLO.CombineTo(Op, Vec);
39443 
39444       KnownBits KnownVec;
39445       APInt DemandedVecElts(OriginalDemandedElts);
39446       DemandedVecElts.clearBit(Idx);
39447       if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts,
39448                                KnownVec, TLO, Depth + 1))
39449         return true;
39450 
39451       KnownBits KnownScl;
39452       unsigned NumSclBits = Scl.getScalarValueSizeInBits();
39453       APInt DemandedSclBits = OriginalDemandedBits.zext(NumSclBits);
39454       if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))
39455         return true;
39456 
39457       KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits());
39458       Known = KnownBits::commonBits(KnownVec, KnownScl);
39459       return false;
39460     }
39461     break;
39462   }
39463   case X86ISD::PACKSS:
39464     // PACKSS saturates to MIN/MAX integer values. So if we just want the
39465     // sign bit then we can just ask for the source operands' sign bits.
39466     // TODO - add known bits handling.
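          // e.g. for (v8i16 PACKSS (v4i32 A), (v4i32 B)), the sign bit of each
          // i16 result equals the sign bit of the corresponding i32 source
          // element, since signed saturation preserves the sign.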
39467     if (OriginalDemandedBits.isSignMask()) {
39468       APInt DemandedLHS, DemandedRHS;
39469       getPackDemandedElts(VT, OriginalDemandedElts, DemandedLHS, DemandedRHS);
39470 
39471       KnownBits KnownLHS, KnownRHS;
39472       APInt SignMask = APInt::getSignMask(BitWidth * 2);
39473       if (SimplifyDemandedBits(Op.getOperand(0), SignMask, DemandedLHS,
39474                                KnownLHS, TLO, Depth + 1))
39475         return true;
39476       if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS,
39477                                KnownRHS, TLO, Depth + 1))
39478         return true;
39479 
39480       // Attempt to avoid multi-use ops if we don't need anything from them.
39481       SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
39482           Op.getOperand(0), SignMask, DemandedLHS, TLO.DAG, Depth + 1);
39483       SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
39484           Op.getOperand(1), SignMask, DemandedRHS, TLO.DAG, Depth + 1);
39485       if (DemandedOp0 || DemandedOp1) {
39486         SDValue Op0 = DemandedOp0 ? DemandedOp0 : Op.getOperand(0);
39487         SDValue Op1 = DemandedOp1 ? DemandedOp1 : Op.getOperand(1);
39488         return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, Op0, Op1));
39489       }
39490     }
39491     // TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.
39492     break;
39493   case X86ISD::VBROADCAST: {
39494     SDValue Src = Op.getOperand(0);
39495     MVT SrcVT = Src.getSimpleValueType();
39496     APInt DemandedElts = APInt::getOneBitSet(
39497         SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1, 0);
39498     if (SimplifyDemandedBits(Src, OriginalDemandedBits, DemandedElts, Known,
39499                              TLO, Depth + 1))
39500       return true;
39501     // If we don't need the upper bits, attempt to narrow the broadcast source.
39502     // Don't attempt this on AVX512 as it might affect broadcast folding.
39503     // TODO: Should we attempt this for i32/i16 splats? They tend to be slower.
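          // e.g. (v2i64 VBROADCAST (i64 X)) with only the low 32 bits of each
          // element demanded --> bitcast (v4i32 VBROADCAST (trunc X to i32)).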
39504     if ((BitWidth == 64) && SrcVT.isScalarInteger() && !Subtarget.hasAVX512() &&
39505         OriginalDemandedBits.countLeadingZeros() >= (BitWidth / 2)) {
39506       MVT NewSrcVT = MVT::getIntegerVT(BitWidth / 2);
39507       SDValue NewSrc =
39508           TLO.DAG.getNode(ISD::TRUNCATE, SDLoc(Src), NewSrcVT, Src);
39509       MVT NewVT = MVT::getVectorVT(NewSrcVT, VT.getVectorNumElements() * 2);
39510       SDValue NewBcst =
39511           TLO.DAG.getNode(X86ISD::VBROADCAST, SDLoc(Op), NewVT, NewSrc);
39512       return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, NewBcst));
39513     }
39514     break;
39515   }
39516   case X86ISD::PCMPGT:
39517     // icmp sgt(0, R) == ashr(R, BitWidth-1).
39518     // iff we only need the sign bit then we can use R directly.
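          // e.g. if the only user is a MOVMSK (which reads just the sign
          // bits), (PCMPGT 0, R) can be replaced with R itself.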
39519     if (OriginalDemandedBits.isSignMask() &&
39520         ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
39521       return TLO.CombineTo(Op, Op.getOperand(1));
39522     break;
39523   case X86ISD::MOVMSK: {
39524     SDValue Src = Op.getOperand(0);
39525     MVT SrcVT = Src.getSimpleValueType();
39526     unsigned SrcBits = SrcVT.getScalarSizeInBits();
39527     unsigned NumElts = SrcVT.getVectorNumElements();
39528 
39529     // If we don't need the sign bits at all just return zero.
39530     if (OriginalDemandedBits.countTrailingZeros() >= NumElts)
39531       return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
39532 
39533     // Only demand the vector elements of the sign bits we need.
39534     APInt KnownUndef, KnownZero;
39535     APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts);
39536     if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
39537                                    TLO, Depth + 1))
39538       return true;
39539 
39540     Known.Zero = KnownZero.zextOrSelf(BitWidth);
39541     Known.Zero.setHighBits(BitWidth - NumElts);
39542 
39543     // MOVMSK only uses the MSB from each vector element.
39544     KnownBits KnownSrc;
39545     APInt DemandedSrcBits = APInt::getSignMask(SrcBits);
39546     if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, KnownSrc, TLO,
39547                              Depth + 1))
39548       return true;
39549 
39550     if (KnownSrc.One[SrcBits - 1])
39551       Known.One.setLowBits(NumElts);
39552     else if (KnownSrc.Zero[SrcBits - 1])
39553       Known.Zero.setLowBits(NumElts);
39554 
39555     // Attempt to avoid multi-use ops if we don't need anything from them.
39556     if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
39557             Src, DemandedSrcBits, DemandedElts, TLO.DAG, Depth + 1))
39558       return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
39559     return false;
39560   }
39561   case X86ISD::BEXTR:
39562   case X86ISD::BEXTRI: {
39563     SDValue Op0 = Op.getOperand(0);
39564     SDValue Op1 = Op.getOperand(1);
39565 
39566     // Only bottom 16-bits of the control bits are required.
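          // (Bits [7:0] of the control hold the start index and bits [15:8]
          // the length; the instruction ignores everything above bit 15.)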
39567     if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
39568       // NOTE: SimplifyDemandedBits won't do this for constants.
39569       uint64_t Val1 = Cst1->getZExtValue();
39570       uint64_t MaskedVal1 = Val1 & 0xFFFF;
39571       if (Opc == X86ISD::BEXTR && MaskedVal1 != Val1) {
39572         SDLoc DL(Op);
39573         return TLO.CombineTo(
39574             Op, TLO.DAG.getNode(X86ISD::BEXTR, DL, VT, Op0,
39575                                 TLO.DAG.getConstant(MaskedVal1, DL, VT)));
39576       }
39577 
39578       unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
39579       unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
39580 
39581       // If the length is 0, the result is 0.
39582       if (Length == 0) {
39583         Known.setAllZero();
39584         return false;
39585       }
39586 
39587       if ((Shift + Length) <= BitWidth) {
39588         APInt DemandedMask = APInt::getBitsSet(BitWidth, Shift, Shift + Length);
39589         if (SimplifyDemandedBits(Op0, DemandedMask, Known, TLO, Depth + 1))
39590           return true;
39591 
39592         Known = Known.extractBits(Length, Shift);
39593         Known = Known.zextOrTrunc(BitWidth);
39594         return false;
39595       }
39596     } else {
39597       assert(Opc == X86ISD::BEXTR && "Unexpected opcode!");
39598       KnownBits Known1;
39599       APInt DemandedMask(APInt::getLowBitsSet(BitWidth, 16));
39600       if (SimplifyDemandedBits(Op1, DemandedMask, Known1, TLO, Depth + 1))
39601         return true;
39602 
39603       // If the length is 0, replace with 0.
39604       KnownBits LengthBits = Known1.extractBits(8, 8);
39605       if (LengthBits.isZero())
39606         return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
39607     }
39608 
39609     break;
39610   }
39611   case X86ISD::PDEP: {
39612     SDValue Op0 = Op.getOperand(0);
39613     SDValue Op1 = Op.getOperand(1);
39614 
39615     unsigned DemandedBitsLZ = OriginalDemandedBits.countLeadingZeros();
39616     APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
39617 
39618     // If the demanded bits have leading zeroes, we don't demand those from the
39619     // mask.
39620     if (SimplifyDemandedBits(Op1, LoMask, Known, TLO, Depth + 1))
39621       return true;
39622 
39623     // The number of possible 1s in the mask determines the number of LSBs of
39624     // operand 0 used. Undemanded bits from the mask don't matter so filter
39625     // them before counting.
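          // e.g. if at most 3 bits of the demanded part of the mask can be
          // set, PDEP consumes at most the low 3 bits of operand 0.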
39626     KnownBits Known2;
39627     uint64_t Count = (~Known.Zero & LoMask).countPopulation();
39628     APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Count));
39629     if (SimplifyDemandedBits(Op0, DemandedMask, Known2, TLO, Depth + 1))
39630       return true;
39631 
39632     // Zeroes are retained from the mask, but not ones.
39633     Known.One.clearAllBits();
39634     // The result will have at least as many trailing zeros as the non-mask
39635     // operand since bits can only map to the same or higher bit position.
39636     Known.Zero.setLowBits(Known2.countMinTrailingZeros());
39637     return false;
39638   }
39639   }
39640 
39641   return TargetLowering::SimplifyDemandedBitsForTargetNode(
39642       Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
39643 }
39644 
39645 SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
39646     SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
39647     SelectionDAG &DAG, unsigned Depth) const {
39648   int NumElts = DemandedElts.getBitWidth();
39649   unsigned Opc = Op.getOpcode();
39650   EVT VT = Op.getValueType();
39651 
39652   switch (Opc) {
39653   case X86ISD::PINSRB:
39654   case X86ISD::PINSRW: {
39655     // If we don't demand the inserted element, return the base vector.
39656     SDValue Vec = Op.getOperand(0);
39657     auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
39658     MVT VecVT = Vec.getSimpleValueType();
39659     if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) &&
39660         !DemandedElts[CIdx->getZExtValue()])
39661       return Vec;
39662     break;
39663   }
39664   case X86ISD::VSHLI: {
39665     // If we are only demanding sign bits then we can use the shift source
39666     // directly.
39667     SDValue Op0 = Op.getOperand(0);
39668     unsigned ShAmt = Op.getConstantOperandVal(1);
39669     unsigned BitWidth = DemandedBits.getBitWidth();
39670     unsigned NumSignBits = DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
39671     unsigned UpperDemandedBits = BitWidth - DemandedBits.countTrailingZeros();
39672     if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
39673       return Op0;
39674     break;
39675   }
39676   case X86ISD::VSRAI:
39677     // iff we only need the sign bit then we can use the source directly.
39678     // TODO: generalize where we only demand extended signbits.
39679     if (DemandedBits.isSignMask())
39680       return Op.getOperand(0);
39681     break;
39682   case X86ISD::PCMPGT:
39683     // icmp sgt(0, R) == ashr(R, BitWidth-1).
39684     // iff we only need the sign bit then we can use R directly.
39685     if (DemandedBits.isSignMask() &&
39686         ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
39687       return Op.getOperand(1);
39688     break;
39689   }
39690 
39691   APInt ShuffleUndef, ShuffleZero;
39692   SmallVector<int, 16> ShuffleMask;
39693   SmallVector<SDValue, 2> ShuffleOps;
39694   if (getTargetShuffleInputs(Op, DemandedElts, ShuffleOps, ShuffleMask,
39695                              ShuffleUndef, ShuffleZero, DAG, Depth, false)) {
39696     // If all the demanded elts are from one operand and are inline,
39697     // then we can use the operand directly.
39698     int NumOps = ShuffleOps.size();
39699     if (ShuffleMask.size() == (unsigned)NumElts &&
39700         llvm::all_of(ShuffleOps, [VT](SDValue V) {
39701           return VT.getSizeInBits() == V.getValueSizeInBits();
39702         })) {
39703 
39704       if (DemandedElts.isSubsetOf(ShuffleUndef))
39705         return DAG.getUNDEF(VT);
39706       if (DemandedElts.isSubsetOf(ShuffleUndef | ShuffleZero))
39707         return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(Op));
39708 
39709       // Bitmask that indicates which ops have only been accessed 'inline'.
39710       APInt IdentityOp = APInt::getAllOnesValue(NumOps);
39711       for (int i = 0; i != NumElts; ++i) {
39712         int M = ShuffleMask[i];
39713         if (!DemandedElts[i] || ShuffleUndef[i])
39714           continue;
39715         int OpIdx = M / NumElts;
39716         int EltIdx = M % NumElts;
39717         if (M < 0 || EltIdx != i) {
39718           IdentityOp.clearAllBits();
39719           break;
39720         }
39721         IdentityOp &= APInt::getOneBitSet(NumOps, OpIdx);
39722         if (IdentityOp == 0)
39723           break;
39724       }
39725       assert((IdentityOp == 0 || IdentityOp.countPopulation() == 1) &&
39726              "Multiple identity shuffles detected");
39727 
39728       if (IdentityOp != 0)
39729         return DAG.getBitcast(VT, ShuffleOps[IdentityOp.countTrailingZeros()]);
39730     }
39731   }
39732 
39733   return TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
39734       Op, DemandedBits, DemandedElts, DAG, Depth);
39735 }
39736 
39737 // Helper to peek through bitops/trunc/setcc to determine size of source vector.
39738 // Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
39739 static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size,
39740                                       bool AllowTruncate) {
39741   switch (Src.getOpcode()) {
39742   case ISD::TRUNCATE:
39743     if (!AllowTruncate)
39744       return false;
39745     LLVM_FALLTHROUGH;
39746   case ISD::SETCC:
39747     return Src.getOperand(0).getValueSizeInBits() == Size;
39748   case ISD::AND:
39749   case ISD::XOR:
39750   case ISD::OR:
39751     return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate) &&
39752            checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate);
39753   }
39754   return false;
39755 }
39756 
39757 // Helper to flip between AND/OR/XOR opcodes and their X86ISD FP equivalents.
39758 static unsigned getAltBitOpcode(unsigned Opcode) {
39759   switch(Opcode) {
39760   case ISD::AND: return X86ISD::FAND;
39761   case ISD::OR: return X86ISD::FOR;
39762   case ISD::XOR: return X86ISD::FXOR;
39763   case X86ISD::ANDNP: return X86ISD::FANDN;
39764   }
39765   llvm_unreachable("Unknown bitwise opcode");
39766 }
39767 
39768 // Helper to adjust v4i32 MOVMSK expansion to work with SSE1-only targets.
39769 static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src,
39770                                           const SDLoc &DL) {
39771   EVT SrcVT = Src.getValueType();
39772   if (SrcVT != MVT::v4i1)
39773     return SDValue();
39774 
39775   switch (Src.getOpcode()) {
39776   case ISD::SETCC:
39777     if (Src.getOperand(0).getValueType() == MVT::v4i32 &&
39778         ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()) &&
39779         cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT) {
39780       SDValue Op0 = Src.getOperand(0);
39781       if (ISD::isNormalLoad(Op0.getNode()))
39782         return DAG.getBitcast(MVT::v4f32, Op0);
39783       if (Op0.getOpcode() == ISD::BITCAST &&
39784           Op0.getOperand(0).getValueType() == MVT::v4f32)
39785         return Op0.getOperand(0);
39786     }
39787     break;
39788   case ISD::AND:
39789   case ISD::XOR:
39790   case ISD::OR: {
39791     SDValue Op0 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(0), DL);
39792     SDValue Op1 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(1), DL);
39793     if (Op0 && Op1)
39794       return DAG.getNode(getAltBitOpcode(Src.getOpcode()), DL, MVT::v4f32, Op0,
39795                          Op1);
39796     break;
39797   }
39798   }
39799   return SDValue();
39800 }
39801 
39802 // Helper to push sign extension of vXi1 SETCC result through bitops.
39803 static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,
39804                                           SDValue Src, const SDLoc &DL) {
39805   switch (Src.getOpcode()) {
39806   case ISD::SETCC:
39807   case ISD::TRUNCATE:
39808     return DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
39809   case ISD::AND:
39810   case ISD::XOR:
39811   case ISD::OR:
39812     return DAG.getNode(
39813         Src.getOpcode(), DL, SExtVT,
39814         signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(0), DL),
39815         signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL));
39816   }
39817   llvm_unreachable("Unexpected node type for vXi1 sign extension");
39818 }
39819 
39820 // Try to match patterns such as
39821 // (i16 bitcast (v16i1 x))
39822 // ->
39823 // (i16 movmsk (16i8 sext (v16i1 x)))
39824 // before the illegal vector is scalarized on subtargets that don't have legal
39825 // vxi1 types.
39826 static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
39827                                   const SDLoc &DL,
39828                                   const X86Subtarget &Subtarget) {
39829   EVT SrcVT = Src.getValueType();
39830   if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1)
39831     return SDValue();
39832 
39833   // Recognize the IR pattern for the movmsk intrinsic under SSE1 before type
39834   // legalization destroys the v4i32 type.
39835   if (Subtarget.hasSSE1() && !Subtarget.hasSSE2()) {
39836     if (SDValue V = adjustBitcastSrcVectorSSE1(DAG, Src, DL)) {
39837       V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32,
39838                       DAG.getBitcast(MVT::v4f32, V));
39839       return DAG.getZExtOrTrunc(V, DL, VT);
39840     }
39841   }
39842 
39843   // If the input is a truncate from v16i8 or v32i8 go ahead and use a
39844   // movmskb even with avx512. This will be better than truncating to vXi1 and
39845   // using a kmov. This can especially help KNL if the input is a v16i8/v32i8
39846   // vpcmpeqb/vpcmpgtb.
39847   bool PreferMovMsk = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
39848                       (Src.getOperand(0).getValueType() == MVT::v16i8 ||
39849                        Src.getOperand(0).getValueType() == MVT::v32i8 ||
39850                        Src.getOperand(0).getValueType() == MVT::v64i8);
39851 
39852   // Prefer movmsk for AVX512 for (bitcast (setlt X, 0)) which can be handled
39853   // directly with vpmovmskb/vmovmskps/vmovmskpd.
39854   if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse() &&
39855       cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT &&
39856       ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) {
39857     EVT CmpVT = Src.getOperand(0).getValueType();
39858     EVT EltVT = CmpVT.getVectorElementType();
39859     if (CmpVT.getSizeInBits() <= 256 &&
39860         (EltVT == MVT::i8 || EltVT == MVT::i32 || EltVT == MVT::i64))
39861       PreferMovMsk = true;
39862   }
39863 
39864   // With AVX512 vxi1 types are legal and we prefer using k-regs.
39865   // MOVMSK is supported in SSE2 or later.
39866   if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !PreferMovMsk))
39867     return SDValue();
39868 
39869   // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
39870   // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
39871   // v8i16 and v16i16.
39872   // For these two cases, we can shuffle the upper element bytes to a
39873   // consecutive sequence at the start of the vector and treat the results as
39874   // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
39875   // for v32i8 this is not the case, because the shuffle is expensive, so we
39876   // avoid sign-extending to this type entirely.
39877   // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
39878   // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
39879   MVT SExtVT;
39880   bool PropagateSExt = false;
39881   switch (SrcVT.getSimpleVT().SimpleTy) {
39882   default:
39883     return SDValue();
39884   case MVT::v2i1:
39885     SExtVT = MVT::v2i64;
39886     break;
39887   case MVT::v4i1:
39888     SExtVT = MVT::v4i32;
39889     // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
39890     // sign-extend to a 256-bit operation to avoid truncation.
39891     if (Subtarget.hasAVX() &&
39892         checkBitcastSrcVectorSize(Src, 256, Subtarget.hasAVX2())) {
39893       SExtVT = MVT::v4i64;
39894       PropagateSExt = true;
39895     }
39896     break;
39897   case MVT::v8i1:
39898     SExtVT = MVT::v8i16;
39899     // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
39900     // sign-extend to a 256-bit operation to match the compare.
39901     // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
39902     // 256-bit because the shuffle is cheaper than sign extending the result of
39903     // the compare.
39904     if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256, true) ||
39905                                checkBitcastSrcVectorSize(Src, 512, true))) {
39906       SExtVT = MVT::v8i32;
39907       PropagateSExt = true;
39908     }
39909     break;
39910   case MVT::v16i1:
39911     SExtVT = MVT::v16i8;
39912     // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
39913     // it is not profitable to sign-extend to 256-bit because this will
39914     // require an extra cross-lane shuffle which is more expensive than
39915     // truncating the result of the compare to 128-bits.
39916     break;
39917   case MVT::v32i1:
39918     SExtVT = MVT::v32i8;
39919     break;
39920   case MVT::v64i1:
39921     // If we have AVX512F but not AVX512BW, and the input is truncated from
39922     // v64i8 (checked earlier), then split the input and make two pmovmskbs.
39923     if (Subtarget.hasAVX512()) {
39924       if (Subtarget.hasBWI())
39925         return SDValue();
39926       SExtVT = MVT::v64i8;
39927       break;
39928     }
39929     // Split if this is a <64 x i8> comparison result.
39930     if (checkBitcastSrcVectorSize(Src, 512, false)) {
39931       SExtVT = MVT::v64i8;
39932       break;
39933     }
39934     return SDValue();
39935   };
39936 
39937   SDValue V = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL)
39938                             : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
39939 
39940   if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8 || SExtVT == MVT::v64i8) {
39941     V = getPMOVMSKB(DL, V, DAG, Subtarget);
39942   } else {
39943     if (SExtVT == MVT::v8i16)
39944       V = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, V,
39945                       DAG.getUNDEF(MVT::v8i16));
39946     V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
39947   }
39948 
39949   EVT IntVT =
39950       EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());
39951   V = DAG.getZExtOrTrunc(V, DL, IntVT);
39952   return DAG.getBitcast(VT, V);
39953 }
39954 
39955 // Convert a vXi1 constant build vector to the same width scalar integer.
39956 static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
39957   EVT SrcVT = Op.getValueType();
39958   assert(SrcVT.getVectorElementType() == MVT::i1 &&
39959          "Expected a vXi1 vector");
39960   assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
39961          "Expected a constant build vector");
39962 
39963   APInt Imm(SrcVT.getVectorNumElements(), 0);
39964   for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {
39965     SDValue In = Op.getOperand(Idx);
39966     if (!In.isUndef() && (cast<ConstantSDNode>(In)->getZExtValue() & 0x1))
39967       Imm.setBit(Idx);
39968   }
39969   EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth());
39970   return DAG.getConstant(Imm, SDLoc(Op), IntVT);
39971 }
39972 
39973 static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
39974                                            TargetLowering::DAGCombinerInfo &DCI,
39975                                            const X86Subtarget &Subtarget) {
39976   assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");
39977 
39978   if (!DCI.isBeforeLegalizeOps())
39979     return SDValue();
39980 
39981   // Only do this if we have k-registers.
39982   if (!Subtarget.hasAVX512())
39983     return SDValue();
39984 
39985   EVT DstVT = N->getValueType(0);
39986   SDValue Op = N->getOperand(0);
39987   EVT SrcVT = Op.getValueType();
39988 
39989   if (!Op.hasOneUse())
39990     return SDValue();
39991 
39992   // Look for logic ops.
39993   if (Op.getOpcode() != ISD::AND &&
39994       Op.getOpcode() != ISD::OR &&
39995       Op.getOpcode() != ISD::XOR)
39996     return SDValue();
39997 
39998   // Make sure we have a bitcast between mask registers and a scalar type.
39999   if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
40000         DstVT.isScalarInteger()) &&
40001       !(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 &&
40002         SrcVT.isScalarInteger()))
40003     return SDValue();
40004 
40005   SDValue LHS = Op.getOperand(0);
40006   SDValue RHS = Op.getOperand(1);
40007 
40008   if (LHS.hasOneUse() && LHS.getOpcode() == ISD::BITCAST &&
40009       LHS.getOperand(0).getValueType() == DstVT)
40010     return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT, LHS.getOperand(0),
40011                        DAG.getBitcast(DstVT, RHS));
40012 
40013   if (RHS.hasOneUse() && RHS.getOpcode() == ISD::BITCAST &&
40014       RHS.getOperand(0).getValueType() == DstVT)
40015     return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
40016                        DAG.getBitcast(DstVT, LHS), RHS.getOperand(0));
40017 
40018   // If the RHS is a vXi1 build vector, this is a good reason to flip too.
40019   // Most of these have to move a constant from the scalar domain anyway.
40020   if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) {
40021     RHS = combinevXi1ConstantToInteger(RHS, DAG);
40022     return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
40023                        DAG.getBitcast(DstVT, LHS), RHS);
40024   }
40025 
40026   return SDValue();
40027 }
40028 
40029 static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
40030                                     const X86Subtarget &Subtarget) {
40031   SDLoc DL(BV);
40032   unsigned NumElts = BV->getNumOperands();
40033   SDValue Splat = BV->getSplatValue();
40034 
40035   // Build MMX element from integer GPR or SSE float values.
40036   auto CreateMMXElement = [&](SDValue V) {
40037     if (V.isUndef())
40038       return DAG.getUNDEF(MVT::x86mmx);
40039     if (V.getValueType().isFloatingPoint()) {
40040       if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(V)) {
40041         V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V);
40042         V = DAG.getBitcast(MVT::v2i64, V);
40043         return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V);
40044       }
40045       V = DAG.getBitcast(MVT::i32, V);
40046     } else {
40047       V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32);
40048     }
40049     return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V);
40050   };
40051 
40052   // Convert build vector ops to MMX data in the bottom elements.
40053   SmallVector<SDValue, 8> Ops;
40054 
40055   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40056 
40057   // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
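        // e.g. splatting an i8: PUNPCKLBW duplicates it into the low 16 bits,
        // then PSHUFW with a zero mask repeats that word across the register.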
40058   if (Splat) {
40059     if (Splat.isUndef())
40060       return DAG.getUNDEF(MVT::x86mmx);
40061 
40062     Splat = CreateMMXElement(Splat);
40063 
40064     if (Subtarget.hasSSE1()) {
40065       // Unpack v8i8 to splat i8 elements to lowest 16-bits.
40066       if (NumElts == 8)
40067         Splat = DAG.getNode(
40068             ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
40069             DAG.getTargetConstant(Intrinsic::x86_mmx_punpcklbw, DL,
40070                                   TLI.getPointerTy(DAG.getDataLayout())),
40071             Splat, Splat);
40072 
40073       // Use PSHUFW to repeat 16-bit elements.
40074       unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
40075       return DAG.getNode(
40076           ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
40077           DAG.getTargetConstant(Intrinsic::x86_sse_pshuf_w, DL,
40078                                 TLI.getPointerTy(DAG.getDataLayout())),
40079           Splat, DAG.getTargetConstant(ShufMask, DL, MVT::i8));
40080     }
40081     Ops.append(NumElts, Splat);
40082   } else {
40083     for (unsigned i = 0; i != NumElts; ++i)
40084       Ops.push_back(CreateMMXElement(BV->getOperand(i)));
40085   }
40086 
40087   // Use tree of PUNPCKLs to build up general MMX vector.
40088   while (Ops.size() > 1) {
40089     unsigned NumOps = Ops.size();
40090     unsigned IntrinOp =
40091         (NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
40092                      : (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
40093                                     : Intrinsic::x86_mmx_punpcklbw));
40094     SDValue Intrin = DAG.getTargetConstant(
40095         IntrinOp, DL, TLI.getPointerTy(DAG.getDataLayout()));
40096     for (unsigned i = 0; i != NumOps; i += 2)
40097       Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
40098                                Ops[i], Ops[i + 1]);
40099     Ops.resize(NumOps / 2);
40100   }
40101 
40102   return Ops[0];
40103 }
40104 
40105 // Recursive function that attempts to find if a bool vector node was originally
40106 // a vector/float/double that got truncated/extended/bitcast to/from a scalar
40107 // integer. If so, replace the scalar ops with bool vector equivalents back down
40108 // the chain.
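      // e.g. (v8i1 bitcast (i8 or (i8 bitcast (v8i1 X)), (i8 bitcast (v8i1 Y))))
      // --> (v8i1 or X, Y), so the value can stay in a k-register.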
40109 static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL,
40110                                           SelectionDAG &DAG,
40111                                           const X86Subtarget &Subtarget) {
40112   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40113   unsigned Opc = V.getOpcode();
40114   switch (Opc) {
40115   case ISD::BITCAST: {
40116     // Bitcast from a vector/float/double, we can cheaply bitcast to VT.
40117     SDValue Src = V.getOperand(0);
40118     EVT SrcVT = Src.getValueType();
40119     if (SrcVT.isVector() || SrcVT.isFloatingPoint())
40120       return DAG.getBitcast(VT, Src);
40121     break;
40122   }
40123   case ISD::TRUNCATE: {
40124     // If we find a suitable source, a truncated scalar becomes a subvector.
40125     SDValue Src = V.getOperand(0);
40126     EVT NewSrcVT =
40127         EVT::getVectorVT(*DAG.getContext(), MVT::i1, Src.getValueSizeInBits());
40128     if (TLI.isTypeLegal(NewSrcVT))
40129       if (SDValue N0 =
40130               combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
40131         return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N0,
40132                            DAG.getIntPtrConstant(0, DL));
40133     break;
40134   }
40135   case ISD::ANY_EXTEND:
40136   case ISD::ZERO_EXTEND: {
40137     // If we find a suitable source, an extended scalar becomes a subvector.
40138     SDValue Src = V.getOperand(0);
40139     EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
40140                                     Src.getScalarValueSizeInBits());
40141     if (TLI.isTypeLegal(NewSrcVT))
40142       if (SDValue N0 =
40143               combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
40144         return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
40145                            Opc == ISD::ANY_EXTEND ? DAG.getUNDEF(VT)
40146                                                   : DAG.getConstant(0, DL, VT),
40147                            N0, DAG.getIntPtrConstant(0, DL));
40148     break;
40149   }
40150   case ISD::OR: {
40151     // If we find suitable sources, we can just move an OR to the vector domain.
40152     SDValue Src0 = V.getOperand(0);
40153     SDValue Src1 = V.getOperand(1);
40154     if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
40155       if (SDValue N1 = combineBitcastToBoolVector(VT, Src1, DL, DAG, Subtarget))
40156         return DAG.getNode(Opc, DL, VT, N0, N1);
40157     break;
40158   }
40159   case ISD::SHL: {
40160     // If we find a suitable source, a SHL becomes a KSHIFTL.
40161     SDValue Src0 = V.getOperand(0);
40162     if ((VT == MVT::v8i1 && !Subtarget.hasDQI()) ||
40163         ((VT == MVT::v32i1 || VT == MVT::v64i1) && !Subtarget.hasBWI()))
40164       break;
40165 
40166     if (auto *Amt = dyn_cast<ConstantSDNode>(V.getOperand(1)))
40167       if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
40168         return DAG.getNode(
40169             X86ISD::KSHIFTL, DL, VT, N0,
40170             DAG.getTargetConstant(Amt->getZExtValue(), DL, MVT::i8));
40171     break;
40172   }
40173   }
40174   return SDValue();
40175 }
40176 
40177 static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
40178                               TargetLowering::DAGCombinerInfo &DCI,
40179                               const X86Subtarget &Subtarget) {
40180   SDValue N0 = N->getOperand(0);
40181   EVT VT = N->getValueType(0);
40182   EVT SrcVT = N0.getValueType();
40183   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40184 
40185   // Try to match patterns such as
40186   // (i16 bitcast (v16i1 x))
40187   // ->
40188   // (i16 movmsk (16i8 sext (v16i1 x)))
40189   // before the setcc result is scalarized on subtargets that don't have legal
40190   // vxi1 types.
40191   if (DCI.isBeforeLegalize()) {
40192     SDLoc dl(N);
40193     if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
40194       return V;
40195 
40196     // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
40197     // type, widen both sides to avoid a trip through memory.
40198     if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
40199         Subtarget.hasAVX512()) {
40200       N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
40201       N0 = DAG.getBitcast(MVT::v8i1, N0);
40202       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
40203                          DAG.getIntPtrConstant(0, dl));
40204     }
40205 
40206     // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
40207     // type, widen both sides to avoid a trip through memory.
40208     if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
40209         Subtarget.hasAVX512()) {
40210       // Use zeros for the widening if we already have some zeroes. This can
40211       // allow SimplifyDemandedBits to remove scalar ANDs that may be down
40212       // stream of this.
40213       // FIXME: It might make sense to detect a concat_vectors with a mix of
40214       // zeroes and undef and turn it into insert_subvector for i1 vectors as
40215       // a separate combine. What we can't do is canonicalize the operands of
40216       // such a concat or we'll get into a loop with SimplifyDemandedBits.
40217       if (N0.getOpcode() == ISD::CONCAT_VECTORS) {
40218         SDValue LastOp = N0.getOperand(N0.getNumOperands() - 1);
40219         if (ISD::isBuildVectorAllZeros(LastOp.getNode())) {
40220           SrcVT = LastOp.getValueType();
40221           unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
40222           SmallVector<SDValue, 4> Ops(N0->op_begin(), N0->op_end());
40223           Ops.resize(NumConcats, DAG.getConstant(0, dl, SrcVT));
40224           N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
40225           N0 = DAG.getBitcast(MVT::i8, N0);
40226           return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
40227         }
40228       }
40229 
40230       unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
40231       SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
40232       Ops[0] = N0;
40233       N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
40234       N0 = DAG.getBitcast(MVT::i8, N0);
40235       return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
40236     }
40237   } else {
40238     // If we're bitcasting from iX to vXi1, see if the integer originally
40239     // began as a vXi1 and whether we can remove the bitcast entirely.
40240     if (VT.isVector() && VT.getScalarType() == MVT::i1 &&
40241         SrcVT.isScalarInteger() && TLI.isTypeLegal(VT)) {
40242       if (SDValue V =
40243               combineBitcastToBoolVector(VT, N0, SDLoc(N), DAG, Subtarget))
40244         return V;
40245     }
40246   }
40247 
40248   // Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and
40249   // replace with (i8 (trunc (i16 (bitcast (v16i1 X))))). This can occur
40250   // due to insert_subvector legalization on KNL. By promoting the copy to i16
40251   // we can help with known bits propagation from the vXi1 domain to the
40252   // scalar domain.
40253   if (VT == MVT::i8 && SrcVT == MVT::v8i1 && Subtarget.hasAVX512() &&
40254       !Subtarget.hasDQI() && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
40255       N0.getOperand(0).getValueType() == MVT::v16i1 &&
40256       isNullConstant(N0.getOperand(1)))
40257     return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
40258                        DAG.getBitcast(MVT::i16, N0.getOperand(0)));
40259 
40260   // Canonicalize (bitcast (vbroadcast_load)) so that the output of the bitcast
40261   // and the vbroadcast_load are both integer or both fp. In some cases this
40262   // will remove the bitcast entirely.
40263   if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() &&
40264        VT.isFloatingPoint() != SrcVT.isFloatingPoint() && VT.isVector()) {
40265     auto *BCast = cast<MemIntrinsicSDNode>(N0);
40266     unsigned SrcVTSize = SrcVT.getScalarSizeInBits();
40267     unsigned MemSize = BCast->getMemoryVT().getScalarSizeInBits();
40268     // Don't swap i8/i16 since we don't have fp types of that size.
40269     if (MemSize >= 32) {
40270       MVT MemVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(MemSize)
40271                                        : MVT::getIntegerVT(MemSize);
40272       MVT LoadVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(SrcVTSize)
40273                                         : MVT::getIntegerVT(SrcVTSize);
40274       LoadVT = MVT::getVectorVT(LoadVT, SrcVT.getVectorNumElements());
40275 
40276       SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
40277       SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
40278       SDValue ResNode =
40279           DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
40280                                   MemVT, BCast->getMemOperand());
40281       DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1));
40282       return DAG.getBitcast(VT, ResNode);
40283     }
40284   }
40285 
40286   // Since MMX types are special and don't usually play with other vector types,
40287   // it's better to handle them early to be sure we emit efficient code by
40288   // avoiding store-load conversions.
40289   if (VT == MVT::x86mmx) {
40290     // Detect MMX constant vectors.
40291     APInt UndefElts;
40292     SmallVector<APInt, 1> EltBits;
40293     if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits)) {
40294       SDLoc DL(N0);
40295       // Handle zero-extension of i32 with MOVD.
40296       if (EltBits[0].countLeadingZeros() >= 32)
40297         return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
40298                            DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32));
40299       // Else, bitcast to a double.
40300       // TODO - investigate supporting sext 32-bit immediates on x86_64.
40301       APFloat F64(APFloat::IEEEdouble(), EltBits[0]);
40302       return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64));
40303     }
40304 
40305     // Detect bitcasts to x86mmx low word.
40306     if (N0.getOpcode() == ISD::BUILD_VECTOR &&
40307         (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) &&
40308         N0.getOperand(0).getValueType() == SrcVT.getScalarType()) {
40309       bool LowUndef = true, AllUndefOrZero = true;
40310       for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
40311         SDValue Op = N0.getOperand(i);
40312         LowUndef &= Op.isUndef() || (i >= e/2);
40313         AllUndefOrZero &= (Op.isUndef() || isNullConstant(Op));
40314       }
40315       if (AllUndefOrZero) {
40316         SDValue N00 = N0.getOperand(0);
40317         SDLoc dl(N00);
40318         N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32)
40319                        : DAG.getZExtOrTrunc(N00, dl, MVT::i32);
40320         return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00);
40321       }
40322     }
40323 
40324     // Detect bitcasts of 64-bit build vectors and convert to a
40325     // MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the
40326     // lowest element.
40327     if (N0.getOpcode() == ISD::BUILD_VECTOR &&
40328         (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
40329          SrcVT == MVT::v8i8))
40330       return createMMXBuildVector(cast<BuildVectorSDNode>(N0), DAG, Subtarget);
40331 
40332     // Detect bitcasts between element or subvector extraction to x86mmx.
40333     if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
40334          N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
40335         isNullConstant(N0.getOperand(1))) {
40336       SDValue N00 = N0.getOperand(0);
40337       if (N00.getValueType().is128BitVector())
40338         return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
40339                            DAG.getBitcast(MVT::v2i64, N00));
40340     }
40341 
40342     // Detect bitcasts from FP_TO_SINT to x86mmx.
40343     if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
40344       SDLoc DL(N0);
40345       SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
40346                                 DAG.getUNDEF(MVT::v2i32));
40347       return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
40348                          DAG.getBitcast(MVT::v2i64, Res));
40349     }
40350   }
40351 
40352   // Try to remove a bitcast of constant vXi1 vector. We have to legalize
40353   // most of these to scalar anyway.
40354   if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
40355       SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
40356       ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
40357     return combinevXi1ConstantToInteger(N0, DAG);
40358   }
40359 
40360   if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
40361       VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
40362       isa<ConstantSDNode>(N0)) {
40363     auto *C = cast<ConstantSDNode>(N0);
40364     if (C->isAllOnesValue())
40365       return DAG.getConstant(1, SDLoc(N0), VT);
40366     if (C->isNullValue())
40367       return DAG.getConstant(0, SDLoc(N0), VT);
40368   }
40369 
40370   // Look for MOVMSK that is maybe truncated and then bitcasted to vXi1.
40371   // Turn it into a sign bit compare that produces a k-register. This avoids
40372   // a trip through a GPR.
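        // e.g. (v4i1 bitcast (i4 trunc (i32 MOVMSK (v4i32 X))))
        // --> (v4i1 setcc X, 0, setlt).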
40373   if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
40374       VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
40375       isPowerOf2_32(VT.getVectorNumElements())) {
40376     unsigned NumElts = VT.getVectorNumElements();
40377     SDValue Src = N0;
40378 
40379     // Peek through truncate.
40380     if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
40381       Src = N0.getOperand(0);
40382 
40383     if (Src.getOpcode() == X86ISD::MOVMSK && Src.hasOneUse()) {
40384       SDValue MovmskIn = Src.getOperand(0);
40385       MVT MovmskVT = MovmskIn.getSimpleValueType();
40386       unsigned MovMskElts = MovmskVT.getVectorNumElements();
40387 
40388       // We allow extra bits of the movmsk to be used since they are known zero.
40389       // We can't convert a VPMOVMSKB without avx512bw.
40390       if (MovMskElts <= NumElts &&
40391           (Subtarget.hasBWI() || MovmskVT.getVectorElementType() != MVT::i8)) {
40392         EVT IntVT = EVT(MovmskVT).changeVectorElementTypeToInteger();
40393         MovmskIn = DAG.getBitcast(IntVT, MovmskIn);
40394         SDLoc dl(N);
40395         MVT CmpVT = MVT::getVectorVT(MVT::i1, MovMskElts);
40396         SDValue Cmp = DAG.getSetCC(dl, CmpVT, MovmskIn,
40397                                    DAG.getConstant(0, dl, IntVT), ISD::SETLT);
40398         if (EVT(CmpVT) == VT)
40399           return Cmp;
40400 
40401         // Pad with zeroes up to original VT to replace the zeroes that were
40402         // being used from the MOVMSK.
40403         unsigned NumConcats = NumElts / MovMskElts;
40404         SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, CmpVT));
40405         Ops[0] = Cmp;
40406         return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Ops);
40407       }
40408     }
40409   }
40410 
40411   // Try to remove bitcasts from input and output of mask arithmetic to
40412   // remove GPR<->K-register crossings.
40413   if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
40414     return V;
40415 
40416   // Convert a bitcasted integer logic operation that has one bitcasted
40417   // floating-point operand into a floating-point logic operation. This may
40418   // create a load of a constant, but that is cheaper than materializing the
40419   // constant in an integer register and transferring it to an SSE register or
40420   // transferring the SSE operand to integer register and back.
40421   unsigned FPOpcode;
40422   switch (N0.getOpcode()) {
40423     case ISD::AND: FPOpcode = X86ISD::FAND; break;
40424     case ISD::OR:  FPOpcode = X86ISD::FOR;  break;
40425     case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
40426     default: return SDValue();
40427   }
40428 
40429   // Check if we have a bitcast from another integer type as well.
40430   if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
40431         (Subtarget.hasSSE2() && VT == MVT::f64) ||
40432         (Subtarget.hasSSE2() && VT.isInteger() && VT.isVector() &&
40433          TLI.isTypeLegal(VT))))
40434     return SDValue();
40435 
40436   SDValue LogicOp0 = N0.getOperand(0);
40437   SDValue LogicOp1 = N0.getOperand(1);
40438   SDLoc DL0(N0);
40439 
40440   // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
40441   if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
40442       LogicOp0.hasOneUse() && LogicOp0.getOperand(0).hasOneUse() &&
40443       LogicOp0.getOperand(0).getValueType() == VT &&
40444       !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
40445     SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
40446     unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
40447     return DAG.getNode(Opcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
40448   }
40449   // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
40450   if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
40451       LogicOp1.hasOneUse() && LogicOp1.getOperand(0).hasOneUse() &&
40452       LogicOp1.getOperand(0).getValueType() == VT &&
40453       !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
40454     SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
40455     unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
40456     return DAG.getNode(Opcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
40457   }
40458 
40459   return SDValue();
40460 }
40461 
40462 // Given an ABS node, detect the following pattern:
40463 // (ABS (SUB (ZERO_EXTEND a), (ZERO_EXTEND b))).
40464 // This is useful as it is the input into a SAD pattern.
40465 static bool detectZextAbsDiff(const SDValue &Abs, SDValue &Op0, SDValue &Op1) {
40466   SDValue AbsOp1 = Abs->getOperand(0);
40467   if (AbsOp1.getOpcode() != ISD::SUB)
40468     return false;
40469 
40470   Op0 = AbsOp1.getOperand(0);
40471   Op1 = AbsOp1.getOperand(1);
40472 
40473   // Check if the operands of the sub are zero-extended from vectors of i8.
40474   if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
40475       Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
40476       Op1.getOpcode() != ISD::ZERO_EXTEND ||
40477       Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
40478     return false;
40479 
40480   return true;
40481 }
40482 
40483 // Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
40484 // to these zexts.
40485 static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
40486                             const SDValue &Zext1, const SDLoc &DL,
40487                             const X86Subtarget &Subtarget) {
40488   // Find the appropriate width for the PSADBW.
40489   EVT InVT = Zext0.getOperand(0).getValueType();
40490   unsigned RegSize = std::max(128u, (unsigned)InVT.getSizeInBits());
40491 
40492   // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
40493   // fill in the missing vector elements with 0.
40494   unsigned NumConcat = RegSize / InVT.getSizeInBits();
40495   SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
40496   Ops[0] = Zext0.getOperand(0);
40497   MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
40498   SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
40499   Ops[0] = Zext1.getOperand(0);
40500   SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
40501 
40502   // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
40503   auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
40504                           ArrayRef<SDValue> Ops) {
40505     MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
40506     return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops);
40507   };
40508   MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
40509   return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, { SadOp0, SadOp1 },
40510                           PSADBWBuilder);
40511 }
40512 
40513 // Attempt to replace a min/max v8i16/v16i8 horizontal reduction with
40514 // PHMINPOSUW.
40515 static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG,
40516                                       const X86Subtarget &Subtarget) {
40517   // Bail without SSE41.
40518   if (!Subtarget.hasSSE41())
40519     return SDValue();
40520 
40521   EVT ExtractVT = Extract->getValueType(0);
40522   if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
40523     return SDValue();
40524 
40525   // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
40526   ISD::NodeType BinOp;
40527   SDValue Src = DAG.matchBinOpReduction(
40528       Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}, true);
40529   if (!Src)
40530     return SDValue();
40531 
40532   EVT SrcVT = Src.getValueType();
40533   EVT SrcSVT = SrcVT.getScalarType();
40534   if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
40535     return SDValue();
40536 
40537   SDLoc DL(Extract);
40538   SDValue MinPos = Src;
40539 
40540   // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
40541   while (SrcVT.getSizeInBits() > 128) {
40542     SDValue Lo, Hi;
40543     std::tie(Lo, Hi) = splitVector(MinPos, DAG, DL);
40544     SrcVT = Lo.getValueType();
40545     MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
40546   }
40547   assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
40548           (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
40549          "Unexpected value type");
40550 
40551   // PHMINPOSUW applies to UMIN(v8i16); for SMIN/SMAX/UMAX we must first apply
40552   // a mask to flip the values accordingly.
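  // The XOR masks below are chosen so that an unsigned-min over the masked
  // values picks the element the original reduction would have picked: SMIN
  // flips only the sign bit, SMAX flips every bit except the sign bit, and
  // UMAX inverts all bits. Since XOR with the same mask is an involution,
  // applying it again after the PHMINPOSUW restores the original value.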
40553   SDValue Mask;
40554   unsigned MaskEltsBits = ExtractVT.getSizeInBits();
40555   if (BinOp == ISD::SMAX)
40556     Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
40557   else if (BinOp == ISD::SMIN)
40558     Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
40559   else if (BinOp == ISD::UMAX)
40560     Mask = DAG.getConstant(APInt::getAllOnesValue(MaskEltsBits), DL, SrcVT);
40561 
40562   if (Mask)
40563     MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
40564 
40565   // For v16i8 cases we need to perform UMIN on pairs of byte elements,
40566   // shuffling each upper element down and inserting zeros. This means that the
40567   // v16i8 UMIN will leave the upper elements as zero, performing the
40568   // zero-extension ready for the PHMINPOS.
40569   if (ExtractVT == MVT::i8) {
40570     SDValue Upper = DAG.getVectorShuffle(
40571         SrcVT, DL, MinPos, DAG.getConstant(0, DL, MVT::v16i8),
40572         {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
40573     MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
40574   }
40575 
40576   // Perform the PHMINPOS on a v8i16 vector.
40577   MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
40578   MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
40579   MinPos = DAG.getBitcast(SrcVT, MinPos);
40580 
40581   if (Mask)
40582     MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
40583 
40584   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
40585                      DAG.getIntPtrConstant(0, DL));
40586 }
40587 
40588 // Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK.
40589 static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG,
40590                                          const X86Subtarget &Subtarget) {
40591   // Bail without SSE2.
40592   if (!Subtarget.hasSSE2())
40593     return SDValue();
40594 
40595   EVT ExtractVT = Extract->getValueType(0);
40596   unsigned BitWidth = ExtractVT.getSizeInBits();
40597   if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
40598       ExtractVT != MVT::i8 && ExtractVT != MVT::i1)
40599     return SDValue();
40600 
40601   // Check for OR(any_of)/AND(all_of)/XOR(parity) horizontal reduction patterns.
40602   ISD::NodeType BinOp;
40603   SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
40604   if (!Match && ExtractVT == MVT::i1)
40605     Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::XOR});
40606   if (!Match)
40607     return SDValue();
40608 
40609   // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
40610   // which we can't support here for now.
40611   if (Match.getScalarValueSizeInBits() != BitWidth)
40612     return SDValue();
40613 
40614   SDValue Movmsk;
40615   SDLoc DL(Extract);
40616   EVT MatchVT = Match.getValueType();
40617   unsigned NumElts = MatchVT.getVectorNumElements();
40618   unsigned MaxElts = Subtarget.hasInt256() ? 32 : 16;
40619   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40620 
40621   if (ExtractVT == MVT::i1) {
40622     // Special case for (pre-legalization) vXi1 reductions.
40623     if (NumElts > 64 || !isPowerOf2_32(NumElts))
40624       return SDValue();
40625     if (TLI.isTypeLegal(MatchVT)) {
40626       // If this is a legal AVX512 predicate type then we can just bitcast.
40627       EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
40628       Movmsk = DAG.getBitcast(MovmskVT, Match);
40629     } else {
40630       // For all_of(setcc(vec,0,eq)) - avoid vXi64 comparisons if we don't have
40631       // PCMPEQQ (SSE41+), use PCMPEQD instead.
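      // An i64 lane is zero iff both of its i32 halves are zero, so for an
      // all_of reduction we can compare twice as many i32 lanes against zero
      // with PCMPEQD without changing the result.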
40632       if (BinOp == ISD::AND && !Subtarget.hasSSE41() &&
40633           Match.getOpcode() == ISD::SETCC &&
40634           ISD::isBuildVectorAllZeros(Match.getOperand(1).getNode()) &&
40635           cast<CondCodeSDNode>(Match.getOperand(2))->get() ==
40636               ISD::CondCode::SETEQ) {
40637         SDValue Vec = Match.getOperand(0);
40638         if (Vec.getValueType().getScalarType() == MVT::i64 &&
40639             (2 * NumElts) <= MaxElts) {
40640           NumElts *= 2;
40641           EVT CmpVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
40642           MatchVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
40643           Match = DAG.getSetCC(
40644               DL, MatchVT, DAG.getBitcast(CmpVT, Match.getOperand(0)),
40645               DAG.getBitcast(CmpVT, Match.getOperand(1)), ISD::CondCode::SETEQ);
40646         }
40647       }
40648 
40649       // Use combineBitcastvxi1 to create the MOVMSK.
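      // First halve the vector with the reduction op until it is narrow enough
      // for a single MOVMSK (at most MaxElts elements).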
40650       while (NumElts > MaxElts) {
40651         SDValue Lo, Hi;
40652         std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
40653         Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
40654         NumElts /= 2;
40655       }
40656       EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
40657       Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget);
40658     }
40659     if (!Movmsk)
40660       return SDValue();
40661     Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32);
40662   } else {
40663     // FIXME: Better handling of k-registers or 512-bit vectors?
40664     unsigned MatchSizeInBits = Match.getValueSizeInBits();
40665     if (!(MatchSizeInBits == 128 ||
40666           (MatchSizeInBits == 256 && Subtarget.hasAVX())))
40667       return SDValue();
40668 
40669     // Make sure this isn't a vector of 1 element. The perf win from using
40670   // MOVMSK diminishes with fewer elements in the reduction, but it is
40671     // generally better to get the comparison over to the GPRs as soon as
40672     // possible to reduce the number of vector ops.
40673     if (Match.getValueType().getVectorNumElements() < 2)
40674       return SDValue();
40675 
40676     // Check that we are extracting a reduction of all sign bits.
40677     if (DAG.ComputeNumSignBits(Match) != BitWidth)
40678       return SDValue();
40679 
40680     if (MatchSizeInBits == 256 && BitWidth < 32 && !Subtarget.hasInt256()) {
40681       SDValue Lo, Hi;
40682       std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
40683       Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
40684       MatchSizeInBits = Match.getValueSizeInBits();
40685     }
40686 
40687     // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
40688     MVT MaskSrcVT;
40689     if (64 == BitWidth || 32 == BitWidth)
40690       MaskSrcVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
40691                                    MatchSizeInBits / BitWidth);
40692     else
40693       MaskSrcVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
40694 
40695     SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match);
40696     Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget);
40697     NumElts = MaskSrcVT.getVectorNumElements();
40698   }
40699   assert((NumElts <= 32 || NumElts == 64) &&
40700          "Not expecting more than 64 elements");
40701 
40702   MVT CmpVT = NumElts == 64 ? MVT::i64 : MVT::i32;
40703   if (BinOp == ISD::XOR) {
40704     // parity -> (PARITY(MOVMSK X))
40705     SDValue Result = DAG.getNode(ISD::PARITY, DL, CmpVT, Movmsk);
40706     return DAG.getZExtOrTrunc(Result, DL, ExtractVT);
40707   }
40708 
40709   SDValue CmpC;
40710   ISD::CondCode CondCode;
40711   if (BinOp == ISD::OR) {
40712     // any_of -> MOVMSK != 0
40713     CmpC = DAG.getConstant(0, DL, CmpVT);
40714     CondCode = ISD::CondCode::SETNE;
40715   } else {
40716     // all_of -> MOVMSK == ((1 << NumElts) - 1)
40717     CmpC = DAG.getConstant(APInt::getLowBitsSet(CmpVT.getSizeInBits(), NumElts),
40718                            DL, CmpVT);
40719     CondCode = ISD::CondCode::SETEQ;
40720   }
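  // E.g. an all_of reduction of a v4i32 compare becomes (MOVMSK mask) == 0xF,
  // while the corresponding any_of reduction becomes (MOVMSK mask) != 0.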
40721 
40722   // The setcc produces an i8 of 0/1, so extend that to the result width and
40723   // negate to get the final 0/-1 mask value.
40724   EVT SetccVT =
40725       TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), CmpVT);
40726   SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode);
40727   SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT);
40728   SDValue Zero = DAG.getConstant(0, DL, ExtractVT);
40729   return DAG.getNode(ISD::SUB, DL, ExtractVT, Zero, Zext);
40730 }
40731 
40732 static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
40733                                       const X86Subtarget &Subtarget) {
40734   // PSADBW is only supported on SSE2 and up.
40735   if (!Subtarget.hasSSE2())
40736     return SDValue();
40737 
40738   EVT ExtractVT = Extract->getValueType(0);
40739   // Verify the type we're extracting is either i32 or i64.
40740   // FIXME: Could support other types, but this is what we have coverage for.
40741   if (ExtractVT != MVT::i32 && ExtractVT != MVT::i64)
40742     return SDValue();
40743 
40744   EVT VT = Extract->getOperand(0).getValueType();
40745   if (!isPowerOf2_32(VT.getVectorNumElements()))
40746     return SDValue();
40747 
40748   // Match shuffle + add pyramid.
40749   ISD::NodeType BinOp;
40750   SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
40751 
40752   // The operand is expected to be zero-extended from i8
40753   // (verified in detectZextAbsDiff).
40754   // In order to convert to i64 and above, an additional any/zero/sign
40755   // extend is expected.
40756   // The zero extend from 32 bits has no mathematical effect on the result,
40757   // and the sign extend is effectively a zero extend as well
40758   // (it extends the sign bit, which is zero).
40759   // So it is correct to skip the sign/zero extend instruction.
40760   if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
40761                Root.getOpcode() == ISD::ZERO_EXTEND ||
40762                Root.getOpcode() == ISD::ANY_EXTEND))
40763     Root = Root.getOperand(0);
40764 
40765   // If there was a match, we want Root to be the ABS node at the root of an
40766   // abs-diff pattern.
40767   if (!Root || Root.getOpcode() != ISD::ABS)
40768     return SDValue();
40769 
40770   // Check whether we have an abs-diff pattern feeding into the ABS node.
40771   SDValue Zext0, Zext1;
40772   if (!detectZextAbsDiff(Root, Zext0, Zext1))
40773     return SDValue();
40774 
40775   // Create the SAD instruction.
40776   SDLoc DL(Extract);
40777   SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL, Subtarget);
40778 
40779   // If the original vector was wider than 8 elements, sum over the results
40780   // in the SAD vector.
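  // Each PSADBW lane holds the sum of absolute differences for 8 input bytes,
  // so fold the remaining 64-bit partial sums together by repeatedly adding
  // the upper half of the SAD vector onto the lower half.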
40781   unsigned Stages = Log2_32(VT.getVectorNumElements());
40782   EVT SadVT = SAD.getValueType();
40783   if (Stages > 3) {
40784     unsigned SadElems = SadVT.getVectorNumElements();
40785 
40786     for(unsigned i = Stages - 3; i > 0; --i) {
40787       SmallVector<int, 16> Mask(SadElems, -1);
40788       for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
40789         Mask[j] = MaskEnd + j;
40790 
40791       SDValue Shuffle =
40792           DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
40793       SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
40794     }
40795   }
40796 
40797   unsigned ExtractSizeInBits = ExtractVT.getSizeInBits();
40798   // Return the lowest ExtractSizeInBits bits.
40799   EVT ResVT = EVT::getVectorVT(*DAG.getContext(), ExtractVT,
40800                                SadVT.getSizeInBits() / ExtractSizeInBits);
40801   SAD = DAG.getBitcast(ResVT, SAD);
40802   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, SAD,
40803                      Extract->getOperand(1));
40804 }
40805 
40806 // Attempt to peek through a target shuffle and extract the scalar from the
40807 // source.
40808 static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
40809                                          TargetLowering::DAGCombinerInfo &DCI,
40810                                          const X86Subtarget &Subtarget) {
40811   if (DCI.isBeforeLegalizeOps())
40812     return SDValue();
40813 
40814   SDLoc dl(N);
40815   SDValue Src = N->getOperand(0);
40816   SDValue Idx = N->getOperand(1);
40817 
40818   EVT VT = N->getValueType(0);
40819   EVT SrcVT = Src.getValueType();
40820   EVT SrcSVT = SrcVT.getVectorElementType();
40821   unsigned SrcEltBits = SrcSVT.getSizeInBits();
40822   unsigned NumSrcElts = SrcVT.getVectorNumElements();
40823 
40824   // Don't attempt this for boolean mask vectors or unknown extraction indices.
40825   if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
40826     return SDValue();
40827 
40828   const APInt &IdxC = N->getConstantOperandAPInt(1);
40829   if (IdxC.uge(NumSrcElts))
40830     return SDValue();
40831 
40832   SDValue SrcBC = peekThroughBitcasts(Src);
40833 
40834   // Handle extract(bitcast(broadcast(scalar_value))).
40835   if (X86ISD::VBROADCAST == SrcBC.getOpcode()) {
40836     SDValue SrcOp = SrcBC.getOperand(0);
40837     EVT SrcOpVT = SrcOp.getValueType();
40838     if (SrcOpVT.isScalarInteger() && VT.isInteger() &&
40839         (SrcOpVT.getSizeInBits() % SrcEltBits) == 0) {
40840       unsigned Scale = SrcOpVT.getSizeInBits() / SrcEltBits;
40841       unsigned Offset = IdxC.urem(Scale) * SrcEltBits;
40842       // TODO support non-zero offsets.
40843       if (Offset == 0) {
40844         SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, SrcVT.getScalarType());
40845         SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, VT);
40846         return SrcOp;
40847       }
40848     }
40849   }
40850 
40851   // If we're extracting a single element from a broadcast load and there are
40852   // no other users, just create a single load.
40853   if (SrcBC.getOpcode() == X86ISD::VBROADCAST_LOAD && SrcBC.hasOneUse()) {
40854     auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC);
40855     unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits();
40856     if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth &&
40857         VT.getSizeInBits() == SrcBCWidth && SrcEltBits == SrcBCWidth) {
40858       SDValue Load = DAG.getLoad(VT, dl, MemIntr->getChain(),
40859                                  MemIntr->getBasePtr(),
40860                                  MemIntr->getPointerInfo(),
40861                                  MemIntr->getOriginalAlign(),
40862                                  MemIntr->getMemOperand()->getFlags());
40863       DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
40864       return Load;
40865     }
40866   }
40867 
40868   // Handle extract(bitcast(scalar_to_vector(scalar_value))) for integers.
40869   // TODO: Move to DAGCombine?
40870   if (SrcBC.getOpcode() == ISD::SCALAR_TO_VECTOR && VT.isInteger() &&
40871       SrcBC.getValueType().isInteger() &&
40872       (SrcBC.getScalarValueSizeInBits() % SrcEltBits) == 0 &&
40873       SrcBC.getScalarValueSizeInBits() ==
40874           SrcBC.getOperand(0).getValueSizeInBits()) {
40875     unsigned Scale = SrcBC.getScalarValueSizeInBits() / SrcEltBits;
40876     if (IdxC.ult(Scale)) {
40877       unsigned Offset = IdxC.getZExtValue() * SrcVT.getScalarSizeInBits();
40878       SDValue Scl = SrcBC.getOperand(0);
40879       EVT SclVT = Scl.getValueType();
40880       if (Offset) {
40881         Scl = DAG.getNode(ISD::SRL, dl, SclVT, Scl,
40882                           DAG.getShiftAmountConstant(Offset, SclVT, dl));
40883       }
40884       Scl = DAG.getZExtOrTrunc(Scl, dl, SrcVT.getScalarType());
40885       Scl = DAG.getZExtOrTrunc(Scl, dl, VT);
40886       return Scl;
40887     }
40888   }
40889 
40890   // Handle extract(truncate(x)) for 0'th index.
40891   // TODO: Treat this as a faux shuffle?
40892   // TODO: When can we use this for general indices?
40893   if (ISD::TRUNCATE == Src.getOpcode() && IdxC == 0 &&
40894       (SrcVT.getSizeInBits() % 128) == 0) {
40895     Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl);
40896     MVT ExtractVT = MVT::getVectorVT(SrcSVT.getSimpleVT(), 128 / SrcEltBits);
40897     return DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(ExtractVT, Src),
40898                        Idx);
40899   }
40900 
40901   // We can only legally extract other elements from 128-bit vectors and in
40902   // certain circumstances, depending on SSE-level.
40903   // TODO: Investigate float/double extraction if it will be just stored.
40904   auto GetLegalExtract = [&Subtarget, &DAG, &dl](SDValue Vec, EVT VecVT,
40905                                                  unsigned Idx) {
40906     EVT VecSVT = VecVT.getScalarType();
40907     if ((VecVT.is256BitVector() || VecVT.is512BitVector()) &&
40908         (VecSVT == MVT::i8 || VecSVT == MVT::i16 || VecSVT == MVT::i32 ||
40909          VecSVT == MVT::i64)) {
40910       unsigned EltSizeInBits = VecSVT.getSizeInBits();
40911       unsigned NumEltsPerLane = 128 / EltSizeInBits;
40912       unsigned LaneOffset = (Idx & ~(NumEltsPerLane - 1)) * EltSizeInBits;
40913       unsigned LaneIdx = LaneOffset / Vec.getScalarValueSizeInBits();
40914       VecVT = EVT::getVectorVT(*DAG.getContext(), VecSVT, NumEltsPerLane);
40915       Vec = extract128BitVector(Vec, LaneIdx, DAG, dl);
40916       Idx &= (NumEltsPerLane - 1);
40917     }
40918     if ((VecVT == MVT::v4i32 || VecVT == MVT::v2i64) &&
40919         ((Idx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
40920       return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VecVT.getScalarType(),
40921                          DAG.getBitcast(VecVT, Vec),
40922                          DAG.getIntPtrConstant(Idx, dl));
40923     }
40924     if ((VecVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
40925         (VecVT == MVT::v16i8 && Subtarget.hasSSE41())) {
40926       unsigned OpCode = (VecVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
40927       return DAG.getNode(OpCode, dl, MVT::i32, DAG.getBitcast(VecVT, Vec),
40928                          DAG.getTargetConstant(Idx, dl, MVT::i8));
40929     }
40930     return SDValue();
40931   };
40932 
40933   // Resolve the target shuffle inputs and mask.
40934   SmallVector<int, 16> Mask;
40935   SmallVector<SDValue, 2> Ops;
40936   if (!getTargetShuffleInputs(SrcBC, Ops, Mask, DAG))
40937     return SDValue();
40938 
40939   // Shuffle inputs must be the same size as the result.
40940   if (llvm::any_of(Ops, [SrcVT](SDValue Op) {
40941         return SrcVT.getSizeInBits() != Op.getValueSizeInBits();
40942       }))
40943     return SDValue();
40944 
40945   // Attempt to narrow/widen the shuffle mask to the correct size.
40946   if (Mask.size() != NumSrcElts) {
40947     if ((NumSrcElts % Mask.size()) == 0) {
40948       SmallVector<int, 16> ScaledMask;
40949       int Scale = NumSrcElts / Mask.size();
40950       narrowShuffleMaskElts(Scale, Mask, ScaledMask);
40951       Mask = std::move(ScaledMask);
40952     } else if ((Mask.size() % NumSrcElts) == 0) {
40953       // Simplify Mask based on demanded element.
40954       int ExtractIdx = (int)IdxC.getZExtValue();
40955       int Scale = Mask.size() / NumSrcElts;
40956       int Lo = Scale * ExtractIdx;
40957       int Hi = Scale * (ExtractIdx + 1);
40958       for (int i = 0, e = (int)Mask.size(); i != e; ++i)
40959         if (i < Lo || Hi <= i)
40960           Mask[i] = SM_SentinelUndef;
40961 
40962       SmallVector<int, 16> WidenedMask;
40963       while (Mask.size() > NumSrcElts &&
40964              canWidenShuffleElements(Mask, WidenedMask))
40965         Mask = std::move(WidenedMask);
40966     }
40967   }
40968 
40969   // If narrowing/widening failed, see if we can extract+zero-extend.
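  // E.g. for an i32 extract over a 16-element byte mask (Scale == 4), this
  // succeeds if the three mask elements above the scaled index are undef/zero,
  // so the single remaining i8 source element can simply be zero-extended to
  // the i32 result.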
40970   int ExtractIdx;
40971   EVT ExtractVT;
40972   if (Mask.size() == NumSrcElts) {
40973     ExtractIdx = Mask[IdxC.getZExtValue()];
40974     ExtractVT = SrcVT;
40975   } else {
40976     unsigned Scale = Mask.size() / NumSrcElts;
40977     if ((Mask.size() % NumSrcElts) != 0 || SrcVT.isFloatingPoint())
40978       return SDValue();
40979     unsigned ScaledIdx = Scale * IdxC.getZExtValue();
40980     if (!isUndefOrZeroInRange(Mask, ScaledIdx + 1, Scale - 1))
40981       return SDValue();
40982     ExtractIdx = Mask[ScaledIdx];
40983     EVT ExtractSVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltBits / Scale);
40984     ExtractVT = EVT::getVectorVT(*DAG.getContext(), ExtractSVT, Mask.size());
40985     assert(SrcVT.getSizeInBits() == ExtractVT.getSizeInBits() &&
40986            "Failed to widen vector type");
40987   }
40988 
40989   // If the shuffle source element is undef/zero then we can just accept it.
40990   if (ExtractIdx == SM_SentinelUndef)
40991     return DAG.getUNDEF(VT);
40992 
40993   if (ExtractIdx == SM_SentinelZero)
40994     return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
40995                                 : DAG.getConstant(0, dl, VT);
40996 
40997   SDValue SrcOp = Ops[ExtractIdx / Mask.size()];
40998   ExtractIdx = ExtractIdx % Mask.size();
40999   if (SDValue V = GetLegalExtract(SrcOp, ExtractVT, ExtractIdx))
41000     return DAG.getZExtOrTrunc(V, dl, VT);
41001 
41002   return SDValue();
41003 }
41004 
41005 /// Extracting a scalar FP value from vector element 0 is free, so extract each
41006 /// operand first, then perform the math as a scalar op.
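/// For example:
///   extractelement (fadd <4 x float> %x, %y), i32 0
///     --> fadd float (extractelement %x, i32 0), (extractelement %y, i32 0)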
41007 static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG) {
41008   assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
41009   SDValue Vec = ExtElt->getOperand(0);
41010   SDValue Index = ExtElt->getOperand(1);
41011   EVT VT = ExtElt->getValueType(0);
41012   EVT VecVT = Vec.getValueType();
41013 
41014   // TODO: If this is a unary/expensive/expand op, allow extraction from a
41015   // non-zero element because the shuffle+scalar op will be cheaper?
41016   if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT)
41017     return SDValue();
41018 
41019   // Vector FP compares don't fit the pattern of FP math ops (propagate, not
41020   // extract, the condition code), so deal with those as a special-case.
41021   if (Vec.getOpcode() == ISD::SETCC && VT == MVT::i1) {
41022     EVT OpVT = Vec.getOperand(0).getValueType().getScalarType();
41023     if (OpVT != MVT::f32 && OpVT != MVT::f64)
41024       return SDValue();
41025 
41026     // extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC
41027     SDLoc DL(ExtElt);
41028     SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
41029                                Vec.getOperand(0), Index);
41030     SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
41031                                Vec.getOperand(1), Index);
41032     return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2));
41033   }
41034 
41035   if (VT != MVT::f32 && VT != MVT::f64)
41036     return SDValue();
41037 
41038   // Vector FP selects don't fit the pattern of FP math ops (because the
41039   // condition has a different type and we have to change the opcode), so deal
41040   // with those here.
41041   // FIXME: This is restricted to pre type legalization by ensuring the setcc
41042   // has i1 elements. If we loosen this we need to convert vector bool to a
41043   // scalar bool.
41044   if (Vec.getOpcode() == ISD::VSELECT &&
41045       Vec.getOperand(0).getOpcode() == ISD::SETCC &&
41046       Vec.getOperand(0).getValueType().getScalarType() == MVT::i1 &&
41047       Vec.getOperand(0).getOperand(0).getValueType() == VecVT) {
41048     // ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0)
41049     SDLoc DL(ExtElt);
41050     SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
41051                                Vec.getOperand(0).getValueType().getScalarType(),
41052                                Vec.getOperand(0), Index);
41053     SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
41054                                Vec.getOperand(1), Index);
41055     SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
41056                                Vec.getOperand(2), Index);
41057     return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2);
41058   }
41059 
41060   // TODO: This switch could include FNEG and the x86-specific FP logic ops
41061   // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid
41062   // missed load folding and fma+fneg combining.
41063   switch (Vec.getOpcode()) {
41064   case ISD::FMA: // Begin 3 operands
41065   case ISD::FMAD:
41066   case ISD::FADD: // Begin 2 operands
41067   case ISD::FSUB:
41068   case ISD::FMUL:
41069   case ISD::FDIV:
41070   case ISD::FREM:
41071   case ISD::FCOPYSIGN:
41072   case ISD::FMINNUM:
41073   case ISD::FMAXNUM:
41074   case ISD::FMINNUM_IEEE:
41075   case ISD::FMAXNUM_IEEE:
41076   case ISD::FMAXIMUM:
41077   case ISD::FMINIMUM:
41078   case X86ISD::FMAX:
41079   case X86ISD::FMIN:
41080   case ISD::FABS: // Begin 1 operand
41081   case ISD::FSQRT:
41082   case ISD::FRINT:
41083   case ISD::FCEIL:
41084   case ISD::FTRUNC:
41085   case ISD::FNEARBYINT:
41086   case ISD::FROUND:
41087   case ISD::FFLOOR:
41088   case X86ISD::FRCP:
41089   case X86ISD::FRSQRT: {
41090     // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ...
41091     SDLoc DL(ExtElt);
41092     SmallVector<SDValue, 4> ExtOps;
41093     for (SDValue Op : Vec->ops())
41094       ExtOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, Index));
41095     return DAG.getNode(Vec.getOpcode(), DL, VT, ExtOps);
41096   }
41097   default:
41098     return SDValue();
41099   }
41100   llvm_unreachable("All opcodes should return within switch");
41101 }
41102 
41103 /// Try to convert a vector reduction sequence composed of binops and shuffles
41104 /// into horizontal ops.
41105 static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
41106                                      const X86Subtarget &Subtarget) {
41107   assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
41108 
41109   // We need at least SSE2 to do anything here.
41110   if (!Subtarget.hasSSE2())
41111     return SDValue();
41112 
41113   ISD::NodeType Opc;
41114   SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc,
41115                                         {ISD::ADD, ISD::MUL, ISD::FADD}, true);
41116   if (!Rdx)
41117     return SDValue();
41118 
41119   SDValue Index = ExtElt->getOperand(1);
41120   assert(isNullConstant(Index) &&
41121          "Reduction doesn't end in an extract from index 0");
41122 
41123   EVT VT = ExtElt->getValueType(0);
41124   EVT VecVT = Rdx.getValueType();
41125   if (VecVT.getScalarType() != VT)
41126     return SDValue();
41127 
41128   SDLoc DL(ExtElt);
41129 
41130   // vXi8 mul reduction - promote to vXi16 mul reduction.
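  // There is no vXi8 multiply, so unpack the bytes into i16 lanes and multiply
  // as vXi16; the low byte of each i16 product is the correct wrapped i8
  // result, and the final extract only reads that low byte.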
41131   if (Opc == ISD::MUL) {
41132     unsigned NumElts = VecVT.getVectorNumElements();
41133     if (VT != MVT::i8 || NumElts < 4 || !isPowerOf2_32(NumElts))
41134       return SDValue();
41135     if (VecVT.getSizeInBits() >= 128) {
41136       EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts / 2);
41137       SDValue Lo = getUnpackl(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
41138       SDValue Hi = getUnpackh(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
41139       Lo = DAG.getBitcast(WideVT, Lo);
41140       Hi = DAG.getBitcast(WideVT, Hi);
41141       Rdx = DAG.getNode(Opc, DL, WideVT, Lo, Hi);
41142       while (Rdx.getValueSizeInBits() > 128) {
41143         std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
41144         Rdx = DAG.getNode(Opc, DL, Lo.getValueType(), Lo, Hi);
41145       }
41146     } else {
41147       if (VecVT == MVT::v4i8)
41148         Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, Rdx,
41149                           DAG.getUNDEF(MVT::v4i8));
41150       Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Rdx,
41151                         DAG.getUNDEF(MVT::v8i8));
41152       Rdx = getUnpackl(DAG, DL, MVT::v16i8, Rdx, DAG.getUNDEF(MVT::v16i8));
41153       Rdx = DAG.getBitcast(MVT::v8i16, Rdx);
41154     }
41155     if (NumElts >= 8)
41156       Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
41157                         DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
41158                                              {4, 5, 6, 7, -1, -1, -1, -1}));
41159     Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
41160                       DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
41161                                            {2, 3, -1, -1, -1, -1, -1, -1}));
41162     Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
41163                       DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
41164                                            {1, -1, -1, -1, -1, -1, -1, -1}));
41165     Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
41166     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
41167   }
41168 
41169   // vXi8 add reduction - sub-128-bit vector.
41170   if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) {
41171     if (VecVT == MVT::v4i8) {
41172       // Pad with zero.
41173       if (Subtarget.hasSSE41()) {
41174         Rdx = DAG.getBitcast(MVT::i32, Rdx);
41175         Rdx = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
41176                           DAG.getConstant(0, DL, MVT::v4i32), Rdx,
41177                           DAG.getIntPtrConstant(0, DL));
41178         Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
41179       } else {
41180         Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, Rdx,
41181                           DAG.getConstant(0, DL, VecVT));
41182       }
41183     }
41184     if (Rdx.getValueType() == MVT::v8i8) {
41185       // Pad with undef.
41186       Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Rdx,
41187                         DAG.getUNDEF(MVT::v8i8));
41188     }
41189     Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
41190                       DAG.getConstant(0, DL, MVT::v16i8));
41191     Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
41192     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
41193   }
41194 
41195   // Must be a >=128-bit vector with pow2 elements.
41196   if ((VecVT.getSizeInBits() % 128) != 0 ||
41197       !isPowerOf2_32(VecVT.getVectorNumElements()))
41198     return SDValue();
41199 
41200   // vXi8 add reduction - sum lo/hi halves then use PSADBW.
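  // PSADBW against an all-zeros vector sums each group of 8 bytes into a
  // 64-bit lane, so once the high half has been folded into the low half the
  // i8 total can be read from the low byte of lane 0.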
41201   if (VT == MVT::i8) {
41202     while (Rdx.getValueSizeInBits() > 128) {
41203       SDValue Lo, Hi;
41204       std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
41205       VecVT = Lo.getValueType();
41206       Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
41207     }
41208     assert(VecVT == MVT::v16i8 && "v16i8 reduction expected");
41209 
41210     SDValue Hi = DAG.getVectorShuffle(
41211         MVT::v16i8, DL, Rdx, Rdx,
41212         {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
41213     Rdx = DAG.getNode(ISD::ADD, DL, MVT::v16i8, Rdx, Hi);
41214     Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
41215                       getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
41216     Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
41217     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
41218   }
41219 
41220   // Only use (F)HADD opcodes if they aren't microcoded or if it minimizes codesize.
41221   if (!shouldUseHorizontalOp(true, DAG, Subtarget))
41222     return SDValue();
41223 
41224   unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
41225 
41226   // 256-bit horizontal instructions operate on 128-bit chunks rather than
41227   // across the whole vector, so we need an extract + hop preliminary stage.
41228   // This is the only step where the operands of the hop are not the same value.
41229   // TODO: We could extend this to handle 512-bit or even longer vectors.
41230   if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||
41231       ((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {
41232     unsigned NumElts = VecVT.getVectorNumElements();
41233     SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
41234     SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
41235     Rdx = DAG.getNode(HorizOpcode, DL, Lo.getValueType(), Hi, Lo);
41236     VecVT = Rdx.getValueType();
41237   }
41238   if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
41239       !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
41240     return SDValue();
41241 
41242   // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
41243   unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
41244   for (unsigned i = 0; i != ReductionSteps; ++i)
41245     Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);
41246 
41247   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
41248 }
41249 
41250 /// Detect vector gather/scatter index generation and convert it from being a
41251 /// bunch of shuffles and extracts into a somewhat faster sequence.
41252 /// For i686, the best sequence is apparently storing the value and loading
41253 /// scalars back, while for x64 we should use 64-bit extracts and shifts.
41254 static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
41255                                        TargetLowering::DAGCombinerInfo &DCI,
41256                                        const X86Subtarget &Subtarget) {
41257   if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
41258     return NewOp;
41259 
41260   SDValue InputVector = N->getOperand(0);
41261   SDValue EltIdx = N->getOperand(1);
41262   auto *CIdx = dyn_cast<ConstantSDNode>(EltIdx);
41263 
41264   EVT SrcVT = InputVector.getValueType();
41265   EVT VT = N->getValueType(0);
41266   SDLoc dl(InputVector);
41267   bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;
41268   unsigned NumSrcElts = SrcVT.getVectorNumElements();
41269 
41270   if (CIdx && CIdx->getAPIntValue().uge(NumSrcElts))
41271     return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
41272 
41273   // Integer Constant Folding.
41274   if (CIdx && VT.isInteger()) {
41275     APInt UndefVecElts;
41276     SmallVector<APInt, 16> EltBits;
41277     unsigned VecEltBitWidth = SrcVT.getScalarSizeInBits();
41278     if (getTargetConstantBitsFromNode(InputVector, VecEltBitWidth, UndefVecElts,
41279                                       EltBits, true, false)) {
41280       uint64_t Idx = CIdx->getZExtValue();
41281       if (UndefVecElts[Idx])
41282         return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
41283       return DAG.getConstant(EltBits[Idx].zextOrSelf(VT.getScalarSizeInBits()),
41284                              dl, VT);
41285     }
41286   }
41287 
41288   if (IsPextr) {
41289     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41290     if (TLI.SimplifyDemandedBits(
41291             SDValue(N, 0), APInt::getAllOnesValue(VT.getSizeInBits()), DCI))
41292       return SDValue(N, 0);
41293 
41294     // PEXTR*(PINSR*(v, s, c), c) -> s (with implicit zext handling).
41295     if ((InputVector.getOpcode() == X86ISD::PINSRB ||
41296          InputVector.getOpcode() == X86ISD::PINSRW) &&
41297         InputVector.getOperand(2) == EltIdx) {
41298       assert(SrcVT == InputVector.getOperand(0).getValueType() &&
41299              "Vector type mismatch");
41300       SDValue Scl = InputVector.getOperand(1);
41301       Scl = DAG.getNode(ISD::TRUNCATE, dl, SrcVT.getScalarType(), Scl);
41302       return DAG.getZExtOrTrunc(Scl, dl, VT);
41303     }
41304 
41305     // TODO - Remove this once we can handle the implicit zero-extension of
41306     // X86ISD::PEXTRW/X86ISD::PEXTRB in combinePredicateReduction and
41307     // combineBasicSADPattern.
41308     return SDValue();
41309   }
41310 
41311   // Detect mmx extraction of all bits as a i64. It works better as a bitcast.
41312   if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
41313       VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
41314     SDValue MMXSrc = InputVector.getOperand(0);
41315 
41316     // The bitcast source is a direct mmx result.
41317     if (MMXSrc.getValueType() == MVT::x86mmx)
41318       return DAG.getBitcast(VT, InputVector);
41319   }
41320 
41321   // Detect mmx to i32 conversion through a v2i32 elt extract.
41322   if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
41323       VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) {
41324     SDValue MMXSrc = InputVector.getOperand(0);
41325 
41326     // The bitcast source is a direct mmx result.
41327     if (MMXSrc.getValueType() == MVT::x86mmx)
41328       return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
41329   }
41330 
41331   // Check whether this extract is the root of a sum of absolute differences
41332   // pattern. This has to be done here because we really want it to happen
41333   // pre-legalization.
41334   if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
41335     return SAD;
41336 
41337   // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
41338   if (SDValue Cmp = combinePredicateReduction(N, DAG, Subtarget))
41339     return Cmp;
41340 
41341   // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
41342   if (SDValue MinMax = combineMinMaxReduction(N, DAG, Subtarget))
41343     return MinMax;
41344 
41345   // Attempt to optimize ADD/FADD/MUL reductions with HADD, promotion etc..
41346   if (SDValue V = combineArithReduction(N, DAG, Subtarget))
41347     return V;
41348 
41349   if (SDValue V = scalarizeExtEltFP(N, DAG))
41350     return V;
41351 
41352   // Attempt to extract an i1 element by using MOVMSK to extract the sign bits
41353   // and then testing the relevant element.
41354   //
41355   // Note that we only combine extracts on the *same* result number, i.e.
41356   //   t0 = merge_values a0, a1, a2, a3
41357   //   i1 = extract_vector_elt t0, Constant:i64<2>
41358   //   i1 = extract_vector_elt t0, Constant:i64<3>
41359   // but not
41360   //   i1 = extract_vector_elt t0:1, Constant:i64<2>
41361   // since the latter would need its own MOVMSK.
41362   if (CIdx && SrcVT.getScalarType() == MVT::i1) {
41363     SmallVector<SDNode *, 16> BoolExtracts;
41364     unsigned ResNo = InputVector.getResNo();
41365     auto IsBoolExtract = [&BoolExtracts, &ResNo](SDNode *Use) {
41366       if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
41367           isa<ConstantSDNode>(Use->getOperand(1)) &&
41368           Use->getOperand(0).getResNo() == ResNo &&
41369           Use->getValueType(0) == MVT::i1) {
41370         BoolExtracts.push_back(Use);
41371         return true;
41372       }
41373       return false;
41374     };
41375     if (all_of(InputVector->uses(), IsBoolExtract) &&
41376         BoolExtracts.size() > 1) {
41377       EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
41378       if (SDValue BC =
41379               combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
41380         for (SDNode *Use : BoolExtracts) {
41381           // extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
41382           unsigned MaskIdx = Use->getConstantOperandVal(1);
41383           APInt MaskBit = APInt::getOneBitSet(NumSrcElts, MaskIdx);
41384           SDValue Mask = DAG.getConstant(MaskBit, dl, BCVT);
41385           SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask);
41386           Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ);
41387           DCI.CombineTo(Use, Res);
41388         }
41389         return SDValue(N, 0);
41390       }
41391     }
41392   }
41393 
41394   return SDValue();
41395 }
41396 
41397 /// If a vector select has an operand that is -1 or 0, try to simplify the
41398 /// select to a bitwise logic operation.
41399 /// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
41400 static SDValue
41401 combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
41402                                  TargetLowering::DAGCombinerInfo &DCI,
41403                                  const X86Subtarget &Subtarget) {
41404   SDValue Cond = N->getOperand(0);
41405   SDValue LHS = N->getOperand(1);
41406   SDValue RHS = N->getOperand(2);
41407   EVT VT = LHS.getValueType();
41408   EVT CondVT = Cond.getValueType();
41409   SDLoc DL(N);
41410   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41411 
41412   if (N->getOpcode() != ISD::VSELECT)
41413     return SDValue();
41414 
41415   assert(CondVT.isVector() && "Vector select expects a vector selector!");
41416 
41417   // TODO: Use isNullOrNullSplat() to distinguish constants with undefs?
41418   // TODO: Can we assert that both operands are not zeros (because that should
41419   //       get simplified at node creation time)?
41420   bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
41421   bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
41422 
41423   // If both inputs are 0/undef, create a complete zero vector.
41424   // FIXME: As noted above this should be handled by DAGCombiner/getNode.
41425   if (TValIsAllZeros && FValIsAllZeros) {
41426     if (VT.isFloatingPoint())
41427       return DAG.getConstantFP(0.0, DL, VT);
41428     return DAG.getConstant(0, DL, VT);
41429   }
41430 
41431   // To use the condition operand as a bitwise mask, it must have elements that
41432   // are the same size as the select elements. I.e., the condition operand must
41433   // have already been promoted from the IR select condition type <N x i1>.
41434   // Don't check if the types themselves are equal because that excludes
41435   // vector floating-point selects.
41436   if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
41437     return SDValue();
41438 
41439   // Try to invert the condition if true value is not all 1s and false value is
41440   // not all 0s. Only do this if the condition has one use.
41441   bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
41442   if (!TValIsAllOnes && !FValIsAllZeros && Cond.hasOneUse() &&
41443       // Check if the selector will be produced by CMPP*/PCMP*.
41444       Cond.getOpcode() == ISD::SETCC &&
41445       // Check if SETCC has already been promoted.
41446       TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
41447           CondVT) {
41448     bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
41449 
41450     if (TValIsAllZeros || FValIsAllOnes) {
41451       SDValue CC = Cond.getOperand(2);
41452       ISD::CondCode NewCC = ISD::getSetCCInverse(
41453           cast<CondCodeSDNode>(CC)->get(), Cond.getOperand(0).getValueType());
41454       Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
41455                           NewCC);
41456       std::swap(LHS, RHS);
41457       TValIsAllOnes = FValIsAllOnes;
41458       FValIsAllZeros = TValIsAllZeros;
41459     }
41460   }
41461 
41462   // Cond value must be 'sign splat' to be converted to a logical op.
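  // (I.e. every condition element must be known to be all-ones or all-zeros,
  // which is what the ComputeNumSignBits check below establishes.)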
41463   if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits())
41464     return SDValue();
41465 
41466   // vselect Cond, 111..., 000... -> Cond
41467   if (TValIsAllOnes && FValIsAllZeros)
41468     return DAG.getBitcast(VT, Cond);
41469 
41470   if (!TLI.isTypeLegal(CondVT))
41471     return SDValue();
41472 
41473   // vselect Cond, 111..., X -> or Cond, X
41474   if (TValIsAllOnes) {
41475     SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
41476     SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
41477     return DAG.getBitcast(VT, Or);
41478   }
41479 
41480   // vselect Cond, X, 000... -> and Cond, X
41481   if (FValIsAllZeros) {
41482     SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
41483     SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
41484     return DAG.getBitcast(VT, And);
41485   }
41486 
41487   // vselect Cond, 000..., X -> andn Cond, X
41488   if (TValIsAllZeros) {
41489     SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
41490     SDValue AndN;
41491     // The canonical form differs for i1 vectors - X86ISD::ANDNP is not used.
41492     if (CondVT.getScalarType() == MVT::i1)
41493       AndN = DAG.getNode(ISD::AND, DL, CondVT, DAG.getNOT(DL, Cond, CondVT),
41494                          CastRHS);
41495     else
41496       AndN = DAG.getNode(X86ISD::ANDNP, DL, CondVT, Cond, CastRHS);
41497     return DAG.getBitcast(VT, AndN);
41498   }
41499 
41500   return SDValue();
41501 }
41502 
41503 /// If both arms of a vector select are concatenated vectors, split the select,
41504 /// and concatenate the result to eliminate a wide (256-bit) vector instruction:
41505 ///   vselect Cond, (concat T0, T1), (concat F0, F1) -->
41506 ///   concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1)
41507 static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG,
41508                                   const X86Subtarget &Subtarget) {
41509   unsigned Opcode = N->getOpcode();
41510   if (Opcode != X86ISD::BLENDV && Opcode != ISD::VSELECT)
41511     return SDValue();
41512 
41513   // TODO: Split 512-bit vectors too?
41514   EVT VT = N->getValueType(0);
41515   if (!VT.is256BitVector())
41516     return SDValue();
41517 
41518   // TODO: Split as long as any 2 of the 3 operands are concatenated?
41519   SDValue Cond = N->getOperand(0);
41520   SDValue TVal = N->getOperand(1);
41521   SDValue FVal = N->getOperand(2);
41522   SmallVector<SDValue, 4> CatOpsT, CatOpsF;
41523   if (!TVal.hasOneUse() || !FVal.hasOneUse() ||
41524       !collectConcatOps(TVal.getNode(), CatOpsT) ||
41525       !collectConcatOps(FVal.getNode(), CatOpsF))
41526     return SDValue();
41527 
41528   auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
41529                             ArrayRef<SDValue> Ops) {
41530     return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops);
41531   };
41532   return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { Cond, TVal, FVal },
41533                           makeBlend, /*CheckBWI*/ false);
41534 }
41535 
41536 static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
41537   SDValue Cond = N->getOperand(0);
41538   SDValue LHS = N->getOperand(1);
41539   SDValue RHS = N->getOperand(2);
41540   SDLoc DL(N);
41541 
41542   auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
41543   auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
41544   if (!TrueC || !FalseC)
41545     return SDValue();
41546 
41547   // Don't do this for crazy integer types.
41548   EVT VT = N->getValueType(0);
41549   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
41550     return SDValue();
41551 
41552   // We're going to use the condition bit in math or logic ops. We could allow
41553   // this with a wider condition value (post-legalization it becomes an i8),
41554   // but if nothing is creating selects that late, it doesn't matter.
41555   if (Cond.getValueType() != MVT::i1)
41556     return SDValue();
41557 
41558   // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
41559   // 3, 5, or 9 with i32/i64, so those get transformed too.
41560   // TODO: For constants that overflow or do not differ by power-of-2 or small
41561   // multiplier, convert to 'and' + 'add'.
41562   const APInt &TrueVal = TrueC->getAPIntValue();
41563   const APInt &FalseVal = FalseC->getAPIntValue();
41564   bool OV;
41565   APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
41566   if (OV)
41567     return SDValue();
41568 
41569   APInt AbsDiff = Diff.abs();
41570   if (AbsDiff.isPowerOf2() ||
41571       ((VT == MVT::i32 || VT == MVT::i64) &&
41572        (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {
41573 
41574     // We need a positive multiplier constant for shift/LEA codegen. The 'not'
41575     // of the condition can usually be folded into a compare predicate, but even
41576     // without that, the sequence should be cheaper than a CMOV alternative.
41577     if (TrueVal.slt(FalseVal)) {
41578       Cond = DAG.getNOT(DL, Cond, MVT::i1);
41579       std::swap(TrueC, FalseC);
41580     }
41581 
41582     // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
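    // E.g. select i1 %c, i32 7, i32 3 --> add (mul (zext %c), 4), 3, where the
    // multiply by 4 is later lowered as a shift.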
41583     SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
41584 
41585     // Multiply condition by the difference if non-one.
41586     if (!AbsDiff.isOneValue())
41587       R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
41588 
41589     // Add the base if non-zero.
41590     if (!FalseC->isNullValue())
41591       R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
41592 
41593     return R;
41594   }
41595 
41596   return SDValue();
41597 }
41598 
41599 /// If this is a *dynamic* select (non-constant condition) and we can match
41600 /// this node with one of the variable blend instructions, restructure the
41601 /// condition so that blends can use the high (sign) bit of each element.
41602 /// This function will also call SimplifyDemandedBits on already created
41603 /// BLENDV to perform additional simplifications.
41604 static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
41605                                            TargetLowering::DAGCombinerInfo &DCI,
41606                                            const X86Subtarget &Subtarget) {
41607   SDValue Cond = N->getOperand(0);
41608   if ((N->getOpcode() != ISD::VSELECT &&
41609        N->getOpcode() != X86ISD::BLENDV) ||
41610       ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
41611     return SDValue();
41612 
41613   // Don't optimize before the condition has been transformed to a legal type
41614   // and don't ever optimize vector selects that map to AVX512 mask-registers.
41615   unsigned BitWidth = Cond.getScalarValueSizeInBits();
41616   if (BitWidth < 8 || BitWidth > 64)
41617     return SDValue();
41618 
41619   // We can only handle the cases where VSELECT is directly legal on the
41620   // subtarget. We custom lower VSELECT nodes with constant conditions and
41621   // this makes it hard to see whether a dynamic VSELECT will correctly
41622   // lower, so we both check the operation's status and explicitly handle the
41623   // cases where a *dynamic* blend will fail even though a constant-condition
41624   // blend could be custom lowered.
41625   // FIXME: We should find a better way to handle this class of problems.
41626   // Potentially, we should combine constant-condition vselect nodes
41627   // pre-legalization into shuffles and not mark as many types as custom
41628   // lowered.
41629   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41630   EVT VT = N->getValueType(0);
41631   if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
41632     return SDValue();
41633   // FIXME: We don't support i16-element blends currently. We could and
41634   // should support them by making *all* the bits in the condition be set
41635   // rather than just the high bit and using an i8-element blend.
41636   if (VT.getVectorElementType() == MVT::i16)
41637     return SDValue();
41638   // Dynamic blending was only available from SSE4.1 onward.
41639   if (VT.is128BitVector() && !Subtarget.hasSSE41())
41640     return SDValue();
41641   // Byte blends are only available in AVX2.
41642   if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
41643     return SDValue();
41644   // There are no 512-bit blend instructions that use sign bits.
41645   if (VT.is512BitVector())
41646     return SDValue();
41647 
41648   auto OnlyUsedAsSelectCond = [](SDValue Cond) {
41649     for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
41650          UI != UE; ++UI)
41651       if ((UI->getOpcode() != ISD::VSELECT &&
41652            UI->getOpcode() != X86ISD::BLENDV) ||
41653           UI.getOperandNo() != 0)
41654         return false;
41655 
41656     return true;
41657   };
41658 
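  // The variable blend instructions only test the sign bit of each condition
  // element, so the sign bit is the only bit we demand from Cond.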
41659   APInt DemandedBits(APInt::getSignMask(BitWidth));
41660 
41661   if (OnlyUsedAsSelectCond(Cond)) {
41662     KnownBits Known;
41663     TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
41664                                           !DCI.isBeforeLegalizeOps());
41665     if (!TLI.SimplifyDemandedBits(Cond, DemandedBits, Known, TLO, 0, true))
41666       return SDValue();
41667 
41668     // If we changed the computation somewhere in the DAG, this change will
41669     // affect all users of Cond. Update all the nodes so that we do not use
41670     // the generic VSELECT anymore. Otherwise, we may perform wrong
41671     // optimizations as we messed with the actual expectation for the vector
41672     // boolean values.
41673     for (SDNode *U : Cond->uses()) {
41674       if (U->getOpcode() == X86ISD::BLENDV)
41675         continue;
41676 
41677       SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),
41678                                Cond, U->getOperand(1), U->getOperand(2));
41679       DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
41680       DCI.AddToWorklist(U);
41681     }
41682     DCI.CommitTargetLoweringOpt(TLO);
41683     return SDValue(N, 0);
41684   }
41685 
41686   // Otherwise we can still at least try to simplify multiple use bits.
41687   if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedBits, DAG))
41688       return DAG.getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0), V,
41689                          N->getOperand(1), N->getOperand(2));
41690 
41691   return SDValue();
41692 }
41693 
41694 // Try to match:
41695 //   (or (and M, (sub 0, X)), (pandn M, X))
41696 // which is a special case of:
41697 //   (select M, (sub 0, X), X)
41698 // Per:
41699 // http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
41700 // We know that, if fNegate is 0 or 1:
41701 //   (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
41702 //
41703 // Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
41704 //   ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
41705 //   ( M      ? -X : X) == ((X ^   M     ) + (M & 1))
41706 // This lets us transform our vselect to:
41707 //   (add (xor X, M), (and M, 1))
41708 // And further to:
41709 //   (sub (xor X, M), M)
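// Sanity check of the identity: with M == all-ones (negate), (xor X, M) - M
// == ~X + 1 == -X; with M == 0, (xor X, 0) - 0 == X.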
41710 static SDValue combineLogicBlendIntoConditionalNegate(
41711     EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
41712     SelectionDAG &DAG, const X86Subtarget &Subtarget) {
41713   EVT MaskVT = Mask.getValueType();
41714   assert(MaskVT.isInteger() &&
41715          DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&
41716          "Mask must be zero/all-bits");
41717 
41718   if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT)
41719     return SDValue();
41720   if (!DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT))
41721     return SDValue();
41722 
41723   auto IsNegV = [](SDNode *N, SDValue V) {
41724     return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
41725            ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
41726   };
41727 
41728   SDValue V;
41729   if (IsNegV(Y.getNode(), X))
41730     V = X;
41731   else if (IsNegV(X.getNode(), Y))
41732     V = Y;
41733   else
41734     return SDValue();
41735 
41736   SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
41737   SDValue SubOp2 = Mask;
41738 
41739   // If the negate was on the false side of the select, then
41740   // the operands of the SUB need to be swapped. PR 27251.
41741   // This is because the pattern being matched above is
41742   // (vselect M, (sub 0, X), X) -> (sub (xor X, M), M)
41743   // but if the pattern matched was
41744   // (vselect M, X, (sub 0, X)), that is really the negation of the pattern
41745   // above, -(vselect M, (sub 0, X), X), and therefore the replacement
41746   // pattern also needs to be a negation of the replacement pattern above.
41747   // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
41748   // sub accomplishes the negation of the replacement pattern.
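  // As a check against both mask values for the swapped form:
  //   M == all-ones: M - (X ^ M) == -1 - ~X == X
  //   M == 0:        M - (X ^ M) ==  0 -  X == -X
  // which matches (vselect M, X, (sub 0, X)).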
41749   if (V == Y)
41750     std::swap(SubOp1, SubOp2);
41751 
41752   SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
41753   return DAG.getBitcast(VT, Res);
41754 }
41755 
41756 /// Do target-specific dag combines on SELECT and VSELECT nodes.
41757 static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
41758                              TargetLowering::DAGCombinerInfo &DCI,
41759                              const X86Subtarget &Subtarget) {
41760   SDLoc DL(N);
41761   SDValue Cond = N->getOperand(0);
41762   SDValue LHS = N->getOperand(1);
41763   SDValue RHS = N->getOperand(2);
41764 
41765   // Try simplification again because we use this function to optimize
41766   // BLENDV nodes that are not handled by the generic combiner.
41767   if (SDValue V = DAG.simplifySelect(Cond, LHS, RHS))
41768     return V;
41769 
41770   EVT VT = LHS.getValueType();
41771   EVT CondVT = Cond.getValueType();
41772   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41773   bool CondConstantVector = ISD::isBuildVectorOfConstantSDNodes(Cond.getNode());
41774 
41775   // Attempt to combine (select M, (sub 0, X), X) -> (sub (xor X, M), M).
41776   // Limit this to cases of non-constant masks that createShuffleMaskFromVSELECT
41777   // can't catch, plus vXi8 cases where we'd likely end up with BLENDV.
41778   if (CondVT.isVector() && CondVT.isInteger() &&
41779       CondVT.getScalarSizeInBits() == VT.getScalarSizeInBits() &&
41780       (!CondConstantVector || CondVT.getScalarType() == MVT::i8) &&
41781       DAG.ComputeNumSignBits(Cond) == CondVT.getScalarSizeInBits())
41782     if (SDValue V = combineLogicBlendIntoConditionalNegate(VT, Cond, RHS, LHS,
41783                                                            DL, DAG, Subtarget))
41784       return V;
41785 
41786   // Convert vselects with constant condition into shuffles.
41787   if (CondConstantVector && DCI.isBeforeLegalizeOps()) {
41788     SmallVector<int, 64> Mask;
41789     if (createShuffleMaskFromVSELECT(Mask, Cond))
41790       return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask);
41791   }
41792 
41793   // fold vselect(cond, pshufb(x), pshufb(y)) -> or (pshufb(x), pshufb(y))
41794   // by forcing the unselected elements to zero.
41795   // TODO: Can we handle more shuffles with this?
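  // Each output byte comes from exactly one of the two PSHUFBs, so setting
  // the other shuffle's index for that byte to 0x80 forces it to zero and the
  // OR of the two adjusted PSHUFBs reproduces the blend.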
41796   if (N->getOpcode() == ISD::VSELECT && CondVT.isVector() &&
41797       LHS.getOpcode() == X86ISD::PSHUFB && RHS.getOpcode() == X86ISD::PSHUFB &&
41798       LHS.hasOneUse() && RHS.hasOneUse()) {
41799     MVT SimpleVT = VT.getSimpleVT();
41800     SmallVector<SDValue, 1> LHSOps, RHSOps;
41801     SmallVector<int, 64> LHSMask, RHSMask, CondMask;
41802     if (createShuffleMaskFromVSELECT(CondMask, Cond) &&
41803         getTargetShuffleMask(LHS.getNode(), SimpleVT, true, LHSOps, LHSMask) &&
41804         getTargetShuffleMask(RHS.getNode(), SimpleVT, true, RHSOps, RHSMask)) {
41805       int NumElts = VT.getVectorNumElements();
41806       for (int i = 0; i != NumElts; ++i) {
41807         if (CondMask[i] < NumElts)
41808           RHSMask[i] = 0x80;
41809         else
41810           LHSMask[i] = 0x80;
41811       }
41812       LHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, LHS.getOperand(0),
41813                         getConstVector(LHSMask, SimpleVT, DAG, DL, true));
41814       RHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, RHS.getOperand(0),
41815                         getConstVector(RHSMask, SimpleVT, DAG, DL, true));
41816       return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
41817     }
41818   }
41819 
41820   // If we have SSE[12] support, try to form min/max nodes. SSE min/max
41821   // instructions match the semantics of the common C idiom x<y?x:y but not
41822   // x<=y?x:y, because of how they handle negative zero (which can be
41823   // ignored in unsafe-math mode).
41824   // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
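  // Note: the SSE MIN/MAX instructions return the second source operand when
  // the inputs are unordered (either is NaN) and also when comparing +0.0
  // with -0.0, which is why the NaN and signed-zero checks below are needed
  // before each conversion.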
41825   if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
41826       VT != MVT::f80 && VT != MVT::f128 &&
41827       (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
41828       (Subtarget.hasSSE2() ||
41829        (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
41830     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
41831 
41832     unsigned Opcode = 0;
41833     // Check for x CC y ? x : y.
41834     if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
41835         DAG.isEqualTo(RHS, Cond.getOperand(1))) {
41836       switch (CC) {
41837       default: break;
41838       case ISD::SETULT:
41839         // Converting this to a min would handle NaNs incorrectly, and swapping
41840         // the operands would cause it to handle comparisons between positive
41841         // and negative zero incorrectly.
41842         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
41843           if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
41844               !(DAG.isKnownNeverZeroFloat(LHS) ||
41845                 DAG.isKnownNeverZeroFloat(RHS)))
41846             break;
41847           std::swap(LHS, RHS);
41848         }
41849         Opcode = X86ISD::FMIN;
41850         break;
41851       case ISD::SETOLE:
41852         // Converting this to a min would handle comparisons between positive
41853         // and negative zero incorrectly.
41854         if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
41855             !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
41856           break;
41857         Opcode = X86ISD::FMIN;
41858         break;
41859       case ISD::SETULE:
41860         // Converting this to a min would handle both negative zeros and NaNs
41861         // incorrectly, but we can swap the operands to fix both.
41862         std::swap(LHS, RHS);
41863         LLVM_FALLTHROUGH;
41864       case ISD::SETOLT:
41865       case ISD::SETLT:
41866       case ISD::SETLE:
41867         Opcode = X86ISD::FMIN;
41868         break;
41869 
41870       case ISD::SETOGE:
41871         // Converting this to a max would handle comparisons between positive
41872         // and negative zero incorrectly.
41873         if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
41874             !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
41875           break;
41876         Opcode = X86ISD::FMAX;
41877         break;
41878       case ISD::SETUGT:
41879         // Converting this to a max would handle NaNs incorrectly, and swapping
41880         // the operands would cause it to handle comparisons between positive
41881         // and negative zero incorrectly.
41882         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
41883           if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
41884               !(DAG.isKnownNeverZeroFloat(LHS) ||
41885                 DAG.isKnownNeverZeroFloat(RHS)))
41886             break;
41887           std::swap(LHS, RHS);
41888         }
41889         Opcode = X86ISD::FMAX;
41890         break;
41891       case ISD::SETUGE:
41892         // Converting this to a max would handle both negative zeros and NaNs
41893         // incorrectly, but we can swap the operands to fix both.
41894         std::swap(LHS, RHS);
41895         LLVM_FALLTHROUGH;
41896       case ISD::SETOGT:
41897       case ISD::SETGT:
41898       case ISD::SETGE:
41899         Opcode = X86ISD::FMAX;
41900         break;
41901       }
41902     // Check for x CC y ? y : x -- a min/max with reversed arms.
41903     } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
41904                DAG.isEqualTo(RHS, Cond.getOperand(0))) {
41905       switch (CC) {
41906       default: break;
41907       case ISD::SETOGE:
41908         // Converting this to a min would handle comparisons between positive
41909         // and negative zero incorrectly, and swapping the operands would
41910         // cause it to handle NaNs incorrectly.
41911         if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
41912             !(DAG.isKnownNeverZeroFloat(LHS) ||
41913               DAG.isKnownNeverZeroFloat(RHS))) {
41914           if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
41915             break;
41916           std::swap(LHS, RHS);
41917         }
41918         Opcode = X86ISD::FMIN;
41919         break;
41920       case ISD::SETUGT:
41921         // Converting this to a min would handle NaNs incorrectly.
41922         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
41923           break;
41924         Opcode = X86ISD::FMIN;
41925         break;
41926       case ISD::SETUGE:
41927         // Converting this to a min would handle both negative zeros and NaNs
41928         // incorrectly, but we can swap the operands to fix both.
41929         std::swap(LHS, RHS);
41930         LLVM_FALLTHROUGH;
41931       case ISD::SETOGT:
41932       case ISD::SETGT:
41933       case ISD::SETGE:
41934         Opcode = X86ISD::FMIN;
41935         break;
41936 
41937       case ISD::SETULT:
41938         // Converting this to a max would handle NaNs incorrectly.
41939         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
41940           break;
41941         Opcode = X86ISD::FMAX;
41942         break;
41943       case ISD::SETOLE:
41944         // Converting this to a max would handle comparisons between positive
41945         // and negative zero incorrectly, and swapping the operands would
41946         // cause it to handle NaNs incorrectly.
41947         if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
41948             !DAG.isKnownNeverZeroFloat(LHS) &&
41949             !DAG.isKnownNeverZeroFloat(RHS)) {
41950           if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
41951             break;
41952           std::swap(LHS, RHS);
41953         }
41954         Opcode = X86ISD::FMAX;
41955         break;
41956       case ISD::SETULE:
41957         // Converting this to a max would handle both negative zeros and NaNs
41958         // incorrectly, but we can swap the operands to fix both.
41959         std::swap(LHS, RHS);
41960         LLVM_FALLTHROUGH;
41961       case ISD::SETOLT:
41962       case ISD::SETLT:
41963       case ISD::SETLE:
41964         Opcode = X86ISD::FMAX;
41965         break;
41966       }
41967     }
41968 
41969     if (Opcode)
41970       return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
41971   }
41972 
41973   // Some mask scalar intrinsics rely on checking if only one bit is set
41974   // and implement it in C code like this:
41975   // A[0] = (U & 1) ? A[0] : W[0];
41976   // This creates some redundant instructions that break pattern matching.
41977   // fold (select (setcc (and X, 1), 0, seteq), Y, Z) -> (select (and X, 1), Z, Y)
41978   if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
41979       Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
41980     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
41981     SDValue AndNode = Cond.getOperand(0);
41982     if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
41983         isNullConstant(Cond.getOperand(1)) &&
41984         isOneConstant(AndNode.getOperand(1))) {
41985       // LHS and RHS swapped due to
41986       // setcc outputting 1 when AND resulted in 0 and vice versa.
41987       AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8);
41988       return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
41989     }
41990   }
41991 
41992   // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
41993   // lowering on KNL. In this case we convert it to
41994   // v16i8 (select v16i8, v16i8, v16i8) and use an AVX instruction.
41995   // The same situation applies to all vectors of i8 and i16 elements without BWI.
41996   // Make sure we extend these even before type legalization gets a chance to
41997   // split wide vectors.
41998   // Since SKX, these selects have a proper lowering.
41999   if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
42000       CondVT.getVectorElementType() == MVT::i1 &&
42001       (VT.getVectorElementType() == MVT::i8 ||
42002        VT.getVectorElementType() == MVT::i16)) {
42003     Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
42004     return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
42005   }
42006 
42007   // AVX512 - Extend select with zero to merge with target shuffle.
42008   // select(mask, extract_subvector(shuffle(x)), zero) -->
42009   // extract_subvector(select(insert_subvector(mask), shuffle(x), zero))
42010   // TODO - support non target shuffles as well.
42011   if (Subtarget.hasAVX512() && CondVT.isVector() &&
42012       CondVT.getVectorElementType() == MVT::i1) {
42013     auto SelectableOp = [&TLI](SDValue Op) {
42014       return Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
42015              isTargetShuffle(Op.getOperand(0).getOpcode()) &&
42016              isNullConstant(Op.getOperand(1)) &&
42017              TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
42018              Op.hasOneUse() && Op.getOperand(0).hasOneUse();
42019     };
42020 
42021     bool SelectableLHS = SelectableOp(LHS);
42022     bool SelectableRHS = SelectableOp(RHS);
42023     bool ZeroLHS = ISD::isBuildVectorAllZeros(LHS.getNode());
42024     bool ZeroRHS = ISD::isBuildVectorAllZeros(RHS.getNode());
42025 
42026     if ((SelectableLHS && ZeroRHS) || (SelectableRHS && ZeroLHS)) {
42027       EVT SrcVT = SelectableLHS ? LHS.getOperand(0).getValueType()
42028                                 : RHS.getOperand(0).getValueType();
42029       EVT SrcCondVT = SrcVT.changeVectorElementType(MVT::i1);
42030       LHS = insertSubVector(DAG.getUNDEF(SrcVT), LHS, 0, DAG, DL,
42031                             VT.getSizeInBits());
42032       RHS = insertSubVector(DAG.getUNDEF(SrcVT), RHS, 0, DAG, DL,
42033                             VT.getSizeInBits());
42034       Cond = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcCondVT,
42035                          DAG.getUNDEF(SrcCondVT), Cond,
42036                          DAG.getIntPtrConstant(0, DL));
42037       SDValue Res = DAG.getSelect(DL, SrcVT, Cond, LHS, RHS);
42038       return extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
42039     }
42040   }
42041 
42042   if (SDValue V = combineSelectOfTwoConstants(N, DAG))
42043     return V;
42044 
42045   if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
42046       Cond.hasOneUse()) {
42047     EVT CondVT = Cond.getValueType();
42048     SDValue Cond0 = Cond.getOperand(0);
42049     SDValue Cond1 = Cond.getOperand(1);
42050     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
42051 
42052     // Canonicalize min/max:
42053     // (x > 0) ? x : 0 -> (x >= 0) ? x : 0
42054     // (x < -1) ? x : -1 -> (x <= -1) ? x : -1
42055     // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
42056     // the need for an extra compare against zero. e.g.
42057     // ((a - b) > 0) ? (a - b) : 0 -> ((a - b) >= 0) ? (a - b) : 0
42058     // subl   %esi, %edi
42059     // testl  %edi, %edi
42060     // movl   $0, %eax
42061     // cmovgl %edi, %eax
42062     // =>
42063     // xorl   %eax, %eax
42064     // subl   %esi, %edi
42065     // cmovsl %eax, %edi
42066     //
42067     // We can also canonicalize
42068     //  (x s> 1) ? x : 1 -> (x s>= 1) ? x : 1 -> (x s> 0) ? x : 1
42069     //  (x u> 1) ? x : 1 -> (x u>= 1) ? x : 1 -> (x != 0) ? x : 1
42070     // This allows the use of a test instruction for the compare.
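    // For illustration: once relaxed to (x u>= 1) the condition is the same
    // as (x != 0), so a TEST of the register against itself can replace a
    // CMP against the immediate 1.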
42071     if (LHS == Cond0 && RHS == Cond1) {
42072       if ((CC == ISD::SETGT && (isNullConstant(RHS) || isOneConstant(RHS))) ||
42073           (CC == ISD::SETLT && isAllOnesConstant(RHS))) {
42074         ISD::CondCode NewCC = CC == ISD::SETGT ? ISD::SETGE : ISD::SETLE;
42075         Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
42076         return DAG.getSelect(DL, VT, Cond, LHS, RHS);
42077       }
42078       if (CC == ISD::SETUGT && isOneConstant(RHS)) {
42079         ISD::CondCode NewCC = ISD::SETUGE;
42080         Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
42081         return DAG.getSelect(DL, VT, Cond, LHS, RHS);
42082       }
42083     }
42084 
42085     // Similar to DAGCombine's select(or(CC0,CC1),X,Y) fold but for legal types.
42086     // fold eq + gt/lt nested selects into ge/le selects
42087     // select (cmpeq Cond0, Cond1), LHS, (select (cmpugt Cond0, Cond1), LHS, Y)
42088     // --> (select (cmpuge Cond0, Cond1), LHS, Y)
42089     // select (cmpslt Cond0, Cond1), LHS, (select (cmpeq Cond0, Cond1), LHS, Y)
42090     // --> (select (cmpsle Cond0, Cond1), LHS, Y)
42091     // .. etc ..
42092     if (RHS.getOpcode() == ISD::SELECT && RHS.getOperand(1) == LHS &&
42093         RHS.getOperand(0).getOpcode() == ISD::SETCC) {
42094       SDValue InnerSetCC = RHS.getOperand(0);
42095       ISD::CondCode InnerCC =
42096           cast<CondCodeSDNode>(InnerSetCC.getOperand(2))->get();
42097       if ((CC == ISD::SETEQ || InnerCC == ISD::SETEQ) &&
42098           Cond0 == InnerSetCC.getOperand(0) &&
42099           Cond1 == InnerSetCC.getOperand(1)) {
42100         ISD::CondCode NewCC;
42101         switch (CC == ISD::SETEQ ? InnerCC : CC) {
42102         case ISD::SETGT:  NewCC = ISD::SETGE; break;
42103         case ISD::SETLT:  NewCC = ISD::SETLE; break;
42104         case ISD::SETUGT: NewCC = ISD::SETUGE; break;
42105         case ISD::SETULT: NewCC = ISD::SETULE; break;
42106         default: NewCC = ISD::SETCC_INVALID; break;
42107         }
42108         if (NewCC != ISD::SETCC_INVALID) {
42109           Cond = DAG.getSetCC(DL, CondVT, Cond0, Cond1, NewCC);
42110           return DAG.getSelect(DL, VT, Cond, LHS, RHS.getOperand(2));
42111         }
42112       }
42113     }
42114   }
42115 
42116   // Check if the first operand is all zeros and the Cond type is vXi1.
42117   // If this is an AVX512 target, we can improve the use of zero masking by
42118   // swapping the operands and inverting the condition.
42119   if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&
42120        Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 &&
42121       ISD::isBuildVectorAllZeros(LHS.getNode()) &&
42122       !ISD::isBuildVectorAllZeros(RHS.getNode())) {
42123     // Invert the cond to not(cond) : xor(op,allones)=not(op)
42124     SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
42125     // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
42126     return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
42127   }
42128 
42129   // Early exit check - bail out if the result type isn't legal.
42130   if (!TLI.isTypeLegal(VT))
42131     return SDValue();
42132 
42133   if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
42134     return V;
42135 
42136   if (SDValue V = combineVSelectToBLENDV(N, DAG, DCI, Subtarget))
42137     return V;
42138 
42139   if (SDValue V = narrowVectorSelect(N, DAG, Subtarget))
42140     return V;
42141 
42142   // select(~Cond, X, Y) -> select(Cond, Y, X)
42143   if (CondVT.getScalarType() != MVT::i1) {
42144     if (SDValue CondNot = IsNOT(Cond, DAG))
42145       return DAG.getNode(N->getOpcode(), DL, VT,
42146                          DAG.getBitcast(CondVT, CondNot), RHS, LHS);
42147     // pcmpgt(X, -1) -> pcmpgt(0, X) to help select/blendv just use the signbit.
42148     if (Cond.getOpcode() == X86ISD::PCMPGT && Cond.hasOneUse() &&
42149         ISD::isBuildVectorAllOnes(Cond.getOperand(1).getNode())) {
42150       Cond = DAG.getNode(X86ISD::PCMPGT, DL, CondVT,
42151                          DAG.getConstant(0, DL, CondVT), Cond.getOperand(0));
42152       return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
42153     }
42154   }
42155 
42156   // Try to optimize vXi1 selects if both operands are either all constants or
42157   // bitcasts from scalar integer type. In that case we can convert the operands
42158   // to integer and use an integer select which will be converted to a CMOV.
42159   // We need to take a little bit of care to avoid creating an i64 type after
42160   // type legalization.
42161   if (N->getOpcode() == ISD::SELECT && VT.isVector() &&
42162       VT.getVectorElementType() == MVT::i1 &&
42163       (DCI.isBeforeLegalize() || (VT != MVT::v64i1 || Subtarget.is64Bit()))) {
42164     EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
42165     bool LHSIsConst = ISD::isBuildVectorOfConstantSDNodes(LHS.getNode());
42166     bool RHSIsConst = ISD::isBuildVectorOfConstantSDNodes(RHS.getNode());
42167 
42168     if ((LHSIsConst ||
42169          (LHS.getOpcode() == ISD::BITCAST &&
42170           LHS.getOperand(0).getValueType() == IntVT)) &&
42171         (RHSIsConst ||
42172          (RHS.getOpcode() == ISD::BITCAST &&
42173           RHS.getOperand(0).getValueType() == IntVT))) {
42174       if (LHSIsConst)
42175         LHS = combinevXi1ConstantToInteger(LHS, DAG);
42176       else
42177         LHS = LHS.getOperand(0);
42178 
42179       if (RHSIsConst)
42180         RHS = combinevXi1ConstantToInteger(RHS, DAG);
42181       else
42182         RHS = RHS.getOperand(0);
42183 
42184       SDValue Select = DAG.getSelect(DL, IntVT, Cond, LHS, RHS);
42185       return DAG.getBitcast(VT, Select);
42186     }
42187   }
42188 
42189   // If this is "((X & C) == 0) ? Y : Z" and C is a constant mask vector of
42190   // single bits, then invert the predicate and swap the select operands.
42191   // This can lower using a vector shift bit-hack rather than mask and compare.
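  // For illustration: with i32 elements and a mask element of 4 (bit 2), a
  // left shift by 29 moves that bit into the sign bit, so the blend can key
  // off the sign bit directly instead of needing a compare against zero.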
42192   if (DCI.isBeforeLegalize() && !Subtarget.hasAVX512() &&
42193       N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
42194       Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1 &&
42195       Cond.getOperand(0).getOpcode() == ISD::AND &&
42196       isNullOrNullSplat(Cond.getOperand(1)) &&
42197       cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
42198       Cond.getOperand(0).getValueType() == VT) {
42199     // The 'and' mask must be composed of power-of-2 constants.
42200     SDValue And = Cond.getOperand(0);
42201     auto *C = isConstOrConstSplat(And.getOperand(1));
42202     if (C && C->getAPIntValue().isPowerOf2()) {
42203       // vselect (X & C == 0), LHS, RHS --> vselect (X & C != 0), RHS, LHS
42204       SDValue NotCond =
42205           DAG.getSetCC(DL, CondVT, And, Cond.getOperand(1), ISD::SETNE);
42206       return DAG.getSelect(DL, VT, NotCond, RHS, LHS);
42207     }
42208 
42209     // If we have a non-splat but still powers-of-2 mask, AVX1 can use pmulld
42210     // and AVX2 can use vpsllv{dq}. 8-bit lacks a proper shift or multiply.
42211     // 16-bit lacks a proper blendv.
42212     unsigned EltBitWidth = VT.getScalarSizeInBits();
42213     bool CanShiftBlend =
42214         TLI.isTypeLegal(VT) && ((Subtarget.hasAVX() && EltBitWidth == 32) ||
42215                                 (Subtarget.hasAVX2() && EltBitWidth == 64) ||
42216                                 (Subtarget.hasXOP()));
42217     if (CanShiftBlend &&
42218         ISD::matchUnaryPredicate(And.getOperand(1), [](ConstantSDNode *C) {
42219           return C->getAPIntValue().isPowerOf2();
42220         })) {
42221       // Create a left-shift constant to get the mask bits over to the sign-bit.
42222       SDValue Mask = And.getOperand(1);
42223       SmallVector<int, 32> ShlVals;
42224       for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
42225         auto *MaskVal = cast<ConstantSDNode>(Mask.getOperand(i));
42226         ShlVals.push_back(EltBitWidth - 1 -
42227                           MaskVal->getAPIntValue().exactLogBase2());
42228       }
42229       // vsel ((X & C) == 0), LHS, RHS --> vsel ((shl X, C') < 0), RHS, LHS
42230       SDValue ShlAmt = getConstVector(ShlVals, VT.getSimpleVT(), DAG, DL);
42231       SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And.getOperand(0), ShlAmt);
42232       SDValue NewCond =
42233           DAG.getSetCC(DL, CondVT, Shl, Cond.getOperand(1), ISD::SETLT);
42234       return DAG.getSelect(DL, VT, NewCond, RHS, LHS);
42235     }
42236   }
42237 
42238   return SDValue();
42239 }
42240 
42241 /// Combine:
42242 ///   (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
42243 /// to:
42244 ///   (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
42245 /// i.e., reusing the EFLAGS produced by the LOCKed instruction.
42246 /// Note that this is only legal for some op/cc combinations.
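/// For illustration, with C code such as `if (atomic_fetch_add(&x, 1) < 0)`,
/// instead of keeping the loaded value alive just to compare it with zero
/// (COND_S), the flags set by the `lock add` itself are reused with COND_LE,
/// since old < 0 is equivalent to old + 1 <= 0 once the condition code
/// accounts for overflow.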
42247 static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
42248                                        SelectionDAG &DAG,
42249                                        const X86Subtarget &Subtarget) {
42250   // This combine only operates on CMP-like nodes.
42251   if (!(Cmp.getOpcode() == X86ISD::CMP ||
42252         (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
42253     return SDValue();
42254 
42255   // Can't replace the cmp if it has more uses than the one we're looking at.
42256   // FIXME: We would like to be able to handle this, but would need to make sure
42257   // all uses were updated.
42258   if (!Cmp.hasOneUse())
42259     return SDValue();
42260 
42261   // This only applies to variations of the common case:
42262   //   (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
42263   //   (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
42264   //   (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
42265   //   (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
42266   // Using the proper condcodes (see below), overflow is checked for.
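  // For example, in (icmp slt x, 0) -> (icmp sle (add x, 1), 0), the
  // x == INT_MAX case still folds correctly: the add overflows, so SF == OF
  // and COND_LE evaluates to false, matching the original slt.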
42267 
42268   // FIXME: We can generalize both constraints:
42269   // - XOR/OR/AND (if they were made to survive AtomicExpand)
42270   // - LHS != 1
42271   // if the result is compared.
42272 
42273   SDValue CmpLHS = Cmp.getOperand(0);
42274   SDValue CmpRHS = Cmp.getOperand(1);
42275   EVT CmpVT = CmpLHS.getValueType();
42276 
42277   if (!CmpLHS.hasOneUse())
42278     return SDValue();
42279 
42280   unsigned Opc = CmpLHS.getOpcode();
42281   if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
42282     return SDValue();
42283 
42284   SDValue OpRHS = CmpLHS.getOperand(2);
42285   auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
42286   if (!OpRHSC)
42287     return SDValue();
42288 
42289   APInt Addend = OpRHSC->getAPIntValue();
42290   if (Opc == ISD::ATOMIC_LOAD_SUB)
42291     Addend = -Addend;
42292 
42293   auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
42294   if (!CmpRHSC)
42295     return SDValue();
42296 
42297   APInt Comparison = CmpRHSC->getAPIntValue();
42298   APInt NegAddend = -Addend;
42299 
42300   // See if we can adjust the CC to make the comparison match the negated
42301   // addend.
42302   if (Comparison != NegAddend) {
42303     APInt IncComparison = Comparison + 1;
42304     if (IncComparison == NegAddend) {
42305       if (CC == X86::COND_A && !Comparison.isMaxValue()) {
42306         Comparison = IncComparison;
42307         CC = X86::COND_AE;
42308       } else if (CC == X86::COND_LE && !Comparison.isMaxSignedValue()) {
42309         Comparison = IncComparison;
42310         CC = X86::COND_L;
42311       }
42312     }
42313     APInt DecComparison = Comparison - 1;
42314     if (DecComparison == NegAddend) {
42315       if (CC == X86::COND_AE && !Comparison.isMinValue()) {
42316         Comparison = DecComparison;
42317         CC = X86::COND_A;
42318       } else if (CC == X86::COND_L && !Comparison.isMinSignedValue()) {
42319         Comparison = DecComparison;
42320         CC = X86::COND_LE;
42321       }
42322     }
42323   }
42324 
42325   // If the addend is the negation of the comparison value, then we can do
42326   // a full comparison by emitting the atomic arithmetic as a locked sub.
42327   if (Comparison == NegAddend) {
42328     // The CC is fine, but we need to rewrite the LHS of the comparison as an
42329     // atomic sub.
42330     auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
42331     auto AtomicSub = DAG.getAtomic(
42332         ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpVT,
42333         /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
42334         /*RHS*/ DAG.getConstant(NegAddend, SDLoc(CmpRHS), CmpVT),
42335         AN->getMemOperand());
42336     auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget);
42337     DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
42338     DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
42339     return LockOp;
42340   }
42341 
42342   // We can handle comparisons with zero in a number of cases by manipulating
42343   // the CC used.
42344   if (!Comparison.isNullValue())
42345     return SDValue();
42346 
42347   if (CC == X86::COND_S && Addend == 1)
42348     CC = X86::COND_LE;
42349   else if (CC == X86::COND_NS && Addend == 1)
42350     CC = X86::COND_G;
42351   else if (CC == X86::COND_G && Addend == -1)
42352     CC = X86::COND_GE;
42353   else if (CC == X86::COND_LE && Addend == -1)
42354     CC = X86::COND_L;
42355   else
42356     return SDValue();
42357 
42358   SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
42359   DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
42360   DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
42361   return LockOp;
42362 }
42363 
42364 // Check whether a boolean test is testing a boolean value generated by
42365 // X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
42366 // code.
42367 //
42368 // Simplify the following patterns:
42369 // (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
42370 // (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
42371 // to (Op EFLAGS Cond)
42372 //
42373 // (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
42374 // (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
42375 // to (Op EFLAGS !Cond)
42376 //
42377 // where Op could be BRCOND or CMOV.
42378 //
42379 static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
42380   // This combine only operates on CMP-like nodes.
42381   if (!(Cmp.getOpcode() == X86ISD::CMP ||
42382         (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
42383     return SDValue();
42384 
42385   // Quit if not used as a boolean value.
42386   if (CC != X86::COND_E && CC != X86::COND_NE)
42387     return SDValue();
42388 
42389   // Check CMP operands. One of them should be 0 or 1 and the other should be
42390   // a SetCC or extended from it.
42391   SDValue Op1 = Cmp.getOperand(0);
42392   SDValue Op2 = Cmp.getOperand(1);
42393 
42394   SDValue SetCC;
42395   const ConstantSDNode* C = nullptr;
42396   bool needOppositeCond = (CC == X86::COND_E);
42397   bool checkAgainstTrue = false; // Is it a comparison against 1?
42398 
42399   if ((C = dyn_cast<ConstantSDNode>(Op1)))
42400     SetCC = Op2;
42401   else if ((C = dyn_cast<ConstantSDNode>(Op2)))
42402     SetCC = Op1;
42403   else // Quit if neither operand is a constant.
42404     return SDValue();
42405 
42406   if (C->getZExtValue() == 1) {
42407     needOppositeCond = !needOppositeCond;
42408     checkAgainstTrue = true;
42409   } else if (C->getZExtValue() != 0)
42410     // Quit if the constant is neither 0 nor 1.
42411     return SDValue();
42412 
42413   bool truncatedToBoolWithAnd = false;
42414   // Skip (zext $x), (trunc $x), or (and $x, 1) node.
42415   while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
42416          SetCC.getOpcode() == ISD::TRUNCATE ||
42417          SetCC.getOpcode() == ISD::AND) {
42418     if (SetCC.getOpcode() == ISD::AND) {
42419       int OpIdx = -1;
42420       if (isOneConstant(SetCC.getOperand(0)))
42421         OpIdx = 1;
42422       if (isOneConstant(SetCC.getOperand(1)))
42423         OpIdx = 0;
42424       if (OpIdx < 0)
42425         break;
42426       SetCC = SetCC.getOperand(OpIdx);
42427       truncatedToBoolWithAnd = true;
42428     } else
42429       SetCC = SetCC.getOperand(0);
42430   }
42431 
42432   switch (SetCC.getOpcode()) {
42433   case X86ISD::SETCC_CARRY:
42434     // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
42435     // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
42436     // i.e. it's a comparison against true but the result of SETCC_CARRY is not
42437     // truncated to i1 using 'and'.
42438     if (checkAgainstTrue && !truncatedToBoolWithAnd)
42439       break;
42440     assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
42441            "Invalid use of SETCC_CARRY!");
42442     LLVM_FALLTHROUGH;
42443   case X86ISD::SETCC:
42444     // Set the condition code or opposite one if necessary.
42445     CC = X86::CondCode(SetCC.getConstantOperandVal(0));
42446     if (needOppositeCond)
42447       CC = X86::GetOppositeBranchCondition(CC);
42448     return SetCC.getOperand(1);
42449   case X86ISD::CMOV: {
42450     // Check whether false/true value has canonical one, i.e. 0 or 1.
42451     ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
42452     ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
42453     // Quit if true value is not a constant.
42454     if (!TVal)
42455       return SDValue();
42456     // Quit if false value is not a constant.
42457     if (!FVal) {
42458       SDValue Op = SetCC.getOperand(0);
42459       // Skip 'zext' or 'trunc' node.
42460       if (Op.getOpcode() == ISD::ZERO_EXTEND ||
42461           Op.getOpcode() == ISD::TRUNCATE)
42462         Op = Op.getOperand(0);
42463       // A special case for rdrand/rdseed, where 0 is set if false cond is
42464       // found.
42465       if ((Op.getOpcode() != X86ISD::RDRAND &&
42466            Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
42467         return SDValue();
42468     }
42469     // Quit if false value is not the constant 0 or 1.
42470     bool FValIsFalse = true;
42471     if (FVal && FVal->getZExtValue() != 0) {
42472       if (FVal->getZExtValue() != 1)
42473         return SDValue();
42474       // If FVal is 1, opposite cond is needed.
42475       needOppositeCond = !needOppositeCond;
42476       FValIsFalse = false;
42477     }
42478     // Quit if TVal is not the constant opposite of FVal.
42479     if (FValIsFalse && TVal->getZExtValue() != 1)
42480       return SDValue();
42481     if (!FValIsFalse && TVal->getZExtValue() != 0)
42482       return SDValue();
42483     CC = X86::CondCode(SetCC.getConstantOperandVal(2));
42484     if (needOppositeCond)
42485       CC = X86::GetOppositeBranchCondition(CC);
42486     return SetCC.getOperand(3);
42487   }
42488   }
42489 
42490   return SDValue();
42491 }
42492 
42493 /// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
42494 /// Match:
42495 ///   (X86or (X86setcc) (X86setcc))
42496 ///   (X86cmp (and (X86setcc) (X86setcc)), 0)
42497 static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
42498                                            X86::CondCode &CC1, SDValue &Flags,
42499                                            bool &isAnd) {
42500   if (Cond->getOpcode() == X86ISD::CMP) {
42501     if (!isNullConstant(Cond->getOperand(1)))
42502       return false;
42503 
42504     Cond = Cond->getOperand(0);
42505   }
42506 
42507   isAnd = false;
42508 
42509   SDValue SetCC0, SetCC1;
42510   switch (Cond->getOpcode()) {
42511   default: return false;
42512   case ISD::AND:
42513   case X86ISD::AND:
42514     isAnd = true;
42515     LLVM_FALLTHROUGH;
42516   case ISD::OR:
42517   case X86ISD::OR:
42518     SetCC0 = Cond->getOperand(0);
42519     SetCC1 = Cond->getOperand(1);
42520     break;
42521   };
42522 
42523   // Make sure we have SETCC nodes, using the same flags value.
42524   if (SetCC0.getOpcode() != X86ISD::SETCC ||
42525       SetCC1.getOpcode() != X86ISD::SETCC ||
42526       SetCC0->getOperand(1) != SetCC1->getOperand(1))
42527     return false;
42528 
42529   CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
42530   CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
42531   Flags = SetCC0->getOperand(1);
42532   return true;
42533 }
42534 
42535 // When legalizing carry, we create carries via add X, -1
42536 // If that comes from an actual carry, via setcc, we use the
42537 // carry directly.
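// For a 0/1 value X, (add X, -1) produces a carry-out exactly when X != 0, so
// if X itself came from a setcc of some flags F, that carry is just the
// original condition on F and F can be used directly.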
42538 static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
42539   if (EFLAGS.getOpcode() == X86ISD::ADD) {
42540     if (isAllOnesConstant(EFLAGS.getOperand(1))) {
42541       SDValue Carry = EFLAGS.getOperand(0);
42542       while (Carry.getOpcode() == ISD::TRUNCATE ||
42543              Carry.getOpcode() == ISD::ZERO_EXTEND ||
42544              Carry.getOpcode() == ISD::SIGN_EXTEND ||
42545              Carry.getOpcode() == ISD::ANY_EXTEND ||
42546              (Carry.getOpcode() == ISD::AND &&
42547               isOneConstant(Carry.getOperand(1))))
42548         Carry = Carry.getOperand(0);
42549       if (Carry.getOpcode() == X86ISD::SETCC ||
42550           Carry.getOpcode() == X86ISD::SETCC_CARRY) {
42551         // TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB?
42552         uint64_t CarryCC = Carry.getConstantOperandVal(0);
42553         SDValue CarryOp1 = Carry.getOperand(1);
42554         if (CarryCC == X86::COND_B)
42555           return CarryOp1;
42556         if (CarryCC == X86::COND_A) {
42557           // Try to convert COND_A into COND_B in an attempt to facilitate
42558           // materializing "setb reg".
42559           //
42560           // Do not flip "e > c", where "c" is a constant, because Cmp
42561           // instruction cannot take an immediate as its first operand.
42562           //
42563           if (CarryOp1.getOpcode() == X86ISD::SUB &&
42564               CarryOp1.getNode()->hasOneUse() &&
42565               CarryOp1.getValueType().isInteger() &&
42566               !isa<ConstantSDNode>(CarryOp1.getOperand(1))) {
42567             SDValue SubCommute =
42568                 DAG.getNode(X86ISD::SUB, SDLoc(CarryOp1), CarryOp1->getVTList(),
42569                             CarryOp1.getOperand(1), CarryOp1.getOperand(0));
42570             return SDValue(SubCommute.getNode(), CarryOp1.getResNo());
42571           }
42572         }
42573         // If this is a check of the z flag of an add with 1, switch to the
42574         // C flag.
42575         if (CarryCC == X86::COND_E &&
42576             CarryOp1.getOpcode() == X86ISD::ADD &&
42577             isOneConstant(CarryOp1.getOperand(1)))
42578           return CarryOp1;
42579       }
42580     }
42581   }
42582 
42583   return SDValue();
42584 }
42585 
42586 /// If we are inverting an PTEST/TESTP operand, attempt to adjust the CC
42587 /// to avoid the inversion.
42588 static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
42589                               SelectionDAG &DAG,
42590                               const X86Subtarget &Subtarget) {
42591   // TODO: Handle X86ISD::KTEST/X86ISD::KORTEST.
42592   if (EFLAGS.getOpcode() != X86ISD::PTEST &&
42593       EFLAGS.getOpcode() != X86ISD::TESTP)
42594     return SDValue();
42595 
42596   // PTEST/TESTP sets EFLAGS as:
42597   // TESTZ: ZF = (Op0 & Op1) == 0
42598   // TESTC: CF = (~Op0 & Op1) == 0
42599   // TESTNZC: ZF == 0 && CF == 0
42600   EVT VT = EFLAGS.getValueType();
42601   SDValue Op0 = EFLAGS.getOperand(0);
42602   SDValue Op1 = EFLAGS.getOperand(1);
42603   EVT OpVT = Op0.getValueType();
42604 
42605   // TESTZ(~X,Y) == TESTC(X,Y) and TESTC(~X,Y) == TESTZ(X,Y) - drop the NOT and swap the ZF/CF conditions (TESTNZC is unaffected).
42606   if (SDValue NotOp0 = IsNOT(Op0, DAG)) {
42607     X86::CondCode InvCC;
42608     switch (CC) {
42609     case X86::COND_B:
42610       // testc -> testz.
42611       InvCC = X86::COND_E;
42612       break;
42613     case X86::COND_AE:
42614       // !testc -> !testz.
42615       InvCC = X86::COND_NE;
42616       break;
42617     case X86::COND_E:
42618       // testz -> testc.
42619       InvCC = X86::COND_B;
42620       break;
42621     case X86::COND_NE:
42622       // !testz -> !testc.
42623       InvCC = X86::COND_AE;
42624       break;
42625     case X86::COND_A:
42626     case X86::COND_BE:
42627       // testnzc -> testnzc (no change).
42628       InvCC = CC;
42629       break;
42630     default:
42631       InvCC = X86::COND_INVALID;
42632       break;
42633     }
42634 
42635     if (InvCC != X86::COND_INVALID) {
42636       CC = InvCC;
42637       return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
42638                          DAG.getBitcast(OpVT, NotOp0), Op1);
42639     }
42640   }
42641 
42642   if (CC == X86::COND_E || CC == X86::COND_NE) {
42643     // TESTZ(X,~Y) == TESTC(Y,X)
42644     if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
42645       CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
42646       return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
42647                          DAG.getBitcast(OpVT, NotOp1), Op0);
42648     }
42649 
42650     if (Op0 == Op1) {
42651       SDValue BC = peekThroughBitcasts(Op0);
42652       EVT BCVT = BC.getValueType();
42653       assert(BCVT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(BCVT) &&
42654              "Unexpected vector type");
42655 
42656       // TESTZ(AND(X,Y),AND(X,Y)) == TESTZ(X,Y)
42657       if (BC.getOpcode() == ISD::AND || BC.getOpcode() == X86ISD::FAND) {
42658         return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
42659                            DAG.getBitcast(OpVT, BC.getOperand(0)),
42660                            DAG.getBitcast(OpVT, BC.getOperand(1)));
42661       }
42662 
42663       // TESTZ(AND(~X,Y),AND(~X,Y)) == TESTC(X,Y)
42664       if (BC.getOpcode() == X86ISD::ANDNP || BC.getOpcode() == X86ISD::FANDN) {
42665         CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
42666         return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
42667                            DAG.getBitcast(OpVT, BC.getOperand(0)),
42668                            DAG.getBitcast(OpVT, BC.getOperand(1)));
42669       }
42670 
42671       // If every element is an all-sign value, see if we can use MOVMSK to
42672       // more efficiently extract the sign bits and compare that.
42673       // TODO: Handle TESTC with comparison inversion.
42674       // TODO: Can we remove SimplifyMultipleUseDemandedBits and rely on
42675       // MOVMSK combines to make sure it's never worse than PTEST?
42676       unsigned EltBits = BCVT.getScalarSizeInBits();
42677       if (DAG.ComputeNumSignBits(BC) == EltBits) {
42678         assert(VT == MVT::i32 && "Expected i32 EFLAGS comparison result");
42679         APInt SignMask = APInt::getSignMask(EltBits);
42680         const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42681         if (SDValue Res =
42682                 TLI.SimplifyMultipleUseDemandedBits(BC, SignMask, DAG)) {
42683           // For vXi16 cases we need to use pmovmskb and extract every other
42684           // sign bit.
42685           SDLoc DL(EFLAGS);
42686           if (EltBits == 16) {
42687             MVT MovmskVT = BCVT.is128BitVector() ? MVT::v16i8 : MVT::v32i8;
42688             Res = DAG.getBitcast(MovmskVT, Res);
42689             Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
42690             Res = DAG.getNode(ISD::AND, DL, MVT::i32, Res,
42691                               DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
42692           } else {
42693             Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
42694           }
42695           return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Res,
42696                              DAG.getConstant(0, DL, MVT::i32));
42697         }
42698       }
42699     }
42700 
42701     // TESTZ(-1,X) == TESTZ(X,X)
42702     if (ISD::isBuildVectorAllOnes(Op0.getNode()))
42703       return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op1);
42704 
42705     // TESTZ(X,-1) == TESTZ(X,X)
42706     if (ISD::isBuildVectorAllOnes(Op1.getNode()))
42707       return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0);
42708   }
42709 
42710   return SDValue();
42711 }
42712 
42713 // Attempt to simplify the MOVMSK input based on the comparison type.
42714 static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
42715                                   SelectionDAG &DAG,
42716                                   const X86Subtarget &Subtarget) {
42717   // Handle eq/ne against zero (any_of).
42718   // Handle eq/ne against -1 (all_of).
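  // For illustration: (movmsk X) == 0 means no element has its sign bit set,
  // while (movmsk X) == (1 << NumElts) - 1 means every element does; combined
  // with eq/ne these give the any_of and all_of forms handled below.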
42719   if (!(CC == X86::COND_E || CC == X86::COND_NE))
42720     return SDValue();
42721   if (EFLAGS.getValueType() != MVT::i32)
42722     return SDValue();
42723   unsigned CmpOpcode = EFLAGS.getOpcode();
42724   if (CmpOpcode != X86ISD::CMP && CmpOpcode != X86ISD::SUB)
42725     return SDValue();
42726   auto *CmpConstant = dyn_cast<ConstantSDNode>(EFLAGS.getOperand(1));
42727   if (!CmpConstant)
42728     return SDValue();
42729   const APInt &CmpVal = CmpConstant->getAPIntValue();
42730 
42731   SDValue CmpOp = EFLAGS.getOperand(0);
42732   unsigned CmpBits = CmpOp.getValueSizeInBits();
42733   assert(CmpBits == CmpVal.getBitWidth() && "Value size mismatch");
42734 
42735   // Peek through any truncate.
42736   if (CmpOp.getOpcode() == ISD::TRUNCATE)
42737     CmpOp = CmpOp.getOperand(0);
42738 
42739   // Bail if we don't find a MOVMSK.
42740   if (CmpOp.getOpcode() != X86ISD::MOVMSK)
42741     return SDValue();
42742 
42743   SDValue Vec = CmpOp.getOperand(0);
42744   MVT VecVT = Vec.getSimpleValueType();
42745   assert((VecVT.is128BitVector() || VecVT.is256BitVector()) &&
42746          "Unexpected MOVMSK operand");
42747   unsigned NumElts = VecVT.getVectorNumElements();
42748   unsigned NumEltBits = VecVT.getScalarSizeInBits();
42749 
42750   bool IsAnyOf = CmpOpcode == X86ISD::CMP && CmpVal.isNullValue();
42751   bool IsAllOf = CmpOpcode == X86ISD::SUB && NumElts <= CmpBits &&
42752                  CmpVal.isMask(NumElts);
42753   if (!IsAnyOf && !IsAllOf)
42754     return SDValue();
42755 
42756   // See if we can peek through to a vector with a wider element type, if the
42757   // signbits extend down to all the sub-elements as well.
42758   // Calling MOVMSK with the wider type, avoiding the bitcast, helps expose
42759   // potential SimplifyDemandedBits/Elts cases.
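  // e.g. a v16i8 MOVMSK of a bitcast from a v4i32 value whose elements are
  // all-sign-bits can instead use a v4i32 MOVMSK, comparing against 0xF
  // rather than 0xFFFF in the all_of case.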
42760   if (Vec.getOpcode() == ISD::BITCAST) {
42761     SDValue BC = peekThroughBitcasts(Vec);
42762     MVT BCVT = BC.getSimpleValueType();
42763     unsigned BCNumElts = BCVT.getVectorNumElements();
42764     unsigned BCNumEltBits = BCVT.getScalarSizeInBits();
42765     if ((BCNumEltBits == 32 || BCNumEltBits == 64) &&
42766         BCNumEltBits > NumEltBits &&
42767         DAG.ComputeNumSignBits(BC) > (BCNumEltBits - NumEltBits)) {
42768       SDLoc DL(EFLAGS);
42769       unsigned CmpMask = IsAnyOf ? 0 : ((1 << BCNumElts) - 1);
42770       return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
42771                          DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, BC),
42772                          DAG.getConstant(CmpMask, DL, MVT::i32));
42773     }
42774   }
42775 
42776   // MOVMSK(PCMPEQ(X,0)) == -1 -> PTESTZ(X,X).
42777   // MOVMSK(PCMPEQ(X,0)) != -1 -> !PTESTZ(X,X).
42778   if (IsAllOf && Subtarget.hasSSE41()) {
42779     SDValue BC = peekThroughBitcasts(Vec);
42780     if (BC.getOpcode() == X86ISD::PCMPEQ &&
42781         ISD::isBuildVectorAllZeros(BC.getOperand(1).getNode())) {
42782       MVT TestVT = VecVT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
42783       SDValue V = DAG.getBitcast(TestVT, BC.getOperand(0));
42784       return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
42785     }
42786   }
42787 
42788   // See if we can avoid a PACKSS by calling MOVMSK on the sources.
42789   // For vXi16 cases we can use a v2Xi8 PMOVMSKB. We must mask out
42790   // sign bits prior to the comparison with zero unless we know that
42791   // the vXi16 splats the sign bit down to the lower i8 half.
42792   // TODO: Handle all_of patterns.
42793   if (Vec.getOpcode() == X86ISD::PACKSS && VecVT == MVT::v16i8) {
42794     SDValue VecOp0 = Vec.getOperand(0);
42795     SDValue VecOp1 = Vec.getOperand(1);
42796     bool SignExt0 = DAG.ComputeNumSignBits(VecOp0) > 8;
42797     bool SignExt1 = DAG.ComputeNumSignBits(VecOp1) > 8;
42798     // PMOVMSKB(PACKSSBW(X, undef)) -> PMOVMSKB(BITCAST_v16i8(X)) & 0xAAAA.
42799     if (IsAnyOf && CmpBits == 8 && VecOp1.isUndef()) {
42800       SDLoc DL(EFLAGS);
42801       SDValue Result = DAG.getBitcast(MVT::v16i8, VecOp0);
42802       Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
42803       Result = DAG.getZExtOrTrunc(Result, DL, MVT::i16);
42804       if (!SignExt0) {
42805         Result = DAG.getNode(ISD::AND, DL, MVT::i16, Result,
42806                              DAG.getConstant(0xAAAA, DL, MVT::i16));
42807       }
42808       return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
42809                          DAG.getConstant(0, DL, MVT::i16));
42810     }
42811     // PMOVMSKB(PACKSSBW(LO(X), HI(X)))
42812     // -> PMOVMSKB(BITCAST_v32i8(X)) & 0xAAAAAAAA.
42813     if (CmpBits >= 16 && Subtarget.hasInt256() &&
42814         VecOp0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
42815         VecOp1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
42816         VecOp0.getOperand(0) == VecOp1.getOperand(0) &&
42817         VecOp0.getConstantOperandAPInt(1) == 0 &&
42818         VecOp1.getConstantOperandAPInt(1) == 8 &&
42819         (IsAnyOf || (SignExt0 && SignExt1))) {
42820       SDLoc DL(EFLAGS);
42821       SDValue Result = DAG.getBitcast(MVT::v32i8, VecOp0.getOperand(0));
42822       Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
42823       unsigned CmpMask = IsAnyOf ? 0 : 0xFFFFFFFF;
42824       if (!SignExt0 || !SignExt1) {
42825         assert(IsAnyOf && "Only perform v16i16 signmasks for any_of patterns");
42826         Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result,
42827                              DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
42828       }
42829       return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
42830                          DAG.getConstant(CmpMask, DL, MVT::i32));
42831     }
42832   }
42833 
42834   // MOVMSK(SHUFFLE(X,u)) -> MOVMSK(X) iff every element is referenced.
42835   SmallVector<int, 32> ShuffleMask;
42836   SmallVector<SDValue, 2> ShuffleInputs;
42837   if (NumElts <= CmpBits &&
42838       getTargetShuffleInputs(peekThroughBitcasts(Vec), ShuffleInputs,
42839                              ShuffleMask, DAG) &&
42840       ShuffleInputs.size() == 1 && !isAnyZeroOrUndef(ShuffleMask) &&
42841       ShuffleInputs[0].getValueSizeInBits() == VecVT.getSizeInBits()) {
42842     unsigned NumShuffleElts = ShuffleMask.size();
42843     APInt DemandedElts = APInt::getNullValue(NumShuffleElts);
42844     for (int M : ShuffleMask) {
42845       assert(0 <= M && M < (int)NumShuffleElts && "Bad unary shuffle index");
42846       DemandedElts.setBit(M);
42847     }
42848     if (DemandedElts.isAllOnesValue()) {
42849       SDLoc DL(EFLAGS);
42850       SDValue Result = DAG.getBitcast(VecVT, ShuffleInputs[0]);
42851       Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
42852       Result =
42853           DAG.getZExtOrTrunc(Result, DL, EFLAGS.getOperand(0).getValueType());
42854       return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
42855                          EFLAGS.getOperand(1));
42856     }
42857   }
42858 
42859   return SDValue();
42860 }
42861 
42862 /// Optimize an EFLAGS definition used according to the condition code \p CC
42863 /// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
42864 /// uses of chain values.
42865 static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
42866                                   SelectionDAG &DAG,
42867                                   const X86Subtarget &Subtarget) {
42868   if (CC == X86::COND_B)
42869     if (SDValue Flags = combineCarryThroughADD(EFLAGS, DAG))
42870       return Flags;
42871 
42872   if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
42873     return R;
42874 
42875   if (SDValue R = combinePTESTCC(EFLAGS, CC, DAG, Subtarget))
42876     return R;
42877 
42878   if (SDValue R = combineSetCCMOVMSK(EFLAGS, CC, DAG, Subtarget))
42879     return R;
42880 
42881   return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
42882 }
42883 
42884 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
42885 static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
42886                            TargetLowering::DAGCombinerInfo &DCI,
42887                            const X86Subtarget &Subtarget) {
42888   SDLoc DL(N);
42889 
42890   SDValue FalseOp = N->getOperand(0);
42891   SDValue TrueOp = N->getOperand(1);
42892   X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
42893   SDValue Cond = N->getOperand(3);
42894 
42895   // cmov X, X, ?, ? --> X
42896   if (TrueOp == FalseOp)
42897     return TrueOp;
42898 
42899   // Try to simplify the EFLAGS and condition code operands.
42900   // We can't always do this as FCMOV only supports a subset of X86 conditions.
42901   if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
42902     if (!(FalseOp.getValueType() == MVT::f80 ||
42903           (FalseOp.getValueType() == MVT::f64 && !Subtarget.hasSSE2()) ||
42904           (FalseOp.getValueType() == MVT::f32 && !Subtarget.hasSSE1())) ||
42905         !Subtarget.hasCMov() || hasFPCMov(CC)) {
42906       SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8),
42907                        Flags};
42908       return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
42909     }
42910   }
42911 
42912   // If this is a select between two integer constants, try to do some
42913   // optimizations.  Note that the operands are ordered the opposite of SELECT
42914   // operands.
42915   if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
42916     if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
42917       // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
42918       // larger than FalseC (the false value).
42919       if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
42920         CC = X86::GetOppositeBranchCondition(CC);
42921         std::swap(TrueC, FalseC);
42922         std::swap(TrueOp, FalseOp);
42923       }
42924 
42925       // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3.  Likewise for any pow2/0.
42926       // This is efficient for any integer data type (including i8/i16) and
42927       // shift amount.
42928       if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
42929         Cond = getSETCC(CC, Cond, DL, DAG);
42930 
42931         // Zero extend the condition if needed.
42932         Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
42933 
42934         unsigned ShAmt = TrueC->getAPIntValue().logBase2();
42935         Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
42936                            DAG.getConstant(ShAmt, DL, MVT::i8));
42937         return Cond;
42938       }
42939 
      // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst. This is
      // efficient for any integer data type, including i8/i16.
42942       if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
42943         Cond = getSETCC(CC, Cond, DL, DAG);
42944 
42945         // Zero extend the condition if needed.
42946         Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
42947                            FalseC->getValueType(0), Cond);
42948         Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
42949                            SDValue(FalseC, 0));
42950         return Cond;
42951       }
42952 
42953       // Optimize cases that will turn into an LEA instruction.  This requires
42954       // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
42955       if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
42956         APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
42957         assert(Diff.getBitWidth() == N->getValueType(0).getSizeInBits() &&
42958                "Implicit constant truncation");
42959 
42960         bool isFastMultiplier = false;
42961         if (Diff.ult(10)) {
42962           switch (Diff.getZExtValue()) {
42963           default: break;
42964           case 1:  // result = add base, cond
42965           case 2:  // result = lea base(    , cond*2)
42966           case 3:  // result = lea base(cond, cond*2)
42967           case 4:  // result = lea base(    , cond*4)
42968           case 5:  // result = lea base(cond, cond*4)
42969           case 8:  // result = lea base(    , cond*8)
42970           case 9:  // result = lea base(cond, cond*8)
42971             isFastMultiplier = true;
42972             break;
42973           }
42974         }
42975 
42976         if (isFastMultiplier) {
42977           Cond = getSETCC(CC, Cond, DL ,DAG);
42978           // Zero extend the condition if needed.
42979           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
42980                              Cond);
42981           // Scale the condition by the difference.
42982           if (Diff != 1)
42983             Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
42984                                DAG.getConstant(Diff, DL, Cond.getValueType()));
42985 
42986           // Add the base if non-zero.
42987           if (FalseC->getAPIntValue() != 0)
42988             Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
42989                                SDValue(FalseC, 0));
42990           return Cond;
42991         }
42992       }
42993     }
42994   }
42995 
  // Handle these cases:
  //   (select (x != c), e, c) -> (select (x != c), e, x),
  //   (select (x == c), c, e) -> (select (x == c), x, e)
  // where c is an integer constant, and the "select" is the combination
  // of CMOV and CMP.
43001   //
43002   // The rationale for this change is that the conditional-move from a constant
43003   // needs two instructions, however, conditional-move from a register needs
43004   // only one instruction.
43005   //
43006   // CAVEAT: By replacing a constant with a symbolic value, it may obscure
43007   //  some instruction-combining opportunities. This opt needs to be
43008   //  postponed as late as possible.
43009   //
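  // Illustrative example (hypothetical constant): with c == 42,
  //   (select (x != 42), e, 42) has to materialize 42 in a register before
  //   the CMOV, whereas (select (x != 42), e, x) is a single CMOV from the
  //   register already holding x.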
43010   if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
    // The DCI.xxxx conditions are provided to postpone the optimization as
    // late as possible.
43013 
43014     ConstantSDNode *CmpAgainst = nullptr;
43015     if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
43016         (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
43017         !isa<ConstantSDNode>(Cond.getOperand(0))) {
43018 
43019       if (CC == X86::COND_NE &&
43020           CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
43021         CC = X86::GetOppositeBranchCondition(CC);
43022         std::swap(TrueOp, FalseOp);
43023       }
43024 
43025       if (CC == X86::COND_E &&
43026           CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
43027         SDValue Ops[] = {FalseOp, Cond.getOperand(0),
43028                          DAG.getTargetConstant(CC, DL, MVT::i8), Cond};
43029         return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
43030       }
43031     }
43032   }
43033 
43034   // Fold and/or of setcc's to double CMOV:
43035   //   (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
43036   //   (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
43037   //
43038   // This combine lets us generate:
43039   //   cmovcc1 (jcc1 if we don't have CMOV)
43040   //   cmovcc2 (same)
43041   // instead of:
43042   //   setcc1
43043   //   setcc2
43044   //   and/or
43045   //   cmovne (jne if we don't have CMOV)
43046   // When we can't use the CMOV instruction, it might increase branch
43047   // mispredicts.
43048   // When we can use CMOV, or when there is no mispredict, this improves
43049   // throughput and reduces register pressure.
43050   //
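  // For example (illustrative, with cc1 = COND_E and cc2 = COND_B):
  //   (CMOV F, T, ((COND_E | COND_B) != 0))
  //     -> (CMOV (CMOV F, T, COND_E), T, COND_B)
  //   (CMOV F, T, ((COND_E & COND_B) != 0))
  //     -> (CMOV (CMOV T, F, COND_NE), F, COND_AE)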
43051   if (CC == X86::COND_NE) {
43052     SDValue Flags;
43053     X86::CondCode CC0, CC1;
43054     bool isAndSetCC;
43055     if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
43056       if (isAndSetCC) {
43057         std::swap(FalseOp, TrueOp);
43058         CC0 = X86::GetOppositeBranchCondition(CC0);
43059         CC1 = X86::GetOppositeBranchCondition(CC1);
43060       }
43061 
43062       SDValue LOps[] = {FalseOp, TrueOp,
43063                         DAG.getTargetConstant(CC0, DL, MVT::i8), Flags};
43064       SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps);
43065       SDValue Ops[] = {LCMOV, TrueOp, DAG.getTargetConstant(CC1, DL, MVT::i8),
43066                        Flags};
43067       SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
43068       return CMOV;
43069     }
43070   }
43071 
43072   // Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) ->
43073   //      (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2)
43074   // Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) ->
43075   //    (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2)
43076   if ((CC == X86::COND_NE || CC == X86::COND_E) &&
43077       Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) {
43078     SDValue Add = TrueOp;
43079     SDValue Const = FalseOp;
43080     // Canonicalize the condition code for easier matching and output.
43081     if (CC == X86::COND_E)
43082       std::swap(Add, Const);
43083 
43084     // We might have replaced the constant in the cmov with the LHS of the
43085     // compare. If so change it to the RHS of the compare.
43086     if (Const == Cond.getOperand(0))
43087       Const = Cond.getOperand(1);
43088 
43089     // Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant.
43090     if (isa<ConstantSDNode>(Const) && Add.getOpcode() == ISD::ADD &&
43091         Add.hasOneUse() && isa<ConstantSDNode>(Add.getOperand(1)) &&
43092         (Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
43093          Add.getOperand(0).getOpcode() == ISD::CTTZ) &&
43094         Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) {
43095       EVT VT = N->getValueType(0);
43096       // This should constant fold.
43097       SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1));
43098       SDValue CMov =
43099           DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0),
43100                       DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8), Cond);
43101       return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1));
43102     }
43103   }
43104 
43105   return SDValue();
43106 }
43107 
43108 /// Different mul shrinking modes.
43109 enum class ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
43110 
static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
43112   EVT VT = N->getOperand(0).getValueType();
43113   if (VT.getScalarSizeInBits() != 32)
43114     return false;
43115 
43116   assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
43117   unsigned SignBits[2] = {1, 1};
43118   bool IsPositive[2] = {false, false};
43119   for (unsigned i = 0; i < 2; i++) {
43120     SDValue Opd = N->getOperand(i);
43121 
43122     SignBits[i] = DAG.ComputeNumSignBits(Opd);
43123     IsPositive[i] = DAG.SignBitIsZero(Opd);
43124   }
43125 
43126   bool AllPositive = IsPositive[0] && IsPositive[1];
43127   unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
43128   // When ranges are from -128 ~ 127, use MULS8 mode.
43129   if (MinSignBits >= 25)
43130     Mode = ShrinkMode::MULS8;
43131   // When ranges are from 0 ~ 255, use MULU8 mode.
43132   else if (AllPositive && MinSignBits >= 24)
43133     Mode = ShrinkMode::MULU8;
43134   // When ranges are from -32768 ~ 32767, use MULS16 mode.
43135   else if (MinSignBits >= 17)
43136     Mode = ShrinkMode::MULS16;
43137   // When ranges are from 0 ~ 65535, use MULU16 mode.
43138   else if (AllPositive && MinSignBits >= 16)
43139     Mode = ShrinkMode::MULU16;
43140   else
43141     return false;
43142   return true;
43143 }
43144 
/// When the operands of vector mul are extended from smaller size values,
/// like i8 and i16, the type of mul may be shrunk to generate more
/// efficient code. Two typical patterns are handled:
43148 /// Pattern1:
43149 ///     %2 = sext/zext <N x i8> %1 to <N x i32>
43150 ///     %4 = sext/zext <N x i8> %3 to <N x i32>
///  or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
43152 ///     %5 = mul <N x i32> %2, %4
43153 ///
43154 /// Pattern2:
43155 ///     %2 = zext/sext <N x i16> %1 to <N x i32>
43156 ///     %4 = zext/sext <N x i16> %3 to <N x i32>
43157 ///  or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
43158 ///     %5 = mul <N x i32> %2, %4
43159 ///
43160 /// There are four mul shrinking modes:
/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
/// -128 to 127, and the scalar value range of %4 is also -128 to 127,
43163 /// generate pmullw+sext32 for it (MULS8 mode).
43164 /// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
43165 /// 0 to 255, and the scalar value range of %4 is also 0 to 255,
43166 /// generate pmullw+zext32 for it (MULU8 mode).
43167 /// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
43168 /// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
43169 /// generate pmullw+pmulhw for it (MULS16 mode).
43170 /// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
43171 /// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
43172 /// generate pmullw+pmulhuw for it (MULU16 mode).
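///
/// For example (MULU16, illustrative):
///     %2 = zext <8 x i16> %x to <8 x i32>
///     %4 = zext <8 x i16> %y to <8 x i32>
///     %5 = mul <8 x i32> %2, %4
/// becomes pmullw(%x, %y) plus pmulhuw(%x, %y), whose lanes are interleaved
/// with punpcklwd/punpckhwd to rebuild the <8 x i32> result.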
static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget) {
  // Check for legality
  // pmullw/pmulhw are not available before SSE2.
43177   if (!Subtarget.hasSSE2())
43178     return SDValue();
43179 
43180   // Check for profitability
  // pmulld is supported since SSE4.1. It is better to use pmulld
  // instead of pmullw+pmulhw, except for subtargets where pmulld is slower
  // than the expansion.
43184   bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize();
43185   if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
43186     return SDValue();
43187 
43188   ShrinkMode Mode;
43189   if (!canReduceVMulWidth(N, DAG, Mode))
43190     return SDValue();
43191 
43192   SDLoc DL(N);
43193   SDValue N0 = N->getOperand(0);
43194   SDValue N1 = N->getOperand(1);
43195   EVT VT = N->getOperand(0).getValueType();
43196   unsigned NumElts = VT.getVectorNumElements();
43197   if ((NumElts % 2) != 0)
43198     return SDValue();
43199 
43200   EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
43201 
43202   // Shrink the operands of mul.
43203   SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
43204   SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
43205 
43206   // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
43207   // lower part is needed.
43208   SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
43209   if (Mode == ShrinkMode::MULU8 || Mode == ShrinkMode::MULS8)
43210     return DAG.getNode((Mode == ShrinkMode::MULU8) ? ISD::ZERO_EXTEND
43211                                                    : ISD::SIGN_EXTEND,
43212                        DL, VT, MulLo);
43213 
43214   EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts / 2);
43215   // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
43216   // the higher part is also needed.
43217   SDValue MulHi =
43218       DAG.getNode(Mode == ShrinkMode::MULS16 ? ISD::MULHS : ISD::MULHU, DL,
43219                   ReducedVT, NewN0, NewN1);
43220 
43221   // Repack the lower part and higher part result of mul into a wider
43222   // result.
43223   // Generate shuffle functioning as punpcklwd.
43224   SmallVector<int, 16> ShuffleMask(NumElts);
43225   for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
43226     ShuffleMask[2 * i] = i;
43227     ShuffleMask[2 * i + 1] = i + NumElts;
43228   }
43229   SDValue ResLo =
43230       DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
43231   ResLo = DAG.getBitcast(ResVT, ResLo);
43232   // Generate shuffle functioning as punpckhwd.
43233   for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
43234     ShuffleMask[2 * i] = i + NumElts / 2;
43235     ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
43236   }
43237   SDValue ResHi =
43238       DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
43239   ResHi = DAG.getBitcast(ResVT, ResHi);
43240   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
43241 }
43242 
static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
                                 EVT VT, const SDLoc &DL) {
43245 
43246   auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
43247     SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
43248                                  DAG.getConstant(Mult, DL, VT));
43249     Result = DAG.getNode(ISD::SHL, DL, VT, Result,
43250                          DAG.getConstant(Shift, DL, MVT::i8));
43251     Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
43252                          N->getOperand(0));
43253     return Result;
43254   };
43255 
43256   auto combineMulMulAddOrSub = [&](int Mul1, int Mul2, bool isAdd) {
43257     SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
43258                                  DAG.getConstant(Mul1, DL, VT));
43259     Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, Result,
43260                          DAG.getConstant(Mul2, DL, VT));
43261     Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
43262                          N->getOperand(0));
43263     return Result;
43264   };
43265 
43266   switch (MulAmt) {
43267   default:
43268     break;
43269   case 11:
43270     // mul x, 11 => add ((shl (mul x, 5), 1), x)
43271     return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
43272   case 21:
43273     // mul x, 21 => add ((shl (mul x, 5), 2), x)
43274     return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
43275   case 41:
43276     // mul x, 41 => add ((shl (mul x, 5), 3), x)
43277     return combineMulShlAddOrSub(5, 3, /*isAdd*/ true);
43278   case 22:
43279     // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
43280     return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
43281                        combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
43282   case 19:
43283     // mul x, 19 => add ((shl (mul x, 9), 1), x)
43284     return combineMulShlAddOrSub(9, 1, /*isAdd*/ true);
43285   case 37:
43286     // mul x, 37 => add ((shl (mul x, 9), 2), x)
43287     return combineMulShlAddOrSub(9, 2, /*isAdd*/ true);
43288   case 73:
43289     // mul x, 73 => add ((shl (mul x, 9), 3), x)
43290     return combineMulShlAddOrSub(9, 3, /*isAdd*/ true);
43291   case 13:
43292     // mul x, 13 => add ((shl (mul x, 3), 2), x)
43293     return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
43294   case 23:
43295     // mul x, 23 => sub ((shl (mul x, 3), 3), x)
43296     return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
43297   case 26:
43298     // mul x, 26 => add ((mul (mul x, 5), 5), x)
43299     return combineMulMulAddOrSub(5, 5, /*isAdd*/ true);
43300   case 28:
43301     // mul x, 28 => add ((mul (mul x, 9), 3), x)
43302     return combineMulMulAddOrSub(9, 3, /*isAdd*/ true);
43303   case 29:
43304     // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
43305     return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
43306                        combineMulMulAddOrSub(9, 3, /*isAdd*/ true));
43307   }
43308 
  // Another trick. If this is a power of 2 plus 2, 4 or 8, we can use a shift
  // followed by a single LEA.
  // First check if this is a sum of two powers of 2 because that's easy. Then
  // count the trailing zeros to find the smaller power of 2 (the LEA scale).
  // TODO: We can do this even without LEA at a cost of two shifts and an add.
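  // For example (illustrative): mul x, 20 -> add (shl x, 4), (shl x, 2),
  // where the scale-by-4 addend and the final add map onto a single LEA.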
43314   if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
43315     unsigned ScaleShift = countTrailingZeros(MulAmt);
43316     if (ScaleShift >= 1 && ScaleShift < 4) {
43317       unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
43318       SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
43319                                    DAG.getConstant(ShiftAmt, DL, MVT::i8));
43320       SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
43321                                    DAG.getConstant(ScaleShift, DL, MVT::i8));
43322       return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2);
43323     }
43324   }
43325 
43326   return SDValue();
43327 }
43328 
43329 // If the upper 17 bits of each element are zero then we can use PMADDWD,
43330 // which is always at least as quick as PMULLD, except on KNL.
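// For example (illustrative): a v8i32 multiply whose operands are known to
// fit in 15 bits can be bitcast to v16i16 and lowered as VPMADDWD, since the
// odd i16 lanes are zero and each i32 lane accumulates a single product.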
static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
                                   const X86Subtarget &Subtarget) {
43333   if (!Subtarget.hasSSE2())
43334     return SDValue();
43335 
43336   if (Subtarget.isPMADDWDSlow())
43337     return SDValue();
43338 
43339   EVT VT = N->getValueType(0);
43340 
43341   // Only support vXi32 vectors.
43342   if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
43343     return SDValue();
43344 
43345   // Make sure the type is legal or will be widened to a legal type.
43346   if (VT != MVT::v2i32 && !DAG.getTargetLoweringInfo().isTypeLegal(VT))
43347     return SDValue();
43348 
43349   MVT WVT = MVT::getVectorVT(MVT::i16, 2 * VT.getVectorNumElements());
43350 
43351   // Without BWI, we would need to split v32i16.
43352   if (WVT == MVT::v32i16 && !Subtarget.hasBWI())
43353     return SDValue();
43354 
43355   SDValue N0 = N->getOperand(0);
43356   SDValue N1 = N->getOperand(1);
43357 
  // If we are zero extending two steps without SSE4.1, it's better to reduce
  // the vmul width instead.
43360   if (!Subtarget.hasSSE41() &&
43361       (N0.getOpcode() == ISD::ZERO_EXTEND &&
43362        N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
43363       (N1.getOpcode() == ISD::ZERO_EXTEND &&
43364        N1.getOperand(0).getScalarValueSizeInBits() <= 8))
43365     return SDValue();
43366 
43367   APInt Mask17 = APInt::getHighBitsSet(32, 17);
43368   if (!DAG.MaskedValueIsZero(N1, Mask17) ||
43369       !DAG.MaskedValueIsZero(N0, Mask17))
43370     return SDValue();
43371 
43372   // Use SplitOpsAndApply to handle AVX splitting.
43373   auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
43374                            ArrayRef<SDValue> Ops) {
43375     MVT OpVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
43376     return DAG.getNode(X86ISD::VPMADDWD, DL, OpVT, Ops);
43377   };
43378   return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
43379                           { DAG.getBitcast(WVT, N0), DAG.getBitcast(WVT, N1) },
43380                           PMADDWDBuilder);
43381 }
43382 
static SDValue combineMulToPMULDQ(SDNode *N, SelectionDAG &DAG,
                                  const X86Subtarget &Subtarget) {
43385   if (!Subtarget.hasSSE2())
43386     return SDValue();
43387 
43388   EVT VT = N->getValueType(0);
43389 
43390   // Only support vXi64 vectors.
43391   if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 ||
43392       VT.getVectorNumElements() < 2 ||
43393       !isPowerOf2_32(VT.getVectorNumElements()))
43394     return SDValue();
43395 
43396   SDValue N0 = N->getOperand(0);
43397   SDValue N1 = N->getOperand(1);
43398 
  // PMULDQ returns the 64-bit result of the signed multiplication of the lower
  // 32 bits. We can lower with this if the sign bits stretch that far.
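  // For example (illustrative): if N0 and N1 are each sign_extend from vXi32,
  // they have more than 32 sign bits, so (mul N0, N1) -> (PMULDQ N0, N1).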
43401   if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
43402       DAG.ComputeNumSignBits(N1) > 32) {
43403     auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
43404                             ArrayRef<SDValue> Ops) {
43405       return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops);
43406     };
43407     return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
43408                             PMULDQBuilder, /*CheckBWI*/false);
43409   }
43410 
43411   // If the upper bits are zero we can use a single pmuludq.
43412   APInt Mask = APInt::getHighBitsSet(64, 32);
43413   if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) {
43414     auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
43415                              ArrayRef<SDValue> Ops) {
43416       return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops);
43417     };
43418     return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
43419                             PMULUDQBuilder, /*CheckBWI*/false);
43420   }
43421 
43422   return SDValue();
43423 }
43424 
43425 /// Optimize a single multiply with constant into two operations in order to
43426 /// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
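/// For example (illustrative): mul x, 45 -> (mul_imm (mul_imm x, 9), 5),
/// i.e. two LEAs, and mul x, 17 -> (add (shl x, 4), x).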
static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
                          TargetLowering::DAGCombinerInfo &DCI,
                          const X86Subtarget &Subtarget) {
43430   EVT VT = N->getValueType(0);
43431 
43432   if (SDValue V = combineMulToPMADDWD(N, DAG, Subtarget))
43433     return V;
43434 
43435   if (SDValue V = combineMulToPMULDQ(N, DAG, Subtarget))
43436     return V;
43437 
43438   if (DCI.isBeforeLegalize() && VT.isVector())
43439     return reduceVMULWidth(N, DAG, Subtarget);
43440 
43441   if (!MulConstantOptimization)
43442     return SDValue();
43443   // An imul is usually smaller than the alternative sequence.
43444   if (DAG.getMachineFunction().getFunction().hasMinSize())
43445     return SDValue();
43446 
43447   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
43448     return SDValue();
43449 
43450   if (VT != MVT::i64 && VT != MVT::i32)
43451     return SDValue();
43452 
43453   ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
43454   if (!C)
43455     return SDValue();
43456   if (isPowerOf2_64(C->getZExtValue()))
43457     return SDValue();
43458 
43459   int64_t SignMulAmt = C->getSExtValue();
43460   assert(SignMulAmt != INT64_MIN && "Int min should have been handled!");
43461   uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;
43462 
43463   SDLoc DL(N);
43464   if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) {
43465     SDValue NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
43466                                  DAG.getConstant(AbsMulAmt, DL, VT));
43467     if (SignMulAmt < 0)
43468       NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
43469                            NewMul);
43470 
43471     return NewMul;
43472   }
43473 
43474   uint64_t MulAmt1 = 0;
43475   uint64_t MulAmt2 = 0;
43476   if ((AbsMulAmt % 9) == 0) {
43477     MulAmt1 = 9;
43478     MulAmt2 = AbsMulAmt / 9;
43479   } else if ((AbsMulAmt % 5) == 0) {
43480     MulAmt1 = 5;
43481     MulAmt2 = AbsMulAmt / 5;
43482   } else if ((AbsMulAmt % 3) == 0) {
43483     MulAmt1 = 3;
43484     MulAmt2 = AbsMulAmt / 3;
43485   }
43486 
43487   SDValue NewMul;
43488   // For negative multiply amounts, only allow MulAmt2 to be a power of 2.
43489   if (MulAmt2 &&
43490       (isPowerOf2_64(MulAmt2) ||
43491        (SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) {
43492 
43493     if (isPowerOf2_64(MulAmt2) &&
43494         !(SignMulAmt >= 0 && N->hasOneUse() &&
43495           N->use_begin()->getOpcode() == ISD::ADD))
      // If the second multiplier is pow2, issue it first. We want the multiply
      // by 3, 5, or 9 to be folded into the addressing mode unless the lone
      // use is an add. Only do this for positive multiply amounts since the
      // negate would prevent it from being used as an address mode anyway.
43500       std::swap(MulAmt1, MulAmt2);
43501 
43502     if (isPowerOf2_64(MulAmt1))
43503       NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
43504                            DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
43505     else
43506       NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
43507                            DAG.getConstant(MulAmt1, DL, VT));
43508 
43509     if (isPowerOf2_64(MulAmt2))
43510       NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
43511                            DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
43512     else
43513       NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
43514                            DAG.getConstant(MulAmt2, DL, VT));
43515 
43516     // Negate the result.
43517     if (SignMulAmt < 0)
43518       NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
43519                            NewMul);
43520   } else if (!Subtarget.slowLEA())
43521     NewMul = combineMulSpecial(C->getZExtValue(), N, DAG, VT, DL);
43522 
43523   if (!NewMul) {
43524     assert(C->getZExtValue() != 0 &&
43525            C->getZExtValue() != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&
43526            "Both cases that could cause potential overflows should have "
43527            "already been handled.");
43528     if (isPowerOf2_64(AbsMulAmt - 1)) {
43529       // (mul x, 2^N + 1) => (add (shl x, N), x)
43530       NewMul = DAG.getNode(
43531           ISD::ADD, DL, VT, N->getOperand(0),
43532           DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
43533                       DAG.getConstant(Log2_64(AbsMulAmt - 1), DL,
43534                                       MVT::i8)));
43535       // To negate, subtract the number from zero
43536       if (SignMulAmt < 0)
43537         NewMul = DAG.getNode(ISD::SUB, DL, VT,
43538                              DAG.getConstant(0, DL, VT), NewMul);
43539     } else if (isPowerOf2_64(AbsMulAmt + 1)) {
43540       // (mul x, 2^N - 1) => (sub (shl x, N), x)
43541       NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
43542                            DAG.getConstant(Log2_64(AbsMulAmt + 1),
43543                                            DL, MVT::i8));
43544       // To negate, reverse the operands of the subtract.
43545       if (SignMulAmt < 0)
43546         NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul);
43547       else
43548         NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
43549     } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2)) {
43550       // (mul x, 2^N + 2) => (add (add (shl x, N), x), x)
43551       NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
43552                            DAG.getConstant(Log2_64(AbsMulAmt - 2),
43553                                            DL, MVT::i8));
43554       NewMul = DAG.getNode(ISD::ADD, DL, VT, NewMul, N->getOperand(0));
43555       NewMul = DAG.getNode(ISD::ADD, DL, VT, NewMul, N->getOperand(0));
43556     } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2)) {
43557       // (mul x, 2^N - 2) => (sub (sub (shl x, N), x), x)
43558       NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
43559                            DAG.getConstant(Log2_64(AbsMulAmt + 2),
43560                                            DL, MVT::i8));
43561       NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
43562       NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
43563     }
43564   }
43565 
43566   return NewMul;
43567 }
43568 
43569 // Try to form a MULHU or MULHS node by looking for
43570 // (srl (mul ext, ext), 16)
43571 // TODO: This is X86 specific because we want to be able to handle wide types
43572 // before type legalization. But we can only do it if the vector will be
43573 // legalized via widening/splitting. Type legalization can't handle promotion
43574 // of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
43575 // combiner.
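// For example (illustrative):
//   (srl (mul (zext <8 x i16> X), (zext <8 x i16> Y)), 16)
//     -> (zext (mulhu <8 x i16> X, Y) to <8 x i32>)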
static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG,
                                   const X86Subtarget &Subtarget) {
43578   assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
43579            "SRL or SRA node is required here!");
43580   SDLoc DL(N);
43581 
43582   // Only do this with SSE4.1. On earlier targets reduceVMULWidth will expand
43583   // the multiply.
43584   if (!Subtarget.hasSSE41())
43585     return SDValue();
43586 
43587   // The operation feeding into the shift must be a multiply.
43588   SDValue ShiftOperand = N->getOperand(0);
43589   if (ShiftOperand.getOpcode() != ISD::MUL || !ShiftOperand.hasOneUse())
43590     return SDValue();
43591 
43592   // Input type should be at least vXi32.
43593   EVT VT = N->getValueType(0);
43594   if (!VT.isVector() || VT.getVectorElementType().getSizeInBits() < 32)
43595     return SDValue();
43596 
43597   // Need a shift by 16.
43598   APInt ShiftAmt;
43599   if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), ShiftAmt) ||
43600       ShiftAmt != 16)
43601     return SDValue();
43602 
43603   SDValue LHS = ShiftOperand.getOperand(0);
43604   SDValue RHS = ShiftOperand.getOperand(1);
43605 
43606   unsigned ExtOpc = LHS.getOpcode();
43607   if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
43608       RHS.getOpcode() != ExtOpc)
43609     return SDValue();
43610 
43611   // Peek through the extends.
43612   LHS = LHS.getOperand(0);
43613   RHS = RHS.getOperand(0);
43614 
43615   // Ensure the input types match.
43616   EVT MulVT = LHS.getValueType();
43617   if (MulVT.getVectorElementType() != MVT::i16 || RHS.getValueType() != MulVT)
43618     return SDValue();
43619 
43620   unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
43621   SDValue Mulh = DAG.getNode(Opc, DL, MulVT, LHS, RHS);
43622 
43623   ExtOpc = N->getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
43624   return DAG.getNode(ExtOpc, DL, VT, Mulh);
43625 }
43626 
static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
43628   SDValue N0 = N->getOperand(0);
43629   SDValue N1 = N->getOperand(1);
43630   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
43631   EVT VT = N0.getValueType();
43632 
43633   // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
43634   // since the result of setcc_c is all zero's or all ones.
43635   if (VT.isInteger() && !VT.isVector() &&
43636       N1C && N0.getOpcode() == ISD::AND &&
43637       N0.getOperand(1).getOpcode() == ISD::Constant) {
43638     SDValue N00 = N0.getOperand(0);
43639     APInt Mask = N0.getConstantOperandAPInt(1);
43640     Mask <<= N1C->getAPIntValue();
43641     bool MaskOK = false;
43642     // We can handle cases concerning bit-widening nodes containing setcc_c if
43643     // we carefully interrogate the mask to make sure we are semantics
43644     // preserving.
43645     // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
43646     // of the underlying setcc_c operation if the setcc_c was zero extended.
43647     // Consider the following example:
43648     //   zext(setcc_c)                 -> i32 0x0000FFFF
43649     //   c1                            -> i32 0x0000FFFF
43650     //   c2                            -> i32 0x00000001
43651     //   (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
43652     //   (and setcc_c, (c1 << c2))     -> i32 0x0000FFFE
43653     if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
43654       MaskOK = true;
43655     } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
43656                N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
43657       MaskOK = true;
43658     } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
43659                 N00.getOpcode() == ISD::ANY_EXTEND) &&
43660                N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
43661       MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
43662     }
43663     if (MaskOK && Mask != 0) {
43664       SDLoc DL(N);
43665       return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
43666     }
43667   }
43668 
  // Hardware support for vector shifts is sparse, which makes us scalarize the
  // vector operations in many cases. Also, on Sandy Bridge ADD is faster than
  // SHL.
43672   // (shl V, 1) -> add V,V
43673   if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
43674     if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
43675       assert(N0.getValueType().isVector() && "Invalid vector shift type");
43676       // We shift all of the values by one. In many cases we do not have
43677       // hardware support for this operation. This is better expressed as an ADD
43678       // of two values.
43679       if (N1SplatC->isOne())
43680         return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
43681     }
43682 
43683   return SDValue();
43684 }
43685 
static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG,
                                           const X86Subtarget &Subtarget) {
43688   SDValue N0 = N->getOperand(0);
43689   SDValue N1 = N->getOperand(1);
43690   EVT VT = N0.getValueType();
43691   unsigned Size = VT.getSizeInBits();
43692 
43693   if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
43694     return V;
43695 
43696   // fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
43697   // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
43698   // into (lshr, (sext (a), SarConst - [56,48,32,24,16]))
43699   // depending on sign of (SarConst - [56,48,32,24,16])
43700 
  // sexts in X86 are MOVs. The MOVs have the same code size
  // as the above SHIFTs (only a shift by 1 has lower code size).
  // However, the MOVs have two advantages over a SHIFT:
  // 1. MOVs can write to a register that differs from the source.
  // 2. MOVs accept memory operands.
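  // For example (illustrative, i32): (sra (shl X, 24), 30)
  //   -> (sra (sign_extend_inreg X, i8), 6)
  // since SarConst (30) minus (32 - 8) leaves a remaining right shift of 6.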
43706 
43707   if (VT.isVector() || N1.getOpcode() != ISD::Constant ||
43708       N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
43709       N0.getOperand(1).getOpcode() != ISD::Constant)
43710     return SDValue();
43711 
43712   SDValue N00 = N0.getOperand(0);
43713   SDValue N01 = N0.getOperand(1);
43714   APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
43715   APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
43716   EVT CVT = N1.getValueType();
43717 
43718   if (SarConst.isNegative())
43719     return SDValue();
43720 
43721   for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
43722     unsigned ShiftSize = SVT.getSizeInBits();
    // Skip types without a corresponding sext/zext and cases where ShlConst
    // is not one of [56,48,32,24,16].
43725     if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
43726       continue;
43727     SDLoc DL(N);
43728     SDValue NN =
43729         DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
43730     SarConst = SarConst - (Size - ShiftSize);
43731     if (SarConst == 0)
43732       return NN;
43733     else if (SarConst.isNegative())
43734       return DAG.getNode(ISD::SHL, DL, VT, NN,
43735                          DAG.getConstant(-SarConst, DL, CVT));
43736     else
43737       return DAG.getNode(ISD::SRA, DL, VT, NN,
43738                          DAG.getConstant(SarConst, DL, CVT));
43739   }
43740   return SDValue();
43741 }
43742 
static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
                                        TargetLowering::DAGCombinerInfo &DCI,
                                        const X86Subtarget &Subtarget) {
43746   SDValue N0 = N->getOperand(0);
43747   SDValue N1 = N->getOperand(1);
43748   EVT VT = N0.getValueType();
43749 
43750   if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
43751     return V;
43752 
43753   // Only do this on the last DAG combine as it can interfere with other
43754   // combines.
43755   if (!DCI.isAfterLegalizeDAG())
43756     return SDValue();
43757 
43758   // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
43759   // TODO: This is a generic DAG combine that became an x86-only combine to
43760   // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
43761   // and-not ('andn').
43762   if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
43763     return SDValue();
43764 
43765   auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
43766   auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
43767   if (!ShiftC || !AndC)
43768     return SDValue();
43769 
  // If we can shrink the constant mask below 8 bits or 32 bits, then this
  // transform should reduce code size. It may also enable secondary transforms
  // from improved known-bits analysis or instruction selection.
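  // For example (illustrative, i32): srl (and X, 0x7F0), 4
  //   --> and (srl X, 4), 0x7F
  // The shifted mask fits in a sign-extended 8-bit immediate, so the AND can
  // use a shorter encoding.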
43773   APInt MaskVal = AndC->getAPIntValue();
43774 
43775   // If this can be matched by a zero extend, don't optimize.
43776   if (MaskVal.isMask()) {
43777     unsigned TO = MaskVal.countTrailingOnes();
43778     if (TO >= 8 && isPowerOf2_32(TO))
43779       return SDValue();
43780   }
43781 
43782   APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
43783   unsigned OldMaskSize = MaskVal.getMinSignedBits();
43784   unsigned NewMaskSize = NewMaskVal.getMinSignedBits();
43785   if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
43786       (OldMaskSize > 32 && NewMaskSize <= 32)) {
43787     // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
43788     SDLoc DL(N);
43789     SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
43790     SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
43791     return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
43792   }
43793   return SDValue();
43794 }
43795 
static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG,
                                         const X86Subtarget &Subtarget) {
43798   unsigned Opcode = N->getOpcode();
43799   assert(isHorizOp(Opcode) && "Unexpected hadd/hsub/pack opcode");
43800 
43801   SDLoc DL(N);
43802   EVT VT = N->getValueType(0);
43803   SDValue N0 = N->getOperand(0);
43804   SDValue N1 = N->getOperand(1);
43805   EVT SrcVT = N0.getValueType();
43806 
43807   SDValue BC0 =
43808       N->isOnlyUserOf(N0.getNode()) ? peekThroughOneUseBitcasts(N0) : N0;
43809   SDValue BC1 =
43810       N->isOnlyUserOf(N1.getNode()) ? peekThroughOneUseBitcasts(N1) : N1;
43811 
43812   // Attempt to fold HOP(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
43813   // to SHUFFLE(HOP(LOSUBVECTOR(X),HISUBVECTOR(X))), this is mainly for
43814   // truncation trees that help us avoid lane crossing shuffles.
43815   // TODO: There's a lot more we can do for PACK/HADD style shuffle combines.
43816   // TODO: We don't handle vXf64 shuffles yet.
43817   if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32 &&
43818       BC0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
43819       BC1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
43820       BC0.getOperand(0) == BC1.getOperand(0) &&
43821       BC0.getOperand(0).getValueType().is256BitVector() &&
43822       BC0.getConstantOperandAPInt(1) == 0 &&
43823       BC1.getConstantOperandAPInt(1) ==
43824           BC0.getValueType().getVectorNumElements()) {
43825     SmallVector<SDValue> ShuffleOps;
43826     SmallVector<int> ShuffleMask, ScaledMask;
43827     SDValue Vec = peekThroughBitcasts(BC0.getOperand(0));
43828     if (getTargetShuffleInputs(Vec, ShuffleOps, ShuffleMask, DAG)) {
43829       resolveTargetShuffleInputsAndMask(ShuffleOps, ShuffleMask);
43830       // To keep the HOP LHS/RHS coherency, we must be able to scale the unary
43831       // shuffle to a v4X64 width - we can probably relax this in the future.
43832       if (!isAnyZero(ShuffleMask) && ShuffleOps.size() == 1 &&
43833           ShuffleOps[0].getValueType().is256BitVector() &&
43834           scaleShuffleElements(ShuffleMask, 4, ScaledMask)) {
43835         SDValue Lo, Hi;
43836         MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
43837         std::tie(Lo, Hi) = DAG.SplitVector(ShuffleOps[0], DL);
43838         Lo = DAG.getBitcast(SrcVT, Lo);
43839         Hi = DAG.getBitcast(SrcVT, Hi);
43840         SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi);
43841         Res = DAG.getBitcast(ShufVT, Res);
43842         Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ScaledMask);
43843         return DAG.getBitcast(VT, Res);
43844       }
43845     }
43846   }
43847 
43848   // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(Z,W)) -> SHUFFLE(HOP()).
43849   if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
43850     // If either/both ops are a shuffle that can scale to v2x64,
43851     // then see if we can perform this as a v4x32 post shuffle.
43852     SmallVector<SDValue> Ops0, Ops1;
43853     SmallVector<int> Mask0, Mask1, ScaledMask0, ScaledMask1;
43854     bool IsShuf0 =
43855         getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
43856         scaleShuffleElements(Mask0, 2, ScaledMask0) &&
43857         all_of(Ops0, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
43858     bool IsShuf1 =
43859         getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
43860         scaleShuffleElements(Mask1, 2, ScaledMask1) &&
43861         all_of(Ops1, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
43862     if (IsShuf0 || IsShuf1) {
43863       if (!IsShuf0) {
43864         Ops0.assign({BC0});
43865         ScaledMask0.assign({0, 1});
43866       }
43867       if (!IsShuf1) {
43868         Ops1.assign({BC1});
43869         ScaledMask1.assign({0, 1});
43870       }
43871 
43872       SDValue LHS, RHS;
43873       int PostShuffle[4] = {-1, -1, -1, -1};
43874       auto FindShuffleOpAndIdx = [&](int M, int &Idx, ArrayRef<SDValue> Ops) {
43875         if (M < 0)
43876           return true;
43877         Idx = M % 2;
43878         SDValue Src = Ops[M / 2];
43879         if (!LHS || LHS == Src) {
43880           LHS = Src;
43881           return true;
43882         }
43883         if (!RHS || RHS == Src) {
43884           Idx += 2;
43885           RHS = Src;
43886           return true;
43887         }
43888         return false;
43889       };
43890       if (FindShuffleOpAndIdx(ScaledMask0[0], PostShuffle[0], Ops0) &&
43891           FindShuffleOpAndIdx(ScaledMask0[1], PostShuffle[1], Ops0) &&
43892           FindShuffleOpAndIdx(ScaledMask1[0], PostShuffle[2], Ops1) &&
43893           FindShuffleOpAndIdx(ScaledMask1[1], PostShuffle[3], Ops1)) {
43894         LHS = DAG.getBitcast(SrcVT, LHS);
43895         RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
43896         MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
43897         SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
43898         Res = DAG.getBitcast(ShufVT, Res);
43899         Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, PostShuffle);
43900         return DAG.getBitcast(VT, Res);
43901       }
43902     }
43903   }
43904 
43905   // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y)).
43906   if (VT.is256BitVector() && Subtarget.hasInt256()) {
43907     SmallVector<int> Mask0, Mask1;
43908     SmallVector<SDValue> Ops0, Ops1;
43909     SmallVector<int, 2> ScaledMask0, ScaledMask1;
43910     if (getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
43911         getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
43912         !Ops0.empty() && !Ops1.empty() &&
43913         all_of(Ops0,
43914                [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
43915         all_of(Ops1,
43916                [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
43917         scaleShuffleElements(Mask0, 2, ScaledMask0) &&
43918         scaleShuffleElements(Mask1, 2, ScaledMask1)) {
43919       SDValue Op00 = peekThroughBitcasts(Ops0.front());
43920       SDValue Op10 = peekThroughBitcasts(Ops1.front());
43921       SDValue Op01 = peekThroughBitcasts(Ops0.back());
43922       SDValue Op11 = peekThroughBitcasts(Ops1.back());
43923       if ((Op00 == Op11) && (Op01 == Op10)) {
43924         std::swap(Op10, Op11);
43925         ShuffleVectorSDNode::commuteMask(ScaledMask1);
43926       }
43927       if ((Op00 == Op10) && (Op01 == Op11)) {
43928         const int Map[4] = {0, 2, 1, 3};
43929         SmallVector<int, 4> ShuffleMask(
43930             {Map[ScaledMask0[0]], Map[ScaledMask1[0]], Map[ScaledMask0[1]],
43931              Map[ScaledMask1[1]]});
43932         MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
43933         SDValue Res = DAG.getNode(Opcode, DL, VT, DAG.getBitcast(SrcVT, Op00),
43934                                   DAG.getBitcast(SrcVT, Op01));
43935         Res = DAG.getBitcast(ShufVT, Res);
43936         Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask);
43937         return DAG.getBitcast(VT, Res);
43938       }
43939     }
43940   }
43941 
43942   return SDValue();
43943 }
43944 
static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const X86Subtarget &Subtarget) {
43948   unsigned Opcode = N->getOpcode();
43949   assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
43950          "Unexpected pack opcode");
43951 
43952   EVT VT = N->getValueType(0);
43953   SDValue N0 = N->getOperand(0);
43954   SDValue N1 = N->getOperand(1);
43955   unsigned NumDstElts = VT.getVectorNumElements();
43956   unsigned DstBitsPerElt = VT.getScalarSizeInBits();
43957   unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
43958   assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
43959          N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
43960          "Unexpected PACKSS/PACKUS input type");
43961 
43962   bool IsSigned = (X86ISD::PACKSS == Opcode);
43963 
43964   // Constant Folding.
43965   APInt UndefElts0, UndefElts1;
43966   SmallVector<APInt, 32> EltBits0, EltBits1;
43967   if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) &&
43968       (N1.isUndef() || N->isOnlyUserOf(N1.getNode())) &&
43969       getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) &&
43970       getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) {
43971     unsigned NumLanes = VT.getSizeInBits() / 128;
43972     unsigned NumSrcElts = NumDstElts / 2;
43973     unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
43974     unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
43975 
43976     APInt Undefs(NumDstElts, 0);
43977     SmallVector<APInt, 32> Bits(NumDstElts, APInt::getNullValue(DstBitsPerElt));
43978     for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
43979       for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
43980         unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
43981         auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
43982         auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);
43983 
43984         if (UndefElts[SrcIdx]) {
43985           Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
43986           continue;
43987         }
43988 
43989         APInt &Val = EltBits[SrcIdx];
43990         if (IsSigned) {
43991           // PACKSS: Truncate signed value with signed saturation.
43992           // Source values less than dst minint are saturated to minint.
43993           // Source values greater than dst maxint are saturated to maxint.
43994           if (Val.isSignedIntN(DstBitsPerElt))
43995             Val = Val.trunc(DstBitsPerElt);
43996           else if (Val.isNegative())
43997             Val = APInt::getSignedMinValue(DstBitsPerElt);
43998           else
43999             Val = APInt::getSignedMaxValue(DstBitsPerElt);
44000         } else {
44001           // PACKUS: Truncate signed value with unsigned saturation.
44002           // Source values less than zero are saturated to zero.
44003           // Source values greater than dst maxuint are saturated to maxuint.
44004           if (Val.isIntN(DstBitsPerElt))
44005             Val = Val.trunc(DstBitsPerElt);
44006           else if (Val.isNegative())
44007             Val = APInt::getNullValue(DstBitsPerElt);
44008           else
44009             Val = APInt::getAllOnesValue(DstBitsPerElt);
44010         }
44011         Bits[Lane * NumDstEltsPerLane + Elt] = Val;
44012       }
44013     }
44014 
44015     return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
44016   }
44017 
44018   // Try to fold PACK(SHUFFLE(),SHUFFLE()) -> SHUFFLE(PACK()).
44019   if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
44020     return V;
44021 
44022   // Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular
44023   // truncate to create a larger truncate.
44024   if (Subtarget.hasAVX512() &&
44025       N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 &&
44026       N0.getOperand(0).getValueType() == MVT::v8i32) {
44027     if ((IsSigned && DAG.ComputeNumSignBits(N0) > 8) ||
44028         (!IsSigned &&
44029          DAG.MaskedValueIsZero(N0, APInt::getHighBitsSet(16, 8)))) {
44030       if (Subtarget.hasVLX())
44031         return DAG.getNode(X86ISD::VTRUNC, SDLoc(N), VT, N0.getOperand(0));
44032 
44033       // Widen input to v16i32 so we can truncate that.
44034       SDLoc dl(N);
44035       SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i32,
44036                                    N0.getOperand(0), DAG.getUNDEF(MVT::v8i32));
44037       return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Concat);
44038     }
44039   }
44040 
44041   // Try to fold PACK(EXTEND(X),EXTEND(Y)) -> CONCAT(X,Y) subvectors.
44042   if (VT.is128BitVector()) {
44043     unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
44044     SDValue Src0, Src1;
44045     if (N0.getOpcode() == ExtOpc &&
44046         N0.getOperand(0).getValueType().is64BitVector() &&
44047         N0.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
44048       Src0 = N0.getOperand(0);
44049     }
44050     if (N1.getOpcode() == ExtOpc &&
44051         N1.getOperand(0).getValueType().is64BitVector() &&
44052         N1.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
44053       Src1 = N1.getOperand(0);
44054     }
44055     if ((Src0 || N0.isUndef()) && (Src1 || N1.isUndef())) {
44056       assert((Src0 || Src1) && "Found PACK(UNDEF,UNDEF)");
44057       Src0 = Src0 ? Src0 : DAG.getUNDEF(Src1.getValueType());
44058       Src1 = Src1 ? Src1 : DAG.getUNDEF(Src0.getValueType());
44059       return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Src0, Src1);
44060     }
44061   }
44062 
44063   // Attempt to combine as shuffle.
44064   SDValue Op(N, 0);
44065   if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
44066     return Res;
44067 
44068   return SDValue();
44069 }
44070 
static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG,
                                    TargetLowering::DAGCombinerInfo &DCI,
                                    const X86Subtarget &Subtarget) {
44074   assert((X86ISD::HADD == N->getOpcode() || X86ISD::FHADD == N->getOpcode() ||
44075           X86ISD::HSUB == N->getOpcode() || X86ISD::FHSUB == N->getOpcode()) &&
44076          "Unexpected horizontal add/sub opcode");
44077 
44078   if (!shouldUseHorizontalOp(true, DAG, Subtarget)) {
44079     MVT VT = N->getSimpleValueType(0);
44080     SDValue LHS = N->getOperand(0);
44081     SDValue RHS = N->getOperand(1);
44082 
44083     // HOP(HOP'(X,X),HOP'(Y,Y)) -> HOP(PERMUTE(HOP'(X,Y)),PERMUTE(HOP'(X,Y)).
44084     if (LHS != RHS && LHS.getOpcode() == N->getOpcode() &&
44085         LHS.getOpcode() == RHS.getOpcode() &&
44086         LHS.getValueType() == RHS.getValueType()) {
44087       SDValue LHS0 = LHS.getOperand(0);
44088       SDValue RHS0 = LHS.getOperand(1);
44089       SDValue LHS1 = RHS.getOperand(0);
44090       SDValue RHS1 = RHS.getOperand(1);
44091       if ((LHS0 == RHS0 || LHS0.isUndef() || RHS0.isUndef()) &&
44092           (LHS1 == RHS1 || LHS1.isUndef() || RHS1.isUndef())) {
44093         SDLoc DL(N);
44094         SDValue Res = DAG.getNode(LHS.getOpcode(), DL, LHS.getValueType(),
44095                                   LHS0.isUndef() ? RHS0 : LHS0,
44096                                   LHS1.isUndef() ? RHS1 : LHS1);
44097         MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
44098         Res = DAG.getBitcast(ShufVT, Res);
44099         SDValue NewLHS =
44100             DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
44101                         getV4X86ShuffleImm8ForMask({0, 1, 0, 1}, DL, DAG));
44102         SDValue NewRHS =
44103             DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
44104                         getV4X86ShuffleImm8ForMask({2, 3, 2, 3}, DL, DAG));
44105         DAG.ReplaceAllUsesOfValueWith(LHS, DAG.getBitcast(VT, NewLHS));
44106         DAG.ReplaceAllUsesOfValueWith(RHS, DAG.getBitcast(VT, NewRHS));
44107         return SDValue(N, 0);
44108       }
44109     }
44110   }
44111 
44112   // Try to fold HOP(SHUFFLE(),SHUFFLE()) -> SHUFFLE(HOP()).
44113   if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
44114     return V;
44115 
44116   return SDValue();
44117 }
44118 
static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     const X86Subtarget &Subtarget) {
44122   assert((X86ISD::VSHL == N->getOpcode() || X86ISD::VSRA == N->getOpcode() ||
44123           X86ISD::VSRL == N->getOpcode()) &&
44124          "Unexpected shift opcode");
44125   EVT VT = N->getValueType(0);
44126   SDValue N0 = N->getOperand(0);
44127   SDValue N1 = N->getOperand(1);
44128 
44129   // Shift zero -> zero.
44130   if (ISD::isBuildVectorAllZeros(N0.getNode()))
44131     return DAG.getConstant(0, SDLoc(N), VT);
44132 
44133   // Detect constant shift amounts.
44134   APInt UndefElts;
44135   SmallVector<APInt, 32> EltBits;
44136   if (getTargetConstantBitsFromNode(N1, 64, UndefElts, EltBits, true, false)) {
44137     unsigned X86Opc = getTargetVShiftUniformOpcode(N->getOpcode(), false);
44138     return getTargetVShiftByConstNode(X86Opc, SDLoc(N), VT.getSimpleVT(), N0,
44139                                       EltBits[0].getZExtValue(), DAG);
44140   }
44141 
44142   APInt KnownUndef, KnownZero;
44143   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44144   APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
44145   if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
44146                                      KnownZero, DCI))
44147     return SDValue(N, 0);
44148 
44149   return SDValue();
44150 }
44151 
44152 static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
44153                                      TargetLowering::DAGCombinerInfo &DCI,
44154                                      const X86Subtarget &Subtarget) {
44155   unsigned Opcode = N->getOpcode();
44156   assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
44157           X86ISD::VSRLI == Opcode) &&
44158          "Unexpected shift opcode");
44159   bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
44160   EVT VT = N->getValueType(0);
44161   SDValue N0 = N->getOperand(0);
44162   unsigned NumBitsPerElt = VT.getScalarSizeInBits();
44163   assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
44164          "Unexpected value type");
44165   assert(N->getOperand(1).getValueType() == MVT::i8 &&
44166          "Unexpected shift amount type");
44167 
44168   // (shift undef, X) -> 0
44169   if (N0.isUndef())
44170     return DAG.getConstant(0, SDLoc(N), VT);
44171 
44172   // Out of range logical bit shifts are guaranteed to be zero.
44173   // Out of range arithmetic bit shifts splat the sign bit.
44174   unsigned ShiftVal = N->getConstantOperandVal(1);
44175   if (ShiftVal >= NumBitsPerElt) {
44176     if (LogicalShift)
44177       return DAG.getConstant(0, SDLoc(N), VT);
44178     ShiftVal = NumBitsPerElt - 1;
44179   }
44180 
44181   // (shift X, 0) -> X
44182   if (!ShiftVal)
44183     return N0;
44184 
44185   // (shift 0, C) -> 0
44186   if (ISD::isBuildVectorAllZeros(N0.getNode()))
44187     // N0 is all zeros or undef. We guarantee that the bits shifted into the
44188     // result are all zeros, not undef.
44189     return DAG.getConstant(0, SDLoc(N), VT);
44190 
44191   // (VSRAI -1, C) -> -1
44192   if (!LogicalShift && ISD::isBuildVectorAllOnes(N0.getNode()))
44193     // N0 is all ones or undef. We guarantee that the bits shifted into the
44194     // result are all ones, not undef.
44195     return DAG.getConstant(-1, SDLoc(N), VT);
44196 
44197   // (shift (shift X, C2), C1) -> (shift X, (C1 + C2))
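  // e.g. (VSRLI (VSRLI X, 3), 2) -> (VSRLI X, 5).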
44198   if (Opcode == N0.getOpcode()) {
44199     unsigned ShiftVal2 = cast<ConstantSDNode>(N0.getOperand(1))->getZExtValue();
44200     unsigned NewShiftVal = ShiftVal + ShiftVal2;
44201     if (NewShiftVal >= NumBitsPerElt) {
44202       // Out of range logical bit shifts are guaranteed to be zero.
44203       // Out of range arithmetic bit shifts splat the sign bit.
44204       if (LogicalShift)
44205         return DAG.getConstant(0, SDLoc(N), VT);
44206       NewShiftVal = NumBitsPerElt - 1;
44207     }
44208     return DAG.getNode(Opcode, SDLoc(N), VT, N0.getOperand(0),
44209                        DAG.getTargetConstant(NewShiftVal, SDLoc(N), MVT::i8));
44210   }
44211 
44212   // We can decode 'whole byte' logical bit shifts as shuffles.
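  // e.g. a v2i64 VSHLI by 16 moves each 64-bit lane's bytes up by two positions
  // (zero-filling the low bytes), which can be expressed as a byte shuffle.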
44213   if (LogicalShift && (ShiftVal % 8) == 0) {
44214     SDValue Op(N, 0);
44215     if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
44216       return Res;
44217   }
44218 
44219   // Constant Folding.
44220   APInt UndefElts;
44221   SmallVector<APInt, 32> EltBits;
44222   if (N->isOnlyUserOf(N0.getNode()) &&
44223       getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) {
44224     assert(EltBits.size() == VT.getVectorNumElements() &&
44225            "Unexpected shift value type");
44226     // Undef elements need to fold to 0. It's possible SimplifyDemandedBits
44227     // created an undef input due to no input bits being demanded, but user
44228     // still expects 0 in other bits.
44229     for (unsigned i = 0, e = EltBits.size(); i != e; ++i) {
44230       APInt &Elt = EltBits[i];
44231       if (UndefElts[i])
44232         Elt = 0;
44233       else if (X86ISD::VSHLI == Opcode)
44234         Elt <<= ShiftVal;
44235       else if (X86ISD::VSRAI == Opcode)
44236         Elt.ashrInPlace(ShiftVal);
44237       else
44238         Elt.lshrInPlace(ShiftVal);
44239     }
44240     // Reset undef elements since they were zeroed above.
44241     UndefElts = 0;
44242     return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
44243   }
44244 
44245   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44246   if (TLI.SimplifyDemandedBits(SDValue(N, 0),
44247                                APInt::getAllOnesValue(NumBitsPerElt), DCI))
44248     return SDValue(N, 0);
44249 
44250   return SDValue();
44251 }
44252 
44253 static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
44254                                    TargetLowering::DAGCombinerInfo &DCI,
44255                                    const X86Subtarget &Subtarget) {
44256   EVT VT = N->getValueType(0);
44257   assert(((N->getOpcode() == X86ISD::PINSRB && VT == MVT::v16i8) ||
44258           (N->getOpcode() == X86ISD::PINSRW && VT == MVT::v8i16) ||
44259           N->getOpcode() == ISD::INSERT_VECTOR_ELT) &&
44260          "Unexpected vector insertion");
44261 
44262   if (N->getOpcode() == X86ISD::PINSRB || N->getOpcode() == X86ISD::PINSRW) {
44263     unsigned NumBitsPerElt = VT.getScalarSizeInBits();
44264     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44265     if (TLI.SimplifyDemandedBits(SDValue(N, 0),
44266                                  APInt::getAllOnesValue(NumBitsPerElt), DCI))
44267       return SDValue(N, 0);
44268   }
44269 
44270   // Attempt to combine insertion patterns to a shuffle.
44271   if (VT.isSimple() && DCI.isAfterLegalizeDAG()) {
44272     SDValue Op(N, 0);
44273     if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
44274       return Res;
44275   }
44276 
44277   return SDValue();
44278 }
44279 
44280 /// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
44281 /// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
44282 /// OR -> CMPNEQSS.
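/// For example (illustrative), scalar "a == b" is lowered to
/// AND(SETCC(E, FCMP a, b), SETCC(NP, FCMP a, b)); a single CMPEQSS already
/// computes the ordered-equal result, so its low bit can be used directly.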
44283 static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
44284                                    TargetLowering::DAGCombinerInfo &DCI,
44285                                    const X86Subtarget &Subtarget) {
44286   unsigned opcode;
44287 
44288   // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
44289   // we're requiring SSE2 for both.
44290   if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
44291     SDValue N0 = N->getOperand(0);
44292     SDValue N1 = N->getOperand(1);
44293     SDValue CMP0 = N0.getOperand(1);
44294     SDValue CMP1 = N1.getOperand(1);
44295     SDLoc DL(N);
44296 
44297     // The SETCCs should both refer to the same CMP.
44298     if (CMP0.getOpcode() != X86ISD::FCMP || CMP0 != CMP1)
44299       return SDValue();
44300 
44301     SDValue CMP00 = CMP0->getOperand(0);
44302     SDValue CMP01 = CMP0->getOperand(1);
44303     EVT     VT    = CMP00.getValueType();
44304 
44305     if (VT == MVT::f32 || VT == MVT::f64) {
44306       bool ExpectingFlags = false;
44307       // Check for any users that want flags:
44308       for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
44309            !ExpectingFlags && UI != UE; ++UI)
44310         switch (UI->getOpcode()) {
44311         default:
44312         case ISD::BR_CC:
44313         case ISD::BRCOND:
44314         case ISD::SELECT:
44315           ExpectingFlags = true;
44316           break;
44317         case ISD::CopyToReg:
44318         case ISD::SIGN_EXTEND:
44319         case ISD::ZERO_EXTEND:
44320         case ISD::ANY_EXTEND:
44321           break;
44322         }
44323 
44324       if (!ExpectingFlags) {
44325         enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
44326         enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
44327 
44328         if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
44329           X86::CondCode tmp = cc0;
44330           cc0 = cc1;
44331           cc1 = tmp;
44332         }
44333 
44334         if ((cc0 == X86::COND_E  && cc1 == X86::COND_NP) ||
44335             (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
44336           // FIXME: need symbolic constants for these magic numbers.
44337           // See X86ATTInstPrinter.cpp:printSSECC().
44338           unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
44339           if (Subtarget.hasAVX512()) {
44340             SDValue FSetCC =
44341                 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
44342                             DAG.getTargetConstant(x86cc, DL, MVT::i8));
44343             // Need to fill with zeros to ensure the bitcast will produce zeroes
44344             // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
44345             SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
44346                                       DAG.getConstant(0, DL, MVT::v16i1),
44347                                       FSetCC, DAG.getIntPtrConstant(0, DL));
44348             return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
44349                                       N->getSimpleValueType(0));
44350           }
44351           SDValue OnesOrZeroesF =
44352               DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00,
44353                           CMP01, DAG.getTargetConstant(x86cc, DL, MVT::i8));
44354 
44355           bool is64BitFP = (CMP00.getValueType() == MVT::f64);
44356           MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
44357 
44358           if (is64BitFP && !Subtarget.is64Bit()) {
44359             // On a 32-bit target, we cannot bitcast the 64-bit float to a
44360             // 64-bit integer, since that's not a legal type. Since
44361             // OnesOrZeroesF is all ones or all zeroes; we don't need all the
44362             // bits, but can do this little dance to extract the lowest 32 bits
44363             // and work with those going forward.
44364             SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
44365                                            OnesOrZeroesF);
44366             SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
44367             OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
44368                                         Vector32, DAG.getIntPtrConstant(0, DL));
44369             IntVT = MVT::i32;
44370           }
44371 
44372           SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
44373           SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
44374                                       DAG.getConstant(1, DL, IntVT));
44375           SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
44376                                               ANDed);
44377           return OneBitOfTruth;
44378         }
44379       }
44380     }
44381   }
44382   return SDValue();
44383 }
44384 
44385 /// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
44386 static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
44387   assert(N->getOpcode() == ISD::AND);
44388 
44389   MVT VT = N->getSimpleValueType(0);
44390   if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
44391     return SDValue();
44392 
44393   SDValue X, Y;
44394   SDValue N0 = N->getOperand(0);
44395   SDValue N1 = N->getOperand(1);
44396 
44397   auto GetNot = [&VT, &DAG](SDValue V) {
44398     // Basic X = NOT(Y) detection.
44399     if (SDValue Not = IsNOT(V, DAG))
44400       return Not;
44401     // Fold BROADCAST(NOT(Y)) -> BROADCAST(Y).
44402     if (V.getOpcode() == X86ISD::VBROADCAST) {
44403       SDValue Src = V.getOperand(0);
44404       EVT SrcVT = Src.getValueType();
44405       if (!SrcVT.isVector())
44406         return SDValue();
44407       if (SDValue Not = IsNOT(Src, DAG))
44408         return DAG.getNode(X86ISD::VBROADCAST, SDLoc(V), VT,
44409                            DAG.getBitcast(SrcVT, Not));
44410     }
44411     return SDValue();
44412   };
44413 
44414   if (SDValue Not = GetNot(N0)) {
44415     X = Not;
44416     Y = N1;
44417   } else if (SDValue Not = GetNot(N1)) {
44418     X = Not;
44419     Y = N0;
44420   } else
44421     return SDValue();
44422 
44423   X = DAG.getBitcast(VT, X);
44424   Y = DAG.getBitcast(VT, Y);
44425   return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y);
44426 }
44427 
44428 // Try to widen AND, OR and XOR nodes to VT in order to remove casts around
44429 // logical operations, like in the example below.
44430 //   or (and (truncate x, truncate y)),
44431 //      (xor (truncate z, build_vector (constants)))
44432 // Given a target type \p VT, we generate
44433 //   or (and x, y), (xor z, zext(build_vector (constants)))
44434 // given that x, y and z are of type \p VT. We can do so if each operand is
44435 // either a truncate from VT or can be recursively promoted, or if the second
44436 // operand is a vector of constants.
44437 static SDValue PromoteMaskArithmetic(SDNode *N, EVT VT, SelectionDAG &DAG,
44438                                      unsigned Depth) {
44439   // Limit recursion to avoid excessive compile times.
44440   if (Depth >= SelectionDAG::MaxRecursionDepth)
44441     return SDValue();
44442 
44443   if (N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND &&
44444       N->getOpcode() != ISD::OR)
44445     return SDValue();
44446 
44447   SDValue N0 = N->getOperand(0);
44448   SDValue N1 = N->getOperand(1);
44449   SDLoc DL(N);
44450 
44451   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44452   if (!TLI.isOperationLegalOrPromote(N->getOpcode(), VT))
44453     return SDValue();
44454 
44455   if (SDValue NN0 = PromoteMaskArithmetic(N0.getNode(), VT, DAG, Depth + 1))
44456     N0 = NN0;
44457   else {
44458     // The left side has to be a trunc.
44459     if (N0.getOpcode() != ISD::TRUNCATE)
44460       return SDValue();
44461 
44462     // The type of the truncated inputs.
44463     if (N0.getOperand(0).getValueType() != VT)
44464       return SDValue();
44465 
44466     N0 = N0.getOperand(0);
44467   }
44468 
44469   if (SDValue NN1 = PromoteMaskArithmetic(N1.getNode(), VT, DAG, Depth + 1))
44470     N1 = NN1;
44471   else {
44472     // The right side has to be a 'trunc' or a constant vector.
44473     bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
44474                     N1.getOperand(0).getValueType() == VT;
44475     if (!RHSTrunc && !ISD::isBuildVectorOfConstantSDNodes(N1.getNode()))
44476       return SDValue();
44477 
44478     if (RHSTrunc)
44479       N1 = N1.getOperand(0);
44480     else
44481       N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1);
44482   }
44483 
44484   return DAG.getNode(N->getOpcode(), DL, VT, N0, N1);
44485 }
44486 
44487 // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
44488 // register. In most cases we actually compare or select YMM-sized registers
44489 // and mixing the two types creates horrible code. This method optimizes
44490 // some of the transition sequences.
44491 // Even with AVX-512 this is still useful for removing casts around logical
44492 // operations on vXi1 mask types.
44493 static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG,
44494                                      const X86Subtarget &Subtarget) {
44495   EVT VT = N->getValueType(0);
44496   assert(VT.isVector() && "Expected vector type");
44497 
44498   SDLoc DL(N);
44499   assert((N->getOpcode() == ISD::ANY_EXTEND ||
44500           N->getOpcode() == ISD::ZERO_EXTEND ||
44501           N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
44502 
44503   SDValue Narrow = N->getOperand(0);
44504   EVT NarrowVT = Narrow.getValueType();
44505 
44506   // Generate the wide operation.
44507   SDValue Op = PromoteMaskArithmetic(Narrow.getNode(), VT, DAG, 0);
44508   if (!Op)
44509     return SDValue();
44510   switch (N->getOpcode()) {
44511   default: llvm_unreachable("Unexpected opcode");
44512   case ISD::ANY_EXTEND:
44513     return Op;
44514   case ISD::ZERO_EXTEND:
44515     return DAG.getZeroExtendInReg(Op, DL, NarrowVT);
44516   case ISD::SIGN_EXTEND:
44517     return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
44518                        Op, DAG.getValueType(NarrowVT));
44519   }
44520 }
44521 
44522 static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode) {
44523   unsigned FPOpcode;
44524   switch (Opcode) {
44525   default: llvm_unreachable("Unexpected input node for FP logic conversion");
44526   case ISD::AND: FPOpcode = X86ISD::FAND; break;
44527   case ISD::OR:  FPOpcode = X86ISD::FOR;  break;
44528   case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
44529   }
44530   return FPOpcode;
44531 }
44532 
44533 /// If both input operands of a logic op are being cast from floating point
44534 /// types, try to convert this into a floating point logic node to avoid
44535 /// unnecessary moves from SSE to integer registers.
44536 static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
44537                                         const X86Subtarget &Subtarget) {
44538   EVT VT = N->getValueType(0);
44539   SDValue N0 = N->getOperand(0);
44540   SDValue N1 = N->getOperand(1);
44541   SDLoc DL(N);
44542 
44543   if (N0.getOpcode() != ISD::BITCAST || N1.getOpcode() != ISD::BITCAST)
44544     return SDValue();
44545 
44546   SDValue N00 = N0.getOperand(0);
44547   SDValue N10 = N1.getOperand(0);
44548   EVT N00Type = N00.getValueType();
44549   EVT N10Type = N10.getValueType();
44550 
44551   // Ensure that both types are the same and are legal scalar fp types.
44552   if (N00Type != N10Type ||
44553       !((Subtarget.hasSSE1() && N00Type == MVT::f32) ||
44554         (Subtarget.hasSSE2() && N00Type == MVT::f64)))
44555     return SDValue();
44556 
44557   unsigned FPOpcode = convertIntLogicToFPLogicOpcode(N->getOpcode());
44558   SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
44559   return DAG.getBitcast(VT, FPLogic);
44560 }
44561 
44562 // Attempt to fold BITOP(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(BITOP(X,Y))
44563 // to reduce XMM->GPR traffic.
44564 static SDValue combineBitOpWithMOVMSK(SDNode *N, SelectionDAG &DAG) {
44565   unsigned Opc = N->getOpcode();
44566   assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
44567          "Unexpected bit opcode");
44568 
44569   SDValue N0 = N->getOperand(0);
44570   SDValue N1 = N->getOperand(1);
44571 
44572   // Both operands must be single use MOVMSK.
44573   if (N0.getOpcode() != X86ISD::MOVMSK || !N0.hasOneUse() ||
44574       N1.getOpcode() != X86ISD::MOVMSK || !N1.hasOneUse())
44575     return SDValue();
44576 
44577   SDValue Vec0 = N0.getOperand(0);
44578   SDValue Vec1 = N1.getOperand(0);
44579   EVT VecVT0 = Vec0.getValueType();
44580   EVT VecVT1 = Vec1.getValueType();
44581 
44582   // Both MOVMSK operands must be from vectors of the same size and same element
44583   // size, but it's OK for an fp/int difference.
44584   if (VecVT0.getSizeInBits() != VecVT1.getSizeInBits() ||
44585       VecVT0.getScalarSizeInBits() != VecVT1.getScalarSizeInBits())
44586     return SDValue();
44587 
44588   SDLoc DL(N);
44589   unsigned VecOpc =
44590       VecVT0.isFloatingPoint() ? convertIntLogicToFPLogicOpcode(Opc) : Opc;
44591   SDValue Result =
44592       DAG.getNode(VecOpc, DL, VecVT0, Vec0, DAG.getBitcast(VecVT0, Vec1));
44593   return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
44594 }
44595 
44596 /// If this is a zero/all-bits result that is bitwise-anded with a low-bits
44597 /// mask (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
44598 /// with a shift-right to eliminate loading the vector constant mask value.
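/// e.g. for v4i32, (and X, splat(1)) where every element of X is known to be
/// 0 or -1 becomes (VSRLI X, 31), avoiding a load of the <1,1,1,1> constant.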
44599 static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
44600                                      const X86Subtarget &Subtarget) {
44601   SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
44602   SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
44603   EVT VT0 = Op0.getValueType();
44604   EVT VT1 = Op1.getValueType();
44605 
44606   if (VT0 != VT1 || !VT0.isSimple() || !VT0.isInteger())
44607     return SDValue();
44608 
44609   APInt SplatVal;
44610   if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) ||
44611       !SplatVal.isMask())
44612     return SDValue();
44613 
44614   // Don't prevent creation of ANDN.
44615   if (isBitwiseNot(Op0))
44616     return SDValue();
44617 
44618   if (!SupportedVectorShiftWithImm(VT0.getSimpleVT(), Subtarget, ISD::SRL))
44619     return SDValue();
44620 
44621   unsigned EltBitWidth = VT0.getScalarSizeInBits();
44622   if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
44623     return SDValue();
44624 
44625   SDLoc DL(N);
44626   unsigned ShiftVal = SplatVal.countTrailingOnes();
44627   SDValue ShAmt = DAG.getTargetConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
44628   SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
44629   return DAG.getBitcast(N->getValueType(0), Shift);
44630 }
44631 
44632 // Get the index node from the lowered DAG of a GEP IR instruction with one
44633 // indexing dimension.
44634 static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
44635   if (Ld->isIndexed())
44636     return SDValue();
44637 
44638   SDValue Base = Ld->getBasePtr();
44639 
44640   if (Base.getOpcode() != ISD::ADD)
44641     return SDValue();
44642 
44643   SDValue ShiftedIndex = Base.getOperand(0);
44644 
44645   if (ShiftedIndex.getOpcode() != ISD::SHL)
44646     return SDValue();
44647 
44648   return ShiftedIndex.getOperand(0);
44649 
44650 }
44651 
44652 static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
44653   if (Subtarget.hasBMI2() && VT.isScalarInteger()) {
44654     switch (VT.getSizeInBits()) {
44655     default: return false;
44656     case 64: return Subtarget.is64Bit();
44657     case 32: return true;
44658     }
44659   }
44660   return false;
44661 }
44662 
44663 // This function recognizes cases where the X86 BZHI instruction can replace an
44664 // 'and-load' sequence.
44665 // When an integer value is loaded from an array of constants defined as
44666 // follows:
44667 //
44668 //   int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF, ..., 2^(SIZE-1) - 1}
44669 //
44670 // and a bitwise AND is then applied between the loaded value and another input,
44671 // the result is equivalent to performing BZHI (zero high bits) on that input
44672 // with the same index used for the load.
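//
// Illustrative C-level source for the 32-bit case (assuming the table follows
// the 2^j - 1 pattern above):
//
//   static const int Mask[32] = {0x0, 0x1, 0x3, 0x7, /* ... */};
//   int f(int x, unsigned idx) { return x & Mask[idx]; }
//
// Here x & Mask[idx] == x & ((1u << idx) - 1), which is BZHI(x, idx), so the
// load and AND collapse to a single instruction.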
44673 static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
44674                                     const X86Subtarget &Subtarget) {
44675   MVT VT = Node->getSimpleValueType(0);
44676   SDLoc dl(Node);
44677 
44678   // Check if subtarget has BZHI instruction for the node's type
44679   if (!hasBZHI(Subtarget, VT))
44680     return SDValue();
44681 
44682   // Try matching the pattern for both operands.
44683   for (unsigned i = 0; i < 2; i++) {
44684     SDValue N = Node->getOperand(i);
44685     LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode());
44686 
44687     // Bail out if the operand is not a load instruction.
44688     if (!Ld)
44689       return SDValue();
44690 
44691     const Value *MemOp = Ld->getMemOperand()->getValue();
44692 
44693     if (!MemOp)
44694       return SDValue();
44695 
44696     if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
44697       if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
44698         if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
44699 
44700           Constant *Init = GV->getInitializer();
44701           Type *Ty = Init->getType();
44702           if (!isa<ConstantDataArray>(Init) ||
44703               !Ty->getArrayElementType()->isIntegerTy() ||
44704               Ty->getArrayElementType()->getScalarSizeInBits() !=
44705                   VT.getSizeInBits() ||
44706               Ty->getArrayNumElements() >
44707                   Ty->getArrayElementType()->getScalarSizeInBits())
44708             continue;
44709 
44710           // Check if the array's constant elements are suitable to our case.
44711           uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
44712           bool ConstantsMatch = true;
44713           for (uint64_t j = 0; j < ArrayElementCount; j++) {
44714             auto *Elem = cast<ConstantInt>(Init->getAggregateElement(j));
44715             if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
44716               ConstantsMatch = false;
44717               break;
44718             }
44719           }
44720           if (!ConstantsMatch)
44721             continue;
44722 
44723           // Do the transformation (For 32-bit type):
44724           // -> (and (load arr[idx]), inp)
44725           // <- (and (srl 0xFFFFFFFF, (sub 32, idx)))
44726           //    that will be replaced with one bzhi instruction.
44727           SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);
44728           SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32);
44729 
44730           // Get the Node which indexes into the array.
44731           SDValue Index = getIndexFromUnindexedLoad(Ld);
44732           if (!Index)
44733             return SDValue();
44734           Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32);
44735 
44736           SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index);
44737           Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub);
44738 
44739           SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
44740           SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
44741 
44742           return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
44743         }
44744       }
44745     }
44746   }
44747   return SDValue();
44748 }
44749 
44750 // Look for (and (bitcast (vXi1 (concat_vectors (vYi1 setcc), undef, ...))), C)
44751 // where C is a mask containing the same number of bits as the setcc and
44752 // where the setcc will freely zero the upper bits of the k-register. We can
44753 // replace the undef in the concat with 0s and remove the AND. This mainly
44754 // helps with v2i1/v4i1 setcc being cast to scalar.
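// e.g. (and (bitcast (v8i1 concat(v2i1 setcc, undef, undef, undef)) to i8), 3):
// the AND keeps only the two setcc bits, so the undef subvectors can be
// replaced with zeroes and the AND removed.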
44755 static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG,
44756                                              const X86Subtarget &Subtarget) {
44757   assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");
44758 
44759   EVT VT = N->getValueType(0);
44760 
44761   // Make sure this is an AND with constant. We will check the value of the
44762   // constant later.
44763   if (!isa<ConstantSDNode>(N->getOperand(1)))
44764     return SDValue();
44765 
44766   // This is implied by the ConstantSDNode.
44767   assert(!VT.isVector() && "Expected scalar VT!");
44768 
44769   if (N->getOperand(0).getOpcode() != ISD::BITCAST ||
44770       !N->getOperand(0).hasOneUse() ||
44771       !N->getOperand(0).getOperand(0).hasOneUse())
44772     return SDValue();
44773 
44774   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44775   SDValue Src = N->getOperand(0).getOperand(0);
44776   EVT SrcVT = Src.getValueType();
44777   if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::i1 ||
44778       !TLI.isTypeLegal(SrcVT))
44779     return SDValue();
44780 
44781   if (Src.getOpcode() != ISD::CONCAT_VECTORS)
44782     return SDValue();
44783 
44784   // We only care about the first subvector of the concat, we expect the
44785   // other subvectors to be ignored due to the AND if we make the change.
44786   SDValue SubVec = Src.getOperand(0);
44787   EVT SubVecVT = SubVec.getValueType();
44788 
44789   // First subvector should be a setcc with a legal result type. The RHS of the
44790   // AND should be a mask with this many bits.
44791   if (SubVec.getOpcode() != ISD::SETCC || !TLI.isTypeLegal(SubVecVT) ||
44792       !N->getConstantOperandAPInt(1).isMask(SubVecVT.getVectorNumElements()))
44793     return SDValue();
44794 
44795   EVT SetccVT = SubVec.getOperand(0).getValueType();
44796   if (!TLI.isTypeLegal(SetccVT) ||
44797       !(Subtarget.hasVLX() || SetccVT.is512BitVector()))
44798     return SDValue();
44799 
44800   if (!(Subtarget.hasBWI() || SetccVT.getScalarSizeInBits() >= 32))
44801     return SDValue();
44802 
44803   // We passed all the checks. Rebuild the concat_vectors with zeroes
44804   // and cast it back to VT.
44805   SDLoc dl(N);
44806   SmallVector<SDValue, 4> Ops(Src.getNumOperands(),
44807                               DAG.getConstant(0, dl, SubVecVT));
44808   Ops[0] = SubVec;
44809   SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT,
44810                                Ops);
44811   return DAG.getBitcast(VT, Concat);
44812 }
44813 
44814 static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
44815                           TargetLowering::DAGCombinerInfo &DCI,
44816                           const X86Subtarget &Subtarget) {
44817   EVT VT = N->getValueType(0);
44818 
44819   // If this is SSE1 only convert to FAND to avoid scalarization.
44820   if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
44821     return DAG.getBitcast(
44822         MVT::v4i32, DAG.getNode(X86ISD::FAND, SDLoc(N), MVT::v4f32,
44823                                 DAG.getBitcast(MVT::v4f32, N->getOperand(0)),
44824                                 DAG.getBitcast(MVT::v4f32, N->getOperand(1))));
44825   }
44826 
44827   // Use a 32-bit and+zext if upper bits known zero.
44828   if (VT == MVT::i64 && Subtarget.is64Bit() &&
44829       !isa<ConstantSDNode>(N->getOperand(1))) {
44830     APInt HiMask = APInt::getHighBitsSet(64, 32);
44831     if (DAG.MaskedValueIsZero(N->getOperand(1), HiMask) ||
44832         DAG.MaskedValueIsZero(N->getOperand(0), HiMask)) {
44833       SDLoc dl(N);
44834       SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N->getOperand(0));
44835       SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N->getOperand(1));
44836       return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
44837                          DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS));
44838     }
44839   }
44840 
44841   // Match all-of bool scalar reductions into a bitcast/movmsk + cmp.
44842   // TODO: Support multiple SrcOps.
44843   if (VT == MVT::i1) {
44844     SmallVector<SDValue, 2> SrcOps;
44845     SmallVector<APInt, 2> SrcPartials;
44846     if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps, &SrcPartials) &&
44847         SrcOps.size() == 1) {
44848       SDLoc dl(N);
44849       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44850       unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
44851       EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
44852       SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
44853       if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
44854         Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
44855       if (Mask) {
44856         assert(SrcPartials[0].getBitWidth() == NumElts &&
44857                "Unexpected partial reduction mask");
44858         SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
44859         Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
44860         return DAG.getSetCC(dl, MVT::i1, Mask, PartialBits, ISD::SETEQ);
44861       }
44862     }
44863   }
44864 
44865   if (SDValue V = combineScalarAndWithMaskSetcc(N, DAG, Subtarget))
44866     return V;
44867 
44868   if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
44869     return R;
44870 
44871   if (DCI.isBeforeLegalizeOps())
44872     return SDValue();
44873 
44874   if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
44875     return R;
44876 
44877   if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
44878     return FPLogic;
44879 
44880   if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
44881     return R;
44882 
44883   if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
44884     return ShiftRight;
44885 
44886   if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
44887     return R;
44888 
44889   // Attempt to recursively combine a bitmask AND with shuffles.
44890   if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
44891     SDValue Op(N, 0);
44892     if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
44893       return Res;
44894   }
44895 
44896   // Attempt to combine a scalar bitmask AND with an extracted shuffle.
44897   if ((VT.getScalarSizeInBits() % 8) == 0 &&
44898       N->getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
44899       isa<ConstantSDNode>(N->getOperand(0).getOperand(1))) {
44900     SDValue BitMask = N->getOperand(1);
44901     SDValue SrcVec = N->getOperand(0).getOperand(0);
44902     EVT SrcVecVT = SrcVec.getValueType();
44903 
44904     // Check that the constant bitmask masks whole bytes.
44905     APInt UndefElts;
44906     SmallVector<APInt, 64> EltBits;
44907     if (VT == SrcVecVT.getScalarType() &&
44908         N->getOperand(0)->isOnlyUserOf(SrcVec.getNode()) &&
44909         getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
44910         llvm::all_of(EltBits, [](const APInt &M) {
44911           return M.isNullValue() || M.isAllOnesValue();
44912         })) {
44913       unsigned NumElts = SrcVecVT.getVectorNumElements();
44914       unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
44915       unsigned Idx = N->getOperand(0).getConstantOperandVal(1);
44916 
44917       // Create a root shuffle mask from the byte mask and the extracted index.
44918       SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
44919       for (unsigned i = 0; i != Scale; ++i) {
44920         if (UndefElts[i])
44921           continue;
44922         int VecIdx = Scale * Idx + i;
44923         ShuffleMask[VecIdx] =
44924             EltBits[i].isNullValue() ? SM_SentinelZero : VecIdx;
44925       }
44926 
44927       if (SDValue Shuffle = combineX86ShufflesRecursively(
44928               {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 1,
44929               X86::MaxShuffleCombineDepth,
44930               /*HasVarMask*/ false, /*AllowVarCrossLaneMask*/ true,
44931               /*AllowVarPerLaneMask*/ true, DAG, Subtarget))
44932         return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), VT, Shuffle,
44933                            N->getOperand(0).getOperand(1));
44934     }
44935   }
44936 
44937   return SDValue();
44938 }
44939 
44940 // Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y))
44941 static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG,
44942                                      const X86Subtarget &Subtarget) {
44943   assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
44944 
44945   MVT VT = N->getSimpleValueType(0);
44946   if (!VT.isVector() || (VT.getScalarSizeInBits() % 8) != 0)
44947     return SDValue();
44948 
44949   SDValue N0 = peekThroughBitcasts(N->getOperand(0));
44950   SDValue N1 = peekThroughBitcasts(N->getOperand(1));
44951   if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
44952     return SDValue();
44953 
44954   // On XOP we'll lower to PCMOV so accept one use. With AVX512, we can use
44955   // VPTERNLOG. Otherwise only do this if either mask has multiple uses already.
44956   bool UseVPTERNLOG = (Subtarget.hasAVX512() && VT.is512BitVector()) ||
44957                       Subtarget.hasVLX();
44958   if (!(Subtarget.hasXOP() || UseVPTERNLOG ||
44959         !N0.getOperand(1).hasOneUse() || !N1.getOperand(1).hasOneUse()))
44960     return SDValue();
44961 
44962   // Attempt to extract constant byte masks.
44963   APInt UndefElts0, UndefElts1;
44964   SmallVector<APInt, 32> EltBits0, EltBits1;
44965   if (!getTargetConstantBitsFromNode(N0.getOperand(1), 8, UndefElts0, EltBits0,
44966                                      false, false))
44967     return SDValue();
44968   if (!getTargetConstantBitsFromNode(N1.getOperand(1), 8, UndefElts1, EltBits1,
44969                                      false, false))
44970     return SDValue();
44971 
44972   for (unsigned i = 0, e = EltBits0.size(); i != e; ++i) {
44973     // TODO - add UNDEF elts support.
44974     if (UndefElts0[i] || UndefElts1[i])
44975       return SDValue();
44976     if (EltBits0[i] != ~EltBits1[i])
44977       return SDValue();
44978   }
44979 
44980   SDLoc DL(N);
44981 
44982   if (UseVPTERNLOG) {
44983     // Emit a VPTERNLOG node directly.
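    // Immediate 0xCA encodes the truth table (A & B) | (~A & C), i.e. bitwise
    // "select B where the mask A is set, C where it is clear".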
44984     SDValue A = DAG.getBitcast(VT, N0.getOperand(1));
44985     SDValue B = DAG.getBitcast(VT, N0.getOperand(0));
44986     SDValue C = DAG.getBitcast(VT, N1.getOperand(0));
44987     SDValue Imm = DAG.getTargetConstant(0xCA, DL, MVT::i8);
44988     return DAG.getNode(X86ISD::VPTERNLOG, DL, VT, A, B, C, Imm);
44989   }
44990 
44991   SDValue X = N->getOperand(0);
44992   SDValue Y =
44993       DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)),
44994                   DAG.getBitcast(VT, N1.getOperand(0)));
44995   return DAG.getNode(ISD::OR, DL, VT, X, Y);
44996 }
44997 
44998 // Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
44999 static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
45000   if (N->getOpcode() != ISD::OR)
45001     return false;
45002 
45003   SDValue N0 = N->getOperand(0);
45004   SDValue N1 = N->getOperand(1);
45005 
45006   // Canonicalize AND to LHS.
45007   if (N1.getOpcode() == ISD::AND)
45008     std::swap(N0, N1);
45009 
45010   // Attempt to match OR(AND(M,Y),ANDNP(M,X)).
45011   if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
45012     return false;
45013 
45014   Mask = N1.getOperand(0);
45015   X = N1.getOperand(1);
45016 
45017   // Check to see if the mask appeared in both the AND and ANDNP.
45018   if (N0.getOperand(0) == Mask)
45019     Y = N0.getOperand(1);
45020   else if (N0.getOperand(1) == Mask)
45021     Y = N0.getOperand(0);
45022   else
45023     return false;
45024 
45025   // TODO: Attempt to match against AND(XOR(-1,M),Y) as well; waiting for the
45026   // ANDNP combine lets other combines happen first that then prevent matching.
45027   return true;
45028 }
45029 
45030 // Try to fold:
45031 //   (or (and (m, y), (pandn m, x)))
45032 // into:
45033 //   (vselect m, x, y)
45034 // As a special case, try to fold:
45035 //   (or (and (m, (sub 0, x)), (pandn m, x)))
45036 // into:
45037 //   (sub (xor X, M), M)
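// The special case holds because (xor X, M) - M equals X when M == 0 and
// equals ~X + 1 == -X when M == -1 (all ones).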
45038 static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
45039                                             const X86Subtarget &Subtarget) {
45040   assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
45041 
45042   EVT VT = N->getValueType(0);
45043   if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
45044         (VT.is256BitVector() && Subtarget.hasInt256())))
45045     return SDValue();
45046 
45047   SDValue X, Y, Mask;
45048   if (!matchLogicBlend(N, X, Y, Mask))
45049     return SDValue();
45050 
45051   // Validate that X, Y, and Mask are bitcasts, and see through them.
45052   Mask = peekThroughBitcasts(Mask);
45053   X = peekThroughBitcasts(X);
45054   Y = peekThroughBitcasts(Y);
45055 
45056   EVT MaskVT = Mask.getValueType();
45057   unsigned EltBits = MaskVT.getScalarSizeInBits();
45058 
45059   // TODO: Attempt to handle floating point cases as well?
45060   if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
45061     return SDValue();
45062 
45063   SDLoc DL(N);
45064 
45065   // Attempt to combine to conditional negate: (sub (xor X, M), M)
45066   if (SDValue Res = combineLogicBlendIntoConditionalNegate(VT, Mask, X, Y, DL,
45067                                                            DAG, Subtarget))
45068     return Res;
45069 
45070   // PBLENDVB is only available on SSE 4.1.
45071   if (!Subtarget.hasSSE41())
45072     return SDValue();
45073 
45074   // If we have VPTERNLOG we should prefer that since PBLENDVB is multiple uops.
45075   if (Subtarget.hasVLX())
45076     return SDValue();
45077 
45078   MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8;
45079 
45080   X = DAG.getBitcast(BlendVT, X);
45081   Y = DAG.getBitcast(BlendVT, Y);
45082   Mask = DAG.getBitcast(BlendVT, Mask);
45083   Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
45084   return DAG.getBitcast(VT, Mask);
45085 }
45086 
45087 // Helper function for combineOrCmpEqZeroToCtlzSrl
45088 // Transforms:
45089 //   seteq(cmp x, 0)
45090 //   into:
45091 //   srl(ctlz x), log2(bitsize(x))
45092 // Input pattern is checked by caller.
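// e.g. for i32 x, lzcnt(x) == 32 iff x == 0, so (lzcnt(x) >> 5) is exactly the
// zero-extended value of (x == 0).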
45093 static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy,
45094                                           SelectionDAG &DAG) {
45095   SDValue Cmp = Op.getOperand(1);
45096   EVT VT = Cmp.getOperand(0).getValueType();
45097   unsigned Log2b = Log2_32(VT.getSizeInBits());
45098   SDLoc dl(Op);
45099   SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
45100   // The result of the shift is true or false, and on X86, the 32-bit
45101   // encoding of shr and lzcnt is more desirable.
45102   SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
45103   SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
45104                             DAG.getConstant(Log2b, dl, MVT::i8));
45105   return DAG.getZExtOrTrunc(Scc, dl, ExtTy);
45106 }
45107 
45108 // Try to transform:
45109 //   zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
45110 //   into:
45111 //   srl(or(ctlz(x), ctlz(y)), log2(bitsize(x)))
45112 // Will also attempt to match more generic cases, eg:
45113 //   zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
45114 // Only applies if the target supports the FastLZCNT feature.
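// Merging through OR is sound because, for an N-bit value, only a zero input
// makes ctlz produce N, the sole result with bit log2(N) set; OR-ing the ctlz
// values before the shift therefore preserves the any-of-zero semantics.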
45115 static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
45116                                            TargetLowering::DAGCombinerInfo &DCI,
45117                                            const X86Subtarget &Subtarget) {
45118   if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
45119     return SDValue();
45120 
45121   auto isORCandidate = [](SDValue N) {
45122     return (N->getOpcode() == ISD::OR && N->hasOneUse());
45123   };
45124 
45125   // Check that the zero extend is extending to 32 bits or more. The code
45126   // generated by srl(ctlz) for 16-bit or less variants of the pattern would
45127   // require extra instructions to clear the upper bits.
45128   if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
45129       !isORCandidate(N->getOperand(0)))
45130     return SDValue();
45131 
45132   // Check the node matches: setcc(eq, cmp 0)
45133   auto isSetCCCandidate = [](SDValue N) {
45134     return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
45135            X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
45136            N->getOperand(1).getOpcode() == X86ISD::CMP &&
45137            isNullConstant(N->getOperand(1).getOperand(1)) &&
45138            N->getOperand(1).getValueType().bitsGE(MVT::i32);
45139   };
45140 
45141   SDNode *OR = N->getOperand(0).getNode();
45142   SDValue LHS = OR->getOperand(0);
45143   SDValue RHS = OR->getOperand(1);
45144 
45145   // Save nodes matching or(or, setcc(eq, cmp 0)).
45146   SmallVector<SDNode *, 2> ORNodes;
45147   while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
45148           (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
45149     ORNodes.push_back(OR);
45150     OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
45151     LHS = OR->getOperand(0);
45152     RHS = OR->getOperand(1);
45153   }
45154 
45155   // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
45156   if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
45157       !isORCandidate(SDValue(OR, 0)))
45158     return SDValue();
45159 
45160   // We have an or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern; try to lower it
45161   // to or(srl(ctlz), srl(ctlz)).
45162   //
45163   // The DAG combiner can then fold it into:
45164   // srl(or(ctlz, ctlz)).
45165   EVT VT = OR->getValueType(0);
45166   SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG);
45167   SDValue Ret, NewRHS;
45168   if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG)))
45169     Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS);
45170 
45171   if (!Ret)
45172     return SDValue();
45173 
45174   // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
45175   while (ORNodes.size() > 0) {
45176     OR = ORNodes.pop_back_val();
45177     LHS = OR->getOperand(0);
45178     RHS = OR->getOperand(1);
45179     // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
45180     if (RHS->getOpcode() == ISD::OR)
45181       std::swap(LHS, RHS);
45182     NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);
45183     if (!NewRHS)
45184       return SDValue();
45185     Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS);
45186   }
45187 
45188   if (Ret)
45189     Ret = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
45190 
45191   return Ret;
45192 }
45193 
45194 static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
45195                          TargetLowering::DAGCombinerInfo &DCI,
45196                          const X86Subtarget &Subtarget) {
45197   SDValue N0 = N->getOperand(0);
45198   SDValue N1 = N->getOperand(1);
45199   EVT VT = N->getValueType(0);
45200 
45201   // If this is SSE1 only convert to FOR to avoid scalarization.
45202   if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
45203     return DAG.getBitcast(MVT::v4i32,
45204                           DAG.getNode(X86ISD::FOR, SDLoc(N), MVT::v4f32,
45205                                       DAG.getBitcast(MVT::v4f32, N0),
45206                                       DAG.getBitcast(MVT::v4f32, N1)));
45207   }
45208 
45209   // Match any-of bool scalar reductions into a bitcast/movmsk + cmp.
45210   // TODO: Support multiple SrcOps.
45211   if (VT == MVT::i1) {
45212     SmallVector<SDValue, 2> SrcOps;
45213     SmallVector<APInt, 2> SrcPartials;
45214     if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps, &SrcPartials) &&
45215         SrcOps.size() == 1) {
45216       SDLoc dl(N);
45217       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45218       unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
45219       EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
45220       SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
45221       if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
45222         Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
45223       if (Mask) {
45224         assert(SrcPartials[0].getBitWidth() == NumElts &&
45225                "Unexpected partial reduction mask");
45226         SDValue ZeroBits = DAG.getConstant(0, dl, MaskVT);
45227         SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
45228         Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
45229         return DAG.getSetCC(dl, MVT::i1, Mask, ZeroBits, ISD::SETNE);
45230       }
45231     }
45232   }
45233 
45234   if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
45235     return R;
45236 
45237   if (DCI.isBeforeLegalizeOps())
45238     return SDValue();
45239 
45240   if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
45241     return R;
45242 
45243   if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
45244     return FPLogic;
45245 
45246   if (SDValue R = canonicalizeBitSelect(N, DAG, Subtarget))
45247     return R;
45248 
45249   if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
45250     return R;
45251 
45252   // Combine OR(X,KSHIFTL(Y,Elts/2)) -> CONCAT_VECTORS(X,Y) == KUNPCK(X,Y).
45253   // Combine OR(KSHIFTL(X,Elts/2),Y) -> CONCAT_VECTORS(Y,X) == KUNPCK(Y,X).
45254   // iff the upper elements of the non-shifted arg are zero.
45255   // KUNPCK require 16+ bool vector elements.
45256   if (N0.getOpcode() == X86ISD::KSHIFTL || N1.getOpcode() == X86ISD::KSHIFTL) {
45257     unsigned NumElts = VT.getVectorNumElements();
45258     unsigned HalfElts = NumElts / 2;
45259     APInt UpperElts = APInt::getHighBitsSet(NumElts, HalfElts);
45260     if (NumElts >= 16 && N1.getOpcode() == X86ISD::KSHIFTL &&
45261         N1.getConstantOperandAPInt(1) == HalfElts &&
45262         DAG.MaskedValueIsZero(N0, APInt(1, 1), UpperElts)) {
45263       SDLoc dl(N);
45264       return DAG.getNode(
45265           ISD::CONCAT_VECTORS, dl, VT,
45266           extractSubVector(N0, 0, DAG, dl, HalfElts),
45267           extractSubVector(N1.getOperand(0), 0, DAG, dl, HalfElts));
45268     }
45269     if (NumElts >= 16 && N0.getOpcode() == X86ISD::KSHIFTL &&
45270         N0.getConstantOperandAPInt(1) == HalfElts &&
45271         DAG.MaskedValueIsZero(N1, APInt(1, 1), UpperElts)) {
45272       SDLoc dl(N);
45273       return DAG.getNode(
45274           ISD::CONCAT_VECTORS, dl, VT,
45275           extractSubVector(N1, 0, DAG, dl, HalfElts),
45276           extractSubVector(N0.getOperand(0), 0, DAG, dl, HalfElts));
45277     }
45278   }
45279 
45280   // Attempt to recursively combine an OR of shuffles.
45281   if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
45282     SDValue Op(N, 0);
45283     if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
45284       return Res;
45285   }
45286 
45287   return SDValue();
45288 }
45289 
45290 /// Try to turn tests against the signbit in the form of:
45291 ///   XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
45292 /// into:
45293 ///   SETGT(X, -1)
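/// i.e. "(X >> (bitwidth - 1)) ^ 1" (logical shift) computes "X >= 0", which
/// is exactly SETGT(X, -1).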
45294 static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
45295   // This is only worth doing if the output type is i8 or i1.
45296   EVT ResultType = N->getValueType(0);
45297   if (ResultType != MVT::i8 && ResultType != MVT::i1)
45298     return SDValue();
45299 
45300   SDValue N0 = N->getOperand(0);
45301   SDValue N1 = N->getOperand(1);
45302 
45303   // We should be performing an xor against a truncated shift.
45304   if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
45305     return SDValue();
45306 
45307   // Make sure we are performing an xor against one.
45308   if (!isOneConstant(N1))
45309     return SDValue();
45310 
45311   // SetCC on x86 zero extends so only act on this if it's a logical shift.
45312   SDValue Shift = N0.getOperand(0);
45313   if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
45314     return SDValue();
45315 
45316   // Make sure we are truncating from one of i16, i32 or i64.
45317   EVT ShiftTy = Shift.getValueType();
45318   if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
45319     return SDValue();
45320 
45321   // Make sure the shift amount extracts the sign bit.
45322   if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
45323       Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1))
45324     return SDValue();
45325 
45326   // Create a greater-than comparison against -1.
45327   // N.B. Using SETGE against 0 works but we want a canonical-looking
45328   // comparison; using SETGT matches up with TranslateX86CC.
45329   SDLoc DL(N);
45330   SDValue ShiftOp = Shift.getOperand(0);
45331   EVT ShiftOpTy = ShiftOp.getValueType();
45332   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45333   EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
45334                                                *DAG.getContext(), ResultType);
45335   SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
45336                               DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
45337   if (SetCCResultType != ResultType)
45338     Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
45339   return Cond;
45340 }
45341 
45342 /// Turn vector tests of the signbit in the form of:
45343 ///   xor (sra X, elt_size(X)-1), -1
45344 /// into:
45345 ///   pcmpgt X, -1
45346 ///
45347 /// This should be called before type legalization because the pattern may not
45348 /// persist after that.
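/// e.g. for v4i32, (xor (sra X, 31), -1) yields all-ones exactly for the
/// non-negative elements, which is what (pcmpgt X, -1) computes per lane.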
45349 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
45350                                          const X86Subtarget &Subtarget) {
45351   EVT VT = N->getValueType(0);
45352   if (!VT.isSimple())
45353     return SDValue();
45354 
45355   switch (VT.getSimpleVT().SimpleTy) {
45356   default: return SDValue();
45357   case MVT::v16i8:
45358   case MVT::v8i16:
45359   case MVT::v4i32:
45360   case MVT::v2i64: if (!Subtarget.hasSSE2()) return SDValue(); break;
45361   case MVT::v32i8:
45362   case MVT::v16i16:
45363   case MVT::v8i32:
45364   case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
45365   }
45366 
45367   // There must be a shift right algebraic before the xor, and the xor must be a
45368   // 'not' operation.
45369   SDValue Shift = N->getOperand(0);
45370   SDValue Ones = N->getOperand(1);
45371   if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
45372       !ISD::isBuildVectorAllOnes(Ones.getNode()))
45373     return SDValue();
45374 
45375   // The shift should be smearing the sign bit across each vector element.
45376   auto *ShiftAmt =
45377       isConstOrConstSplat(Shift.getOperand(1), /*AllowUndefs*/ true);
45378   if (!ShiftAmt ||
45379       ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1))
45380     return SDValue();
45381 
45382   // Create a greater-than comparison against -1. We don't use the more obvious
45383   // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
45384   return DAG.getSetCC(SDLoc(N), VT, Shift.getOperand(0), Ones, ISD::SETGT);
45385 }
45386 
45387 /// Detect patterns of truncation with unsigned saturation:
45388 ///
45389 /// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
45390 ///   Return the source value x to be truncated or SDValue() if the pattern was
45391 ///   not matched.
45392 ///
45393 /// 2. (truncate (smin (smax (x, C1), C2)) to dest_type),
45394 ///   where C1 >= 0 and C2 is unsigned max of destination type.
45395 ///
45396 ///    (truncate (smax (smin (x, C2), C1)) to dest_type)
45397 ///   where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2.
45398 ///
45399 ///   These two patterns are equivalent to:
45400 ///   (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type)
45401 ///   So return the smax(x, C1) value to be truncated or SDValue() if the
45402 ///   pattern was not matched.
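/// For example, for an i32 -> i16 truncation, pattern 1 is
/// (truncate (umin x, 65535)) and pattern 2 is
/// (truncate (smin (smax x, C1), 65535)) with C1 >= 0.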
45403 static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
45404                                  const SDLoc &DL) {
45405   EVT InVT = In.getValueType();
45406 
45407   // Saturation with truncation. We truncate from InVT to VT.
45408   assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() &&
45409          "Unexpected types for truncate operation");
45410 
45411   // Match min/max and return limit value as a parameter.
45412   auto MatchMinMax = [](SDValue V, unsigned Opcode, APInt &Limit) -> SDValue {
45413     if (V.getOpcode() == Opcode &&
45414         ISD::isConstantSplatVector(V.getOperand(1).getNode(), Limit))
45415       return V.getOperand(0);
45416     return SDValue();
45417   };
45418 
45419   APInt C1, C2;
45420   if (SDValue UMin = MatchMinMax(In, ISD::UMIN, C2))
45421     // C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according
45422     // to the element size of the destination type.
45423     if (C2.isMask(VT.getScalarSizeInBits()))
45424       return UMin;
45425 
45426   if (SDValue SMin = MatchMinMax(In, ISD::SMIN, C2))
45427     if (MatchMinMax(SMin, ISD::SMAX, C1))
45428       if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()))
45429         return SMin;
45430 
45431   if (SDValue SMax = MatchMinMax(In, ISD::SMAX, C1))
45432     if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, C2))
45433       if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) &&
45434           C2.uge(C1)) {
45435         return DAG.getNode(ISD::SMAX, DL, InVT, SMin, In.getOperand(1));
45436       }
45437 
45438   return SDValue();
45439 }
45440 
45441 /// Detect patterns of truncation with signed saturation:
45442 /// (truncate (smin ((smax (x, signed_min_of_dest_type)),
45443 ///                  signed_max_of_dest_type)) to dest_type)
45444 /// or:
45445 /// (truncate (smax ((smin (x, signed_max_of_dest_type)),
45446 ///                  signed_min_of_dest_type)) to dest_type).
45447 /// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type].
45448 /// Return the source value to be truncated or SDValue() if the pattern was not
45449 /// matched.
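// Illustrative example (made-up values) for a v8i32 -> v8i16 truncate:
//   (truncate (smin (smax (x, <-32768 x 8>), <32767 x 8>)) to v8i16)
// matches the signed-saturation pattern, so x is returned. With MatchPackUS
// the clamp range would instead have to be [0, 65535].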
45450 static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
45451   unsigned NumDstBits = VT.getScalarSizeInBits();
45452   unsigned NumSrcBits = In.getScalarValueSizeInBits();
45453   assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");
45454 
45455   auto MatchMinMax = [](SDValue V, unsigned Opcode,
45456                         const APInt &Limit) -> SDValue {
45457     APInt C;
45458     if (V.getOpcode() == Opcode &&
45459         ISD::isConstantSplatVector(V.getOperand(1).getNode(), C) && C == Limit)
45460       return V.getOperand(0);
45461     return SDValue();
45462   };
45463 
45464   APInt SignedMax, SignedMin;
45465   if (MatchPackUS) {
45466     SignedMax = APInt::getAllOnesValue(NumDstBits).zext(NumSrcBits);
45467     SignedMin = APInt(NumSrcBits, 0);
45468   } else {
45469     SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
45470     SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);
45471   }
45472 
45473   if (SDValue SMin = MatchMinMax(In, ISD::SMIN, SignedMax))
45474     if (SDValue SMax = MatchMinMax(SMin, ISD::SMAX, SignedMin))
45475       return SMax;
45476 
45477   if (SDValue SMax = MatchMinMax(In, ISD::SMAX, SignedMin))
45478     if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, SignedMax))
45479       return SMin;
45480 
45481   return SDValue();
45482 }
45483 
45484 static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
45485                                       SelectionDAG &DAG,
45486                                       const X86Subtarget &Subtarget) {
45487   if (!Subtarget.hasSSE2() || !VT.isVector())
45488     return SDValue();
45489 
45490   EVT SVT = VT.getVectorElementType();
45491   EVT InVT = In.getValueType();
45492   EVT InSVT = InVT.getVectorElementType();
45493 
45494   // If we're clamping a signed 32-bit vector to 0-255 and the 32-bit vector is
45495   // split across two registers, we can use a packusdw+perm to clamp to 0-65535
45496   // and concatenate at the same time. Then we can use a final vpmovuswb to
45497   // clip to 0-255.
45498   if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
45499       InVT == MVT::v16i32 && VT == MVT::v16i8) {
45500     if (auto USatVal = detectSSatPattern(In, VT, true)) {
45501       // Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB.
45502       SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal,
45503                                            DL, DAG, Subtarget);
45504       assert(Mid && "Failed to pack!");
45505       return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Mid);
45506     }
45507   }
45508 
45509   // vXi32 truncate instructions are available with AVX512F.
45510   // vXi16 truncate instructions are only available with AVX512BW.
45511   // For 256-bit or smaller vectors, we require VLX.
45512   // FIXME: We could widen truncates to 512 to remove the VLX restriction.
45513   // If the result type is 256 bits or larger and we have disabled 512-bit
45514   // registers, we should go ahead and use the pack instructions if possible.
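  // For instance (illustrative), a v16i32 -> v16i8 saturating truncate on an
  // AVX512F target sets PreferAVX512: the 512-bit input needs no VLX, so the
  // PACK lowering below is skipped in favor of the VTRUNCS/VTRUNCUS path.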
45515   bool PreferAVX512 = ((Subtarget.hasAVX512() && InSVT == MVT::i32) ||
45516                        (Subtarget.hasBWI() && InSVT == MVT::i16)) &&
45517                       (InVT.getSizeInBits() > 128) &&
45518                       (Subtarget.hasVLX() || InVT.getSizeInBits() > 256) &&
45519                       !(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256);
45520 
45521   if (isPowerOf2_32(VT.getVectorNumElements()) && !PreferAVX512 &&
45522       VT.getSizeInBits() >= 64 &&
45523       (SVT == MVT::i8 || SVT == MVT::i16) &&
45524       (InSVT == MVT::i16 || InSVT == MVT::i32)) {
45525     if (auto USatVal = detectSSatPattern(In, VT, true)) {
45526       // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
45527       // Only do this when the result is at least 64 bits or we'll be leaving
45528       // dangling PACKSSDW nodes.
45529       if (SVT == MVT::i8 && InSVT == MVT::i32) {
45530         EVT MidVT = VT.changeVectorElementType(MVT::i16);
45531         SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
45532                                              DAG, Subtarget);
45533         assert(Mid && "Failed to pack!");
45534         SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
45535                                            Subtarget);
45536         assert(V && "Failed to pack!");
45537         return V;
45538       } else if (SVT == MVT::i8 || Subtarget.hasSSE41())
45539         return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
45540                                       Subtarget);
45541     }
45542     if (auto SSatVal = detectSSatPattern(In, VT))
45543       return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
45544                                     Subtarget);
45545   }
45546 
45547   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45548   if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 &&
45549       Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI()) &&
45550       (SVT == MVT::i32 || SVT == MVT::i16 || SVT == MVT::i8)) {
45551     unsigned TruncOpc = 0;
45552     SDValue SatVal;
45553     if (auto SSatVal = detectSSatPattern(In, VT)) {
45554       SatVal = SSatVal;
45555       TruncOpc = X86ISD::VTRUNCS;
45556     } else if (auto USatVal = detectUSatPattern(In, VT, DAG, DL)) {
45557       SatVal = USatVal;
45558       TruncOpc = X86ISD::VTRUNCUS;
45559     }
45560     if (SatVal) {
45561       unsigned ResElts = VT.getVectorNumElements();
45562       // If the input type is less than 512 bits and we don't have VLX, we need
45563       // to widen to 512 bits.
45564       if (!Subtarget.hasVLX() && !InVT.is512BitVector()) {
45565         unsigned NumConcats = 512 / InVT.getSizeInBits();
45566         ResElts *= NumConcats;
45567         SmallVector<SDValue, 4> ConcatOps(NumConcats, DAG.getUNDEF(InVT));
45568         ConcatOps[0] = SatVal;
45569         InVT = EVT::getVectorVT(*DAG.getContext(), InSVT,
45570                                 NumConcats * InVT.getVectorNumElements());
45571         SatVal = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, ConcatOps);
45572       }
45573       // Widen the result if it's narrower than 128 bits.
45574       if (ResElts * SVT.getSizeInBits() < 128)
45575         ResElts = 128 / SVT.getSizeInBits();
45576       EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), SVT, ResElts);
45577       SDValue Res = DAG.getNode(TruncOpc, DL, TruncVT, SatVal);
45578       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
45579                          DAG.getIntPtrConstant(0, DL));
45580     }
45581   }
45582 
45583   return SDValue();
45584 }
45585 
45586 /// This function detects the AVG pattern between vectors of unsigned i8/i16,
45587 /// which is c = (a + b + 1) / 2, and replaces this operation with the efficient
45588 /// X86ISD::AVG instruction.
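/// For example (illustrative): for unsigned i8 inputs a = 254 and b = 1,
/// c = (254 + 1 + 1) / 2 = 128. The intermediate sum needs 9 bits, which is
/// why the pattern below is matched on a wider zero-extended type before it
/// is replaced by the byte/word X86ISD::AVG node.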
45589 static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
45590                                 const X86Subtarget &Subtarget,
45591                                 const SDLoc &DL) {
45592   if (!VT.isVector())
45593     return SDValue();
45594   EVT InVT = In.getValueType();
45595   unsigned NumElems = VT.getVectorNumElements();
45596 
45597   EVT ScalarVT = VT.getVectorElementType();
45598   if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) && NumElems >= 2))
45599     return SDValue();
45600 
45601   // InScalarVT is the intermediate type in the AVG pattern and it should be
45602   // greater than the original input type (i8/i16).
45603   EVT InScalarVT = InVT.getVectorElementType();
45604   if (InScalarVT.getFixedSizeInBits() <= ScalarVT.getFixedSizeInBits())
45605     return SDValue();
45606 
45607   if (!Subtarget.hasSSE2())
45608     return SDValue();
45609 
45610   // Detect the following pattern:
45611   //
45612   //   %1 = zext <N x i8> %a to <N x i32>
45613   //   %2 = zext <N x i8> %b to <N x i32>
45614   //   %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
45615   //   %4 = add nuw nsw <N x i32> %3, %2
45616   //   %5 = lshr <N x i32> %4, <i32 1 x N>
45617   //   %6 = trunc <N x i32> %5 to <N x i8>
45618   //
45619   // In AVX512, the last instruction can also be a trunc store.
45620   if (In.getOpcode() != ISD::SRL)
45621     return SDValue();
45622 
45623   // A lambda checking whether the given SDValue is a constant vector and each
45624   // element is in the range [Min, Max].
45625   auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
45626     return ISD::matchUnaryPredicate(V, [Min, Max](ConstantSDNode *C) {
45627       return !(C->getAPIntValue().ult(Min) || C->getAPIntValue().ugt(Max));
45628     });
45629   };
45630 
45631   // Check if each element of the vector is right-shifted by one.
45632   SDValue LHS = In.getOperand(0);
45633   SDValue RHS = In.getOperand(1);
45634   if (!IsConstVectorInRange(RHS, 1, 1))
45635     return SDValue();
45636   if (LHS.getOpcode() != ISD::ADD)
45637     return SDValue();
45638 
45639   // Detect a pattern of a + b + 1 where the order doesn't matter.
45640   SDValue Operands[3];
45641   Operands[0] = LHS.getOperand(0);
45642   Operands[1] = LHS.getOperand(1);
45643 
45644   auto AVGBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
45645                        ArrayRef<SDValue> Ops) {
45646     return DAG.getNode(X86ISD::AVG, DL, Ops[0].getValueType(), Ops);
45647   };
45648 
45649   auto AVGSplitter = [&](SDValue Op0, SDValue Op1) {
45650     // Pad to a power-of-2 vector, split+apply and extract the original vector.
45651     unsigned NumElemsPow2 = PowerOf2Ceil(NumElems);
45652     EVT Pow2VT = EVT::getVectorVT(*DAG.getContext(), ScalarVT, NumElemsPow2);
45653     if (NumElemsPow2 != NumElems) {
45654       SmallVector<SDValue, 32> Ops0(NumElemsPow2, DAG.getUNDEF(ScalarVT));
45655       SmallVector<SDValue, 32> Ops1(NumElemsPow2, DAG.getUNDEF(ScalarVT));
45656       for (unsigned i = 0; i != NumElems; ++i) {
45657         SDValue Idx = DAG.getIntPtrConstant(i, DL);
45658         Ops0[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Op0, Idx);
45659         Ops1[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Op1, Idx);
45660       }
45661       Op0 = DAG.getBuildVector(Pow2VT, DL, Ops0);
45662       Op1 = DAG.getBuildVector(Pow2VT, DL, Ops1);
45663     }
45664     SDValue Res =
45665         SplitOpsAndApply(DAG, Subtarget, DL, Pow2VT, {Op0, Op1}, AVGBuilder);
45666     if (NumElemsPow2 == NumElems)
45667       return Res;
45668     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
45669                        DAG.getIntPtrConstant(0, DL));
45670   };
45671 
45672   // Take care of the case when one of the operands is a constant vector whose
45673   // element is in the range [1, 256] (or [1, 65536] for i16 elements).
45674   if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
45675       Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
45676       Operands[0].getOperand(0).getValueType() == VT) {
45677     // The pattern is detected. Subtract one from the constant vector, then
45678     // demote it and emit X86ISD::AVG instruction.
45679     SDValue VecOnes = DAG.getConstant(1, DL, InVT);
45680     Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
45681     Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
45682     return AVGSplitter(Operands[0].getOperand(0), Operands[1]);
45683   }
45684 
45685   // Matches 'add-like' patterns: add(Op0,Op1) and zext(or(Op0,Op1)).
45686   // Match the or case only if it's 'add-like', i.e. it can be replaced by an add.
45687   auto FindAddLike = [&](SDValue V, SDValue &Op0, SDValue &Op1) {
45688     if (ISD::ADD == V.getOpcode()) {
45689       Op0 = V.getOperand(0);
45690       Op1 = V.getOperand(1);
45691       return true;
45692     }
45693     if (ISD::ZERO_EXTEND != V.getOpcode())
45694       return false;
45695     V = V.getOperand(0);
45696     if (V.getValueType() != VT || ISD::OR != V.getOpcode() ||
45697         !DAG.haveNoCommonBitsSet(V.getOperand(0), V.getOperand(1)))
45698       return false;
45699     Op0 = V.getOperand(0);
45700     Op1 = V.getOperand(1);
45701     return true;
45702   };
45703 
45704   SDValue Op0, Op1;
45705   if (FindAddLike(Operands[0], Op0, Op1))
45706     std::swap(Operands[0], Operands[1]);
45707   else if (!FindAddLike(Operands[1], Op0, Op1))
45708     return SDValue();
45709   Operands[2] = Op0;
45710   Operands[1] = Op1;
45711 
45712   // Now we have three operands of two additions. Check that one of them is a
45713   // constant vector with ones, and the other two can be promoted from i8/i16.
45714   for (int i = 0; i < 3; ++i) {
45715     if (!IsConstVectorInRange(Operands[i], 1, 1))
45716       continue;
45717     std::swap(Operands[i], Operands[2]);
45718 
45719     // Check if Operands[0] and Operands[1] are results of type promotion.
45720     for (int j = 0; j < 2; ++j)
45721       if (Operands[j].getValueType() != VT) {
45722         if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
45723             Operands[j].getOperand(0).getValueType() != VT)
45724           return SDValue();
45725         Operands[j] = Operands[j].getOperand(0);
45726       }
45727 
45728     // The pattern is detected, emit X86ISD::AVG instruction(s).
45729     return AVGSplitter(Operands[0], Operands[1]);
45730   }
45731 
45732   return SDValue();
45733 }
45734 
45735 static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
45736                            TargetLowering::DAGCombinerInfo &DCI,
45737                            const X86Subtarget &Subtarget) {
45738   LoadSDNode *Ld = cast<LoadSDNode>(N);
45739   EVT RegVT = Ld->getValueType(0);
45740   EVT MemVT = Ld->getMemoryVT();
45741   SDLoc dl(Ld);
45742   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45743 
45744   // For chips with slow 32-byte unaligned loads, break the 32-byte operation
45745   // into two 16-byte operations. Also split non-temporal aligned loads on
45746   // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
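  // For example (illustrative): a slow-unaligned v8f32 load becomes two v4f32
  // loads from base and base+16, a TokenFactor joining their chains, and a
  // concat_vectors of the two results.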
45747   ISD::LoadExtType Ext = Ld->getExtensionType();
45748   bool Fast;
45749   if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
45750       Ext == ISD::NON_EXTLOAD &&
45751       ((Ld->isNonTemporal() && !Subtarget.hasInt256() &&
45752         Ld->getAlignment() >= 16) ||
45753        (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
45754                                *Ld->getMemOperand(), &Fast) &&
45755         !Fast))) {
45756     unsigned NumElems = RegVT.getVectorNumElements();
45757     if (NumElems < 2)
45758       return SDValue();
45759 
45760     unsigned HalfOffset = 16;
45761     SDValue Ptr1 = Ld->getBasePtr();
45762     SDValue Ptr2 =
45763         DAG.getMemBasePlusOffset(Ptr1, TypeSize::Fixed(HalfOffset), dl);
45764     EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
45765                                   NumElems / 2);
45766     SDValue Load1 =
45767         DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(),
45768                     Ld->getOriginalAlign(),
45769                     Ld->getMemOperand()->getFlags());
45770     SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2,
45771                                 Ld->getPointerInfo().getWithOffset(HalfOffset),
45772                                 Ld->getOriginalAlign(),
45773                                 Ld->getMemOperand()->getFlags());
45774     SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
45775                              Load1.getValue(1), Load2.getValue(1));
45776 
45777     SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
45778     return DCI.CombineTo(N, NewVec, TF, true);
45779   }
45780 
45781   // Bool vector load - attempt to cast to an integer, as we have good
45782   // (vXiY *ext(vXi1 bitcast(iX))) handling.
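  // For example (illustrative): a non-extending v16i1 load on a pre-AVX512
  // target becomes an i16 load followed by a bitcast back to v16i1.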
45783   if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() &&
45784       RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) {
45785     unsigned NumElts = RegVT.getVectorNumElements();
45786     EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
45787     if (TLI.isTypeLegal(IntVT)) {
45788       SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
45789                                     Ld->getPointerInfo(),
45790                                     Ld->getOriginalAlign(),
45791                                     Ld->getMemOperand()->getFlags());
45792       SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad);
45793       return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true);
45794     }
45795   }
45796 
45797   // If the same pointer is also loaded as a subvector broadcast to a wider
45798   // type, then just extract the lowest subvector.
45799   if (Ext == ISD::NON_EXTLOAD && Subtarget.hasAVX() && Ld->isSimple() &&
45800       (RegVT.is128BitVector() || RegVT.is256BitVector())) {
45801     SDValue Ptr = Ld->getBasePtr();
45802     SDValue Chain = Ld->getChain();
45803     for (SDNode *User : Ptr->uses()) {
45804       if (User != N && User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
45805           cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
45806           cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
45807           cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
45808               MemVT.getSizeInBits() &&
45809           !User->hasAnyUseOfValue(1) &&
45810           User->getValueSizeInBits(0).getFixedSize() >
45811               RegVT.getFixedSizeInBits()) {
45812         SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
45813                                            RegVT.getSizeInBits());
45814         Extract = DAG.getBitcast(RegVT, Extract);
45815         return DCI.CombineTo(N, Extract, SDValue(User, 1));
45816       }
45817     }
45818   }
45819 
45820   // Cast ptr32 and ptr64 pointers to the default address space before a load.
45821   unsigned AddrSpace = Ld->getAddressSpace();
45822   if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
45823       AddrSpace == X86AS::PTR32_UPTR) {
45824     MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
45825     if (PtrVT != Ld->getBasePtr().getSimpleValueType()) {
45826       SDValue Cast =
45827           DAG.getAddrSpaceCast(dl, PtrVT, Ld->getBasePtr(), AddrSpace, 0);
45828       return DAG.getLoad(RegVT, dl, Ld->getChain(), Cast, Ld->getPointerInfo(),
45829                          Ld->getOriginalAlign(),
45830                          Ld->getMemOperand()->getFlags());
45831     }
45832   }
45833 
45834   return SDValue();
45835 }
45836 
45837 /// If V is a build vector of boolean constants and exactly one of those
45838 /// constants is true, return the operand index of that true element.
45839 /// Otherwise, return -1.
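/// For example (illustrative): <i1 0, i1 0, i1 1, i1 0> yields 2, while
/// <i1 0, i1 1, i1 1, i1 0> (two true elements) or any mask containing a
/// non-constant element yields -1; undef elements are skipped.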
45840 static int getOneTrueElt(SDValue V) {
45841   // This needs to be a build vector of booleans.
45842   // TODO: Checking for the i1 type matches the IR definition for the mask,
45843   // but the mask check could be loosened to i8 or other types. That might
45844   // also require checking more than 'allOnesValue'; e.g., the x86 HW
45845   // instructions only require that the MSB is set for each mask element.
45846   // The ISD::MSTORE comments/definition do not specify how the mask operand
45847   // is formatted.
45848   auto *BV = dyn_cast<BuildVectorSDNode>(V);
45849   if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
45850     return -1;
45851 
45852   int TrueIndex = -1;
45853   unsigned NumElts = BV->getValueType(0).getVectorNumElements();
45854   for (unsigned i = 0; i < NumElts; ++i) {
45855     const SDValue &Op = BV->getOperand(i);
45856     if (Op.isUndef())
45857       continue;
45858     auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
45859     if (!ConstNode)
45860       return -1;
45861     if (ConstNode->getAPIntValue().countTrailingOnes() >= 1) {
45862       // If we already found a one, this is too many.
45863       if (TrueIndex >= 0)
45864         return -1;
45865       TrueIndex = i;
45866     }
45867   }
45868   return TrueIndex;
45869 }
45870 
45871 /// Given a masked memory load/store operation, return true if it has one mask
45872 /// bit set. If it has one mask bit set, then also return the memory address of
45873 /// the scalar element to load/store, the vector index to insert/extract that
45874 /// scalar element, and the alignment for the scalar memory access.
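/// For example (illustrative): for a v4f32 masked op whose mask has only
/// element 2 set and whose base pointer is 16-byte aligned, this returns
/// Addr = base + 8, Index = 2, Offset = 8, and an alignment of 4 (the common
/// alignment of the original 16 bytes and the 4-byte element size).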
45875 static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
45876                                          SelectionDAG &DAG, SDValue &Addr,
45877                                          SDValue &Index, Align &Alignment,
45878                                          unsigned &Offset) {
45879   int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
45880   if (TrueMaskElt < 0)
45881     return false;
45882 
45883   // Get the address of the one scalar element that is specified by the mask
45884   // using the appropriate offset from the base pointer.
45885   EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
45886   Offset = 0;
45887   Addr = MaskedOp->getBasePtr();
45888   if (TrueMaskElt != 0) {
45889     Offset = TrueMaskElt * EltVT.getStoreSize();
45890     Addr = DAG.getMemBasePlusOffset(Addr, TypeSize::Fixed(Offset),
45891                                     SDLoc(MaskedOp));
45892   }
45893 
45894   Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
45895   Alignment = commonAlignment(MaskedOp->getOriginalAlign(),
45896                               EltVT.getStoreSize());
45897   return true;
45898 }
45899 
45900 /// If exactly one element of the mask is set for a non-extending masked load,
45901 /// reduce it to a scalar load and a vector insert.
45902 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
45903 /// mask have already been optimized in IR, so we don't bother with those here.
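/// For example (illustrative): a v4f32 masked load with mask
/// <i1 0, i1 0, i1 1, i1 0> becomes a scalar f32 load from base+8 followed by
/// an insert_vector_elt of the loaded value into the pass-through at index 2.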
45904 static SDValue
45905 reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
45906                              TargetLowering::DAGCombinerInfo &DCI,
45907                              const X86Subtarget &Subtarget) {
45908   assert(ML->isUnindexed() && "Unexpected indexed masked load!");
45909   // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
45910   // However, some target hooks may need to be added to know when the transform
45911   // is profitable. Endianness would also have to be considered.
45912 
45913   SDValue Addr, VecIndex;
45914   Align Alignment;
45915   unsigned Offset;
45916   if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment, Offset))
45917     return SDValue();
45918 
45919   // Load the one scalar element that is specified by the mask using the
45920   // appropriate offset from the base pointer.
45921   SDLoc DL(ML);
45922   EVT VT = ML->getValueType(0);
45923   EVT EltVT = VT.getVectorElementType();
45924 
45925   EVT CastVT = VT;
45926   if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
45927     EltVT = MVT::f64;
45928     CastVT = VT.changeVectorElementType(EltVT);
45929   }
45930 
45931   SDValue Load =
45932       DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
45933                   ML->getPointerInfo().getWithOffset(Offset),
45934                   Alignment, ML->getMemOperand()->getFlags());
45935 
45936   SDValue PassThru = DAG.getBitcast(CastVT, ML->getPassThru());
45937 
45938   // Insert the loaded element into the appropriate place in the vector.
45939   SDValue Insert =
45940       DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, CastVT, PassThru, Load, VecIndex);
45941   Insert = DAG.getBitcast(VT, Insert);
45942   return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
45943 }
45944 
45945 static SDValue
45946 combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
45947                               TargetLowering::DAGCombinerInfo &DCI) {
45948   assert(ML->isUnindexed() && "Unexpected indexed masked load!");
45949   if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
45950     return SDValue();
45951 
45952   SDLoc DL(ML);
45953   EVT VT = ML->getValueType(0);
45954 
45955   // If we are loading the first and last elements of a vector, it is safe and
45956   // always faster to load the whole vector. Replace the masked load with a
45957   // vector load and select.
45958   unsigned NumElts = VT.getVectorNumElements();
45959   BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
45960   bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
45961   bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
45962   if (LoadFirstElt && LoadLastElt) {
45963     SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
45964                                 ML->getMemOperand());
45965     SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd,
45966                                   ML->getPassThru());
45967     return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
45968   }
45969 
45970   // Convert a masked load with a constant mask into a masked load and a select.
45971   // This allows the select operation to use a faster kind of select instruction
45972   // (for example, vblendvps -> vblendps).
45973 
45974   // Don't try this if the pass-through operand is already undefined. That would
45975   // cause an infinite loop because that's what we're about to create.
45976   if (ML->getPassThru().isUndef())
45977     return SDValue();
45978 
45979   if (ISD::isBuildVectorAllZeros(ML->getPassThru().getNode()))
45980     return SDValue();
45981 
45982   // The new masked load has an undef pass-through operand. The select uses the
45983   // original pass-through operand.
45984   SDValue NewML = DAG.getMaskedLoad(
45985       VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(),
45986       DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(),
45987       ML->getAddressingMode(), ML->getExtensionType());
45988   SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML,
45989                                 ML->getPassThru());
45990 
45991   return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
45992 }
45993 
45994 static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
45995                                  TargetLowering::DAGCombinerInfo &DCI,
45996                                  const X86Subtarget &Subtarget) {
45997   auto *Mld = cast<MaskedLoadSDNode>(N);
45998 
45999   // TODO: Expanding load with constant mask may be optimized as well.
46000   if (Mld->isExpandingLoad())
46001     return SDValue();
46002 
46003   if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
46004     if (SDValue ScalarLoad =
46005             reduceMaskedLoadToScalarLoad(Mld, DAG, DCI, Subtarget))
46006       return ScalarLoad;
46007 
46008     // TODO: Do some AVX512 subsets benefit from this transform?
46009     if (!Subtarget.hasAVX512())
46010       if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
46011         return Blend;
46012   }
46013 
46014   // If the mask value has been legalized to a non-boolean vector, try to
46015   // simplify ops leading up to it. We only demand the MSB of each lane.
46016   SDValue Mask = Mld->getMask();
46017   if (Mask.getScalarValueSizeInBits() != 1) {
46018     EVT VT = Mld->getValueType(0);
46019     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46020     APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
46021     if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
46022       if (N->getOpcode() != ISD::DELETED_NODE)
46023         DCI.AddToWorklist(N);
46024       return SDValue(N, 0);
46025     }
46026     if (SDValue NewMask =
46027             TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
46028       return DAG.getMaskedLoad(
46029           VT, SDLoc(N), Mld->getChain(), Mld->getBasePtr(), Mld->getOffset(),
46030           NewMask, Mld->getPassThru(), Mld->getMemoryVT(), Mld->getMemOperand(),
46031           Mld->getAddressingMode(), Mld->getExtensionType());
46032   }
46033 
46034   return SDValue();
46035 }
46036 
46037 /// If exactly one element of the mask is set for a non-truncating masked store,
46038 /// reduce it to a vector extract and a scalar store.
46039 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
46040 /// mask have already been optimized in IR, so we don't bother with those here.
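/// For example (illustrative): a v4i32 masked store with mask
/// <i1 0, i1 1, i1 0, i1 0> becomes an extract_vector_elt of element 1
/// followed by a plain i32 store to base+4.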
46041 static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
46042                                               SelectionDAG &DAG,
46043                                               const X86Subtarget &Subtarget) {
46044   // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
46045   // However, some target hooks may need to be added to know when the transform
46046   // is profitable. Endianness would also have to be considered.
46047 
46048   SDValue Addr, VecIndex;
46049   Align Alignment;
46050   unsigned Offset;
46051   if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment, Offset))
46052     return SDValue();
46053 
46054   // Extract the one scalar element that is actually being stored.
46055   SDLoc DL(MS);
46056   SDValue Value = MS->getValue();
46057   EVT VT = Value.getValueType();
46058   EVT EltVT = VT.getVectorElementType();
46059   if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
46060     EltVT = MVT::f64;
46061     EVT CastVT = VT.changeVectorElementType(EltVT);
46062     Value = DAG.getBitcast(CastVT, Value);
46063   }
46064   SDValue Extract =
46065       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Value, VecIndex);
46066 
46067   // Store that element at the appropriate offset from the base pointer.
46068   return DAG.getStore(MS->getChain(), DL, Extract, Addr,
46069                       MS->getPointerInfo().getWithOffset(Offset),
46070                       Alignment, MS->getMemOperand()->getFlags());
46071 }
46072 
46073 static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
46074                                   TargetLowering::DAGCombinerInfo &DCI,
46075                                   const X86Subtarget &Subtarget) {
46076   MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
46077   if (Mst->isCompressingStore())
46078     return SDValue();
46079 
46080   EVT VT = Mst->getValue().getValueType();
46081   SDLoc dl(Mst);
46082   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46083 
46084   if (Mst->isTruncatingStore())
46085     return SDValue();
46086 
46087   if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG, Subtarget))
46088     return ScalarStore;
46089 
46090   // If the mask value has been legalized to a non-boolean vector, try to
46091   // simplify ops leading up to it. We only demand the MSB of each lane.
46092   SDValue Mask = Mst->getMask();
46093   if (Mask.getScalarValueSizeInBits() != 1) {
46094     APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
46095     if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
46096       if (N->getOpcode() != ISD::DELETED_NODE)
46097         DCI.AddToWorklist(N);
46098       return SDValue(N, 0);
46099     }
46100     if (SDValue NewMask =
46101             TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
46102       return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Mst->getValue(),
46103                                 Mst->getBasePtr(), Mst->getOffset(), NewMask,
46104                                 Mst->getMemoryVT(), Mst->getMemOperand(),
46105                                 Mst->getAddressingMode());
46106   }
46107 
46108   SDValue Value = Mst->getValue();
46109   if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
46110       TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
46111                             Mst->getMemoryVT())) {
46112     return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
46113                               Mst->getBasePtr(), Mst->getOffset(), Mask,
46114                               Mst->getMemoryVT(), Mst->getMemOperand(),
46115                               Mst->getAddressingMode(), true);
46116   }
46117 
46118   return SDValue();
46119 }
46120 
46121 static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
46122                             TargetLowering::DAGCombinerInfo &DCI,
46123                             const X86Subtarget &Subtarget) {
46124   StoreSDNode *St = cast<StoreSDNode>(N);
46125   EVT StVT = St->getMemoryVT();
46126   SDLoc dl(St);
46127   SDValue StoredVal = St->getValue();
46128   EVT VT = StoredVal.getValueType();
46129   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46130 
46131   // Convert a store of vXi1 into a store of iX and a bitcast.
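  // For example (illustrative): a store of a v8i1 value on a pre-AVX512 target
  // becomes a bitcast to i8 followed by a plain i8 store.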
46132   if (!Subtarget.hasAVX512() && VT == StVT && VT.isVector() &&
46133       VT.getVectorElementType() == MVT::i1) {
46134 
46135     EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
46136     StoredVal = DAG.getBitcast(NewVT, StoredVal);
46137 
46138     return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
46139                         St->getPointerInfo(), St->getOriginalAlign(),
46140                         St->getMemOperand()->getFlags());
46141   }
46142 
46143   // If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
46144   // This will avoid a copy to k-register.
46145   if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
46146       StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
46147       StoredVal.getOperand(0).getValueType() == MVT::i8) {
46148     SDValue Val = StoredVal.getOperand(0);
46149     // We must store zeros to the unused bits.
46150     Val = DAG.getZeroExtendInReg(Val, dl, MVT::i1);
46151     return DAG.getStore(St->getChain(), dl, Val,
46152                         St->getBasePtr(), St->getPointerInfo(),
46153                         St->getOriginalAlign(),
46154                         St->getMemOperand()->getFlags());
46155   }
46156 
46157   // Widen v2i1/v4i1 stores to v8i1.
46158   if ((VT == MVT::v1i1 || VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
46159       Subtarget.hasAVX512()) {
46160     unsigned NumConcats = 8 / VT.getVectorNumElements();
46161     // We must store zeros to the unused bits.
46162     SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, VT));
46163     Ops[0] = StoredVal;
46164     StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
46165     return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
46166                         St->getPointerInfo(), St->getOriginalAlign(),
46167                         St->getMemOperand()->getFlags());
46168   }
46169 
46170   // Turn vXi1 stores of constants into a scalar store.
46171   if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
46172        VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
46173       ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
46174     // If it's a v64i1 store without 64-bit support, we need two stores.
46175     if (!DCI.isBeforeLegalize() && VT == MVT::v64i1 && !Subtarget.is64Bit()) {
46176       SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
46177                                       StoredVal->ops().slice(0, 32));
46178       Lo = combinevXi1ConstantToInteger(Lo, DAG);
46179       SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
46180                                       StoredVal->ops().slice(32, 32));
46181       Hi = combinevXi1ConstantToInteger(Hi, DAG);
46182 
46183       SDValue Ptr0 = St->getBasePtr();
46184       SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, TypeSize::Fixed(4), dl);
46185 
46186       SDValue Ch0 =
46187           DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
46188                        St->getOriginalAlign(),
46189                        St->getMemOperand()->getFlags());
46190       SDValue Ch1 =
46191           DAG.getStore(St->getChain(), dl, Hi, Ptr1,
46192                        St->getPointerInfo().getWithOffset(4),
46193                        St->getOriginalAlign(),
46194                        St->getMemOperand()->getFlags());
46195       return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
46196     }
46197 
46198     StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
46199     return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
46200                         St->getPointerInfo(), St->getOriginalAlign(),
46201                         St->getMemOperand()->getFlags());
46202   }
46203 
46204   // If we are saving a 32-byte vector and 32-byte stores are slow, such as on
46205   // Sandy Bridge, perform two 16-byte stores.
46206   bool Fast;
46207   if (VT.is256BitVector() && StVT == VT &&
46208       TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
46209                              *St->getMemOperand(), &Fast) &&
46210       !Fast) {
46211     unsigned NumElems = VT.getVectorNumElements();
46212     if (NumElems < 2)
46213       return SDValue();
46214 
46215     return splitVectorStore(St, DAG);
46216   }
46217 
46218   // Split under-aligned vector non-temporal stores.
46219   if (St->isNonTemporal() && StVT == VT &&
46220       St->getAlignment() < VT.getStoreSize()) {
46221     // ZMM/YMM nt-stores - either it can be stored as a series of shorter
46222     // vectors or the legalizer can scalarize it to use MOVNTI.
46223     if (VT.is256BitVector() || VT.is512BitVector()) {
46224       unsigned NumElems = VT.getVectorNumElements();
46225       if (NumElems < 2)
46226         return SDValue();
46227       return splitVectorStore(St, DAG);
46228     }
46229 
46230     // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
46231     // to use MOVNTI.
46232     if (VT.is128BitVector() && Subtarget.hasSSE2()) {
46233       MVT NTVT = Subtarget.hasSSE4A()
46234                      ? MVT::v2f64
46235                      : (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32);
46236       return scalarizeVectorStore(St, NTVT, DAG);
46237     }
46238   }
46239 
46240   // Try to optimize v16i16->v16i8 truncating stores when BWI is not
46241   // supported but AVX512F is, by extending to v16i32 and truncating.
46242   if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() &&
46243       St->getValue().getOpcode() == ISD::TRUNCATE &&
46244       St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
46245       TLI.isTruncStoreLegal(MVT::v16i32, MVT::v16i8) &&
46246       St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) {
46247     SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32, St->getValue());
46248     return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
46249                              MVT::v16i8, St->getMemOperand());
46250   }
46251 
46252   // Try to fold a VTRUNCUS or VTRUNCS into a truncating store.
46253   if (!St->isTruncatingStore() && StoredVal.hasOneUse() &&
46254       (StoredVal.getOpcode() == X86ISD::VTRUNCUS ||
46255        StoredVal.getOpcode() == X86ISD::VTRUNCS) &&
46256       TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) {
46257     bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS;
46258     return EmitTruncSStore(IsSigned, St->getChain(),
46259                            dl, StoredVal.getOperand(0), St->getBasePtr(),
46260                            VT, St->getMemOperand(), DAG);
46261   }
46262 
46263   // Try to fold an extract_element(VTRUNC) pattern into a truncating store.
46264   if (!St->isTruncatingStore() && StoredVal.hasOneUse()) {
46265     auto IsExtractedElement = [](SDValue V) {
46266       if (V.getOpcode() == ISD::TRUNCATE && V.getOperand(0).hasOneUse())
46267         V = V.getOperand(0);
46268       unsigned Opc = V.getOpcode();
46269       if (Opc == ISD::EXTRACT_VECTOR_ELT || Opc == X86ISD::PEXTRW) {
46270         if (V.getOperand(0).hasOneUse() && isNullConstant(V.getOperand(1)))
46271           return V.getOperand(0);
46272       }
46273       return SDValue();
46274     };
46275     if (SDValue Extract = IsExtractedElement(StoredVal)) {
46276       SDValue Trunc = peekThroughOneUseBitcasts(Extract);
46277       if (Trunc.getOpcode() == X86ISD::VTRUNC) {
46278         SDValue Src = Trunc.getOperand(0);
46279         MVT DstVT = Trunc.getSimpleValueType();
46280         MVT SrcVT = Src.getSimpleValueType();
46281         unsigned NumSrcElts = SrcVT.getVectorNumElements();
46282         unsigned NumTruncBits = DstVT.getScalarSizeInBits() * NumSrcElts;
46283         MVT TruncVT = MVT::getVectorVT(DstVT.getScalarType(), NumSrcElts);
46284         if (NumTruncBits == VT.getSizeInBits() &&
46285             TLI.isTruncStoreLegal(SrcVT, TruncVT)) {
46286           return DAG.getTruncStore(St->getChain(), dl, Src, St->getBasePtr(),
46287                                    TruncVT, St->getMemOperand());
46288         }
46289       }
46290     }
46291   }
46292 
46293   // Optimize trunc store (of multiple scalars) to shuffle and store.
46294   // First, pack all of the elements in one place. Next, store to memory
46295   // in fewer chunks.
46296   if (St->isTruncatingStore() && VT.isVector()) {
46297     // Check if we can detect an AVG pattern from the truncation. If yes,
46298     // replace the trunc store by a normal store of the result of an
46299     // X86ISD::AVG instruction.
46300     if (DCI.isBeforeLegalize() || TLI.isTypeLegal(St->getMemoryVT()))
46301       if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
46302                                          Subtarget, dl))
46303         return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
46304                             St->getPointerInfo(), St->getOriginalAlign(),
46305                             St->getMemOperand()->getFlags());
46306 
46307     if (TLI.isTruncStoreLegal(VT, StVT)) {
46308       if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT()))
46309         return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
46310                                dl, Val, St->getBasePtr(),
46311                                St->getMemoryVT(), St->getMemOperand(), DAG);
46312       if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(),
46313                                           DAG, dl))
46314         return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
46315                                dl, Val, St->getBasePtr(),
46316                                St->getMemoryVT(), St->getMemOperand(), DAG);
46317     }
46318 
46319     return SDValue();
46320   }
46321 
46322   // Cast ptr32 and ptr64 pointers to the default address space before a store.
46323   unsigned AddrSpace = St->getAddressSpace();
46324   if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
46325       AddrSpace == X86AS::PTR32_UPTR) {
46326     MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
46327     if (PtrVT != St->getBasePtr().getSimpleValueType()) {
46328       SDValue Cast =
46329           DAG.getAddrSpaceCast(dl, PtrVT, St->getBasePtr(), AddrSpace, 0);
46330       return DAG.getStore(St->getChain(), dl, StoredVal, Cast,
46331                           St->getPointerInfo(), St->getOriginalAlign(),
46332                           St->getMemOperand()->getFlags(), St->getAAInfo());
46333     }
46334   }
46335 
46336   // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
46337   // the FP state in cases where an emms may be missing.
46338   // A preferable solution to the general problem is to figure out the right
46339   // places to insert EMMS.  This qualifies as a quick hack.
46340 
46341   // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
46342   if (VT.getSizeInBits() != 64)
46343     return SDValue();
46344 
46345   const Function &F = DAG.getMachineFunction().getFunction();
46346   bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
46347   bool F64IsLegal =
46348       !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
46349   if ((VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit()) &&
46350       isa<LoadSDNode>(St->getValue()) &&
46351       cast<LoadSDNode>(St->getValue())->isSimple() &&
46352       St->getChain().hasOneUse() && St->isSimple()) {
46353     LoadSDNode *Ld = cast<LoadSDNode>(St->getValue().getNode());
46354 
46355     if (!ISD::isNormalLoad(Ld))
46356       return SDValue();
46357 
46358     // Avoid the transformation if there are multiple uses of the loaded value.
46359     if (!Ld->hasNUsesOfValue(1, 0))
46360       return SDValue();
46361 
46362     SDLoc LdDL(Ld);
46363     SDLoc StDL(N);
46364     // Lower to a single movq load/store pair.
46365     SDValue NewLd = DAG.getLoad(MVT::f64, LdDL, Ld->getChain(),
46366                                 Ld->getBasePtr(), Ld->getMemOperand());
46367 
46368     // Make sure new load is placed in same chain order.
46369     DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
46370     return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
46371                         St->getMemOperand());
46372   }
46373 
46374   // This is similar to the above case, but here we handle a scalar 64-bit
46375   // integer store that is extracted from a vector on a 32-bit target.
46376   // If we have SSE2, then we can treat it like a floating-point double
46377   // to get past legalization. The execution dependencies fixup pass will
46378   // choose the optimal machine instruction for the store if this really is
46379   // an integer or v2f32 rather than an f64.
46380   if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
46381       St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
46382     SDValue OldExtract = St->getOperand(1);
46383     SDValue ExtOp0 = OldExtract.getOperand(0);
46384     unsigned VecSize = ExtOp0.getValueSizeInBits();
46385     EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
46386     SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
46387     SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
46388                                      BitCast, OldExtract.getOperand(1));
46389     return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
46390                         St->getPointerInfo(), St->getOriginalAlign(),
46391                         St->getMemOperand()->getFlags());
46392   }
46393 
46394   return SDValue();
46395 }
46396 
46397 static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
46398                                      TargetLowering::DAGCombinerInfo &DCI,
46399                                      const X86Subtarget &Subtarget) {
46400   auto *St = cast<MemIntrinsicSDNode>(N);
46401 
46402   SDValue StoredVal = N->getOperand(1);
46403   MVT VT = StoredVal.getSimpleValueType();
46404   EVT MemVT = St->getMemoryVT();
46405 
46406   // Figure out which elements we demand.
46407   unsigned StElts = MemVT.getSizeInBits() / VT.getScalarSizeInBits();
46408   APInt DemandedElts = APInt::getLowBitsSet(VT.getVectorNumElements(), StElts);
46409 
46410   APInt KnownUndef, KnownZero;
46411   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46412   if (TLI.SimplifyDemandedVectorElts(StoredVal, DemandedElts, KnownUndef,
46413                                      KnownZero, DCI)) {
46414     if (N->getOpcode() != ISD::DELETED_NODE)
46415       DCI.AddToWorklist(N);
46416     return SDValue(N, 0);
46417   }
46418 
46419   return SDValue();
46420 }
46421 
46422 /// Return 'true' if this vector operation is "horizontal"
46423 /// and return the operands for the horizontal operation in LHS and RHS.  A
46424 /// horizontal operation performs the binary operation on successive elements
46425 /// of its first operand, then on successive elements of its second operand,
46426 /// returning the resulting values in a vector.  For example, if
46427 ///   A = < float a0, float a1, float a2, float a3 >
46428 /// and
46429 ///   B = < float b0, float b1, float b2, float b3 >
46430 /// then the result of doing a horizontal operation on A and B is
46431 ///   A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
46432 /// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
46433 /// A horizontal-op B, for some already available A and B, and if so then LHS is
46434 /// set to A, RHS to B, and the routine returns 'true'.
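/// PostShuffleMask records how the lanes of the resulting horizontal op must
/// be permuted afterwards when the input shuffles do not pick elements in the
/// canonical <0,2,4,6>/<1,3,5,7> order (a hedged summary; see the mask
/// computation below for the exact rule).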
46435 static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS,
46436                               SelectionDAG &DAG, const X86Subtarget &Subtarget,
46437                               bool IsCommutative,
46438                               SmallVectorImpl<int> &PostShuffleMask) {
46439   // If either operand is undef, bail out. The binop should be simplified.
46440   if (LHS.isUndef() || RHS.isUndef())
46441     return false;
46442 
46443   // Look for the following pattern:
46444   //   A = < float a0, float a1, float a2, float a3 >
46445   //   B = < float b0, float b1, float b2, float b3 >
46446   // and
46447   //   LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
46448   //   RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
46449   // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
46450   // which is A horizontal-op B.
46451 
46452   MVT VT = LHS.getSimpleValueType();
46453   assert((VT.is128BitVector() || VT.is256BitVector()) &&
46454          "Unsupported vector type for horizontal add/sub");
46455   unsigned NumElts = VT.getVectorNumElements();
46456 
46457   auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1,
46458                         SmallVectorImpl<int> &ShuffleMask) {
46459     bool UseSubVector = false;
46460     if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
46461         Op.getOperand(0).getValueType().is256BitVector() &&
46462         llvm::isNullConstant(Op.getOperand(1))) {
46463       Op = Op.getOperand(0);
46464       UseSubVector = true;
46465     }
46466     SmallVector<SDValue, 2> SrcOps;
46467     SmallVector<int, 16> SrcMask, ScaledMask;
46468     SDValue BC = peekThroughBitcasts(Op);
46469     if (getTargetShuffleInputs(BC, SrcOps, SrcMask, DAG) &&
46470         !isAnyZero(SrcMask) && all_of(SrcOps, [BC](SDValue Op) {
46471           return Op.getValueSizeInBits() == BC.getValueSizeInBits();
46472         })) {
46473       resolveTargetShuffleInputsAndMask(SrcOps, SrcMask);
46474       if (!UseSubVector && SrcOps.size() <= 2 &&
46475           scaleShuffleElements(SrcMask, NumElts, ScaledMask)) {
46476         N0 = SrcOps.size() > 0 ? SrcOps[0] : SDValue();
46477         N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue();
46478         ShuffleMask.assign(ScaledMask.begin(), ScaledMask.end());
46479       }
46480       if (UseSubVector && SrcOps.size() == 1 &&
46481           scaleShuffleElements(SrcMask, 2 * NumElts, ScaledMask)) {
46482         std::tie(N0, N1) = DAG.SplitVector(SrcOps[0], SDLoc(Op));
46483         ArrayRef<int> Mask = ArrayRef<int>(ScaledMask).slice(0, NumElts);
46484         ShuffleMask.assign(Mask.begin(), Mask.end());
46485       }
46486     }
46487   };
46488 
46489   // View LHS in the form
46490   //   LHS = VECTOR_SHUFFLE A, B, LMask
46491   // If LHS is not a shuffle, then pretend it is the identity shuffle:
46492   //   LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
46493   // NOTE: A default initialized SDValue represents an UNDEF of type VT.
46494   SDValue A, B;
46495   SmallVector<int, 16> LMask;
46496   GetShuffle(LHS, A, B, LMask);
46497 
46498   // Likewise, view RHS in the form
46499   //   RHS = VECTOR_SHUFFLE C, D, RMask
46500   SDValue C, D;
46501   SmallVector<int, 16> RMask;
46502   GetShuffle(RHS, C, D, RMask);
46503 
46504   // At least one of the operands should be a vector shuffle.
46505   unsigned NumShuffles = (LMask.empty() ? 0 : 1) + (RMask.empty() ? 0 : 1);
46506   if (NumShuffles == 0)
46507     return false;
46508 
46509   if (LMask.empty()) {
46510     A = LHS;
46511     for (unsigned i = 0; i != NumElts; ++i)
46512       LMask.push_back(i);
46513   }
46514 
46515   if (RMask.empty()) {
46516     C = RHS;
46517     for (unsigned i = 0; i != NumElts; ++i)
46518       RMask.push_back(i);
46519   }
46520 
46521   // If we have a unary mask, ensure the other op is set to null.
46522   if (isUndefOrInRange(LMask, 0, NumElts))
46523     B = SDValue();
46524   else if (isUndefOrInRange(LMask, NumElts, NumElts * 2))
46525     A = SDValue();
46526 
46527   if (isUndefOrInRange(RMask, 0, NumElts))
46528     D = SDValue();
46529   else if (isUndefOrInRange(RMask, NumElts, NumElts * 2))
46530     C = SDValue();
46531 
46532   // If A and B occur in reverse order in RHS, then canonicalize by commuting
46533   // RHS operands and shuffle mask.
46534   if (A != C) {
46535     std::swap(C, D);
46536     ShuffleVectorSDNode::commuteMask(RMask);
46537   }
46538   // Check that the shuffles are both shuffling the same vectors.
46539   if (!(A == C && B == D))
46540     return false;
46541 
46542   PostShuffleMask.clear();
46543   PostShuffleMask.append(NumElts, SM_SentinelUndef);
46544 
46545   // LHS and RHS are now:
46546   //   LHS = shuffle A, B, LMask
46547   //   RHS = shuffle A, B, RMask
46548   // Check that the masks correspond to performing a horizontal operation.
46549   // AVX defines horizontal add/sub to operate independently on 128-bit lanes,
46550   // so we just repeat the inner loop if this is a 256-bit op.
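  // (Illustrative: for v8f32, FHADD(A, B) = {A0+A1, A2+A3, B0+B1, B2+B3,
  //                                          A4+A5, A6+A7, B4+B5, B6+B7}.)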
46551   unsigned Num128BitChunks = VT.getSizeInBits() / 128;
46552   unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
46553   unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
46554   assert((NumEltsPer128BitChunk % 2 == 0) &&
46555          "Vector type should have an even number of elements in each lane");
46556   for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
46557     for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) {
46558       // Ignore undefined components.
46559       int LIdx = LMask[i + j], RIdx = RMask[i + j];
46560       if (LIdx < 0 || RIdx < 0 ||
46561           (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
46562           (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
46563         continue;
46564 
46565       // Check that successive odd/even elements are being operated on. If not,
46566       // this is not a horizontal operation.
46567       if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) &&
46568           !((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative))
46569         return false;
46570 
46571       // Compute the post-shuffle mask index based on where the element
46572       // is stored in the HOP result, and where it needs to be moved to.
46573       int Base = LIdx & ~1u;
46574       int Index = ((Base % NumEltsPer128BitChunk) / 2) +
46575                   ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1));
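      // (Worked example for v4f32: LIdx = 2 selects the A2+A3 pair, so
      //  Base = 2 and Index = 1, which is where A2+A3 lands in
      //  HADD(A, B) = {A0+A1, A2+A3, B0+B1, B2+B3}.)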
46576 
46577       // The low half of the 128-bit result must choose from A.
46578       // The high half of the 128-bit result must choose from B,
46579       // unless B is undef. In that case, we are always choosing from A.
46580       if ((B && Base >= (int)NumElts) || (!B && i >= NumEltsPer64BitChunk))
46581         Index += NumEltsPer64BitChunk;
46582       PostShuffleMask[i + j] = Index;
46583     }
46584   }
46585 
46586   SDValue NewLHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
46587   SDValue NewRHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
46588 
46589   bool IsIdentityPostShuffle =
46590       isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0);
46591   if (IsIdentityPostShuffle)
46592     PostShuffleMask.clear();
46593 
46594   // Avoid 128-bit multi-lane shuffles if pre-AVX2 and FP (integer will split).
46595   if (!IsIdentityPostShuffle && !Subtarget.hasAVX2() && VT.isFloatingPoint() &&
46596       isMultiLaneShuffleMask(128, VT.getScalarSizeInBits(), PostShuffleMask))
46597     return false;
46598 
46599   // If the source nodes are already used in HorizOps then always accept this.
46600   // Shuffle folding should merge these back together.
46601   bool FoundHorizLHS = llvm::any_of(NewLHS->uses(), [&](SDNode *User) {
46602     return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
46603   });
46604   bool FoundHorizRHS = llvm::any_of(NewRHS->uses(), [&](SDNode *User) {
46605     return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
46606   });
46607   bool ForceHorizOp = FoundHorizLHS && FoundHorizRHS;
46608 
46609   // Assume a SingleSource HOP if we only shuffle one input and don't need to
46610   // shuffle the result.
46611   if (!ForceHorizOp &&
46612       !shouldUseHorizontalOp(NewLHS == NewRHS &&
46613                                  (NumShuffles < 2 || !IsIdentityPostShuffle),
46614                              DAG, Subtarget))
46615     return false;
46616 
46617   LHS = DAG.getBitcast(VT, NewLHS);
46618   RHS = DAG.getBitcast(VT, NewRHS);
46619   return true;
46620 }
46621 
46622 // Try to synthesize horizontal (f)hadd/hsub from (f)adds/subs of shuffles.
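// For example (illustrative, element indices for v4f32):
//   (fadd (shuffle A, B, <0,2,4,6>), (shuffle A, B, <1,3,5,7>))
//     --> (fhadd A, B)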
46623 static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
46624                                          const X86Subtarget &Subtarget) {
46625   EVT VT = N->getValueType(0);
46626   unsigned Opcode = N->getOpcode();
46627   bool IsAdd = (Opcode == ISD::FADD) || (Opcode == ISD::ADD);
46628   SmallVector<int, 8> PostShuffleMask;
46629 
46630   switch (Opcode) {
46631   case ISD::FADD:
46632   case ISD::FSUB:
46633     if ((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
46634         (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
46635       SDValue LHS = N->getOperand(0);
46636       SDValue RHS = N->getOperand(1);
46637       auto HorizOpcode = IsAdd ? X86ISD::FHADD : X86ISD::FHSUB;
46638       if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
46639                             PostShuffleMask)) {
46640         SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
46641         if (!PostShuffleMask.empty())
46642           HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
46643                                             DAG.getUNDEF(VT), PostShuffleMask);
46644         return HorizBinOp;
46645       }
46646     }
46647     break;
46648   case ISD::ADD:
46649   case ISD::SUB:
46650     if (Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
46651                                  VT == MVT::v16i16 || VT == MVT::v8i32)) {
46652       SDValue LHS = N->getOperand(0);
46653       SDValue RHS = N->getOperand(1);
46654       auto HorizOpcode = IsAdd ? X86ISD::HADD : X86ISD::HSUB;
46655       if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
46656                             PostShuffleMask)) {
46657         auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
46658                                         ArrayRef<SDValue> Ops) {
46659           return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops);
46660         };
46661         SDValue HorizBinOp = SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
46662                                               {LHS, RHS}, HOpBuilder);
46663         if (!PostShuffleMask.empty())
46664           HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
46665                                             DAG.getUNDEF(VT), PostShuffleMask);
46666         return HorizBinOp;
46667       }
46668     }
46669     break;
46670   }
46671 
46672   return SDValue();
46673 }
46674 
46675 /// Do target-specific dag combines on floating-point adds/subs.
46676 static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
46677                                const X86Subtarget &Subtarget) {
46678   if (SDValue HOp = combineToHorizontalAddSub(N, DAG, Subtarget))
46679     return HOp;
46680   return SDValue();
46681 }
46682 
46683 /// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
46684 /// the codegen.
46685 /// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
46686 /// TODO: This overlaps with the generic combiner's visitTRUNCATE. Remove
46687 ///       anything that is guaranteed to be transformed by DAGCombiner.
46688 static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
46689                                           const X86Subtarget &Subtarget,
46690                                           const SDLoc &DL) {
46691   assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
46692   SDValue Src = N->getOperand(0);
46693   unsigned SrcOpcode = Src.getOpcode();
46694   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46695 
46696   EVT VT = N->getValueType(0);
46697   EVT SrcVT = Src.getValueType();
46698 
46699   auto IsFreeTruncation = [VT](SDValue Op) {
46700     unsigned TruncSizeInBits = VT.getScalarSizeInBits();
46701 
46702     // See if this has been extended from a smaller/equal size to
46703     // the truncation size, allowing a truncation to combine with the extend.
46704     unsigned Opcode = Op.getOpcode();
46705     if ((Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND ||
46706          Opcode == ISD::ZERO_EXTEND) &&
46707         Op.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
46708       return true;
46709 
46710     // See if this is a single use constant which can be constant folded.
46711     // NOTE: We don't peek through bitcasts here because there is currently
46712     // no support for constant folding truncate+bitcast+vector_of_constants. So
46713     // we'll just end up with a truncate on both operands which will
46714     // get turned back into (truncate (binop)) causing an infinite loop.
46715     return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
46716   };
46717 
46718   auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
46719     SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
46720     SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
46721     return DAG.getNode(SrcOpcode, DL, VT, Trunc0, Trunc1);
46722   };
46723 
46724   // Don't combine if the operation has other uses.
46725   if (!Src.hasOneUse())
46726     return SDValue();
46727 
46728   // Only support vector truncation for now.
46729   // TODO: i64 scalar math would benefit as well.
46730   if (!VT.isVector())
46731     return SDValue();
46732 
46733   // In most cases it's only worth pre-truncating if we're only facing the cost
46734   // of one truncation.
46735   // i.e. if one of the inputs will constant fold or the input is repeated.
46736   switch (SrcOpcode) {
46737   case ISD::MUL:
46738     // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
46739     // better to truncate if we have the chance.
46740     if (SrcVT.getScalarType() == MVT::i64 &&
46741         TLI.isOperationLegal(SrcOpcode, VT) &&
46742         !TLI.isOperationLegal(SrcOpcode, SrcVT))
46743       return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
46744     LLVM_FALLTHROUGH;
46745   case ISD::AND:
46746   case ISD::XOR:
46747   case ISD::OR:
46748   case ISD::ADD:
46749   case ISD::SUB: {
46750     SDValue Op0 = Src.getOperand(0);
46751     SDValue Op1 = Src.getOperand(1);
46752     if (TLI.isOperationLegal(SrcOpcode, VT) &&
46753         (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
46754       return TruncateArithmetic(Op0, Op1);
46755     break;
46756   }
46757   }
46758 
46759   return SDValue();
46760 }
46761 
46762 /// Truncate using ISD::AND mask and X86ISD::PACKUS.
46763 /// e.g. trunc <8 x i32> X to <8 x i16> -->
46764 /// MaskX = X & 0xffff (clear high bits to prevent saturation)
46765 /// packus (extract_subv MaskX, 0), (extract_subv MaskX, 1)
46766 static SDValue combineVectorTruncationWithPACKUS(SDNode *N, const SDLoc &DL,
46767                                                  const X86Subtarget &Subtarget,
46768                                                  SelectionDAG &DAG) {
46769   SDValue In = N->getOperand(0);
46770   EVT InVT = In.getValueType();
46771   EVT OutVT = N->getValueType(0);
46772 
46773   APInt Mask = APInt::getLowBitsSet(InVT.getScalarSizeInBits(),
46774                                     OutVT.getScalarSizeInBits());
46775   In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(Mask, DL, InVT));
46776   return truncateVectorWithPACK(X86ISD::PACKUS, OutVT, In, DL, DAG, Subtarget);
46777 }
46778 
46779 /// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
46780 static SDValue combineVectorTruncationWithPACKSS(SDNode *N, const SDLoc &DL,
46781                                                  const X86Subtarget &Subtarget,
46782                                                  SelectionDAG &DAG) {
46783   SDValue In = N->getOperand(0);
46784   EVT InVT = In.getValueType();
46785   EVT OutVT = N->getValueType(0);
46786   In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, InVT, In,
46787                    DAG.getValueType(OutVT));
46788   return truncateVectorWithPACK(X86ISD::PACKSS, OutVT, In, DL, DAG, Subtarget);
46789 }
46790 
46791 /// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
46792 /// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
46793 /// legalization the truncation will be translated into a BUILD_VECTOR with each
46794 /// element that is extracted from a vector and then truncated, and it is
46795 /// difficult to perform this optimization on that form.
46796 static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
46797                                        const X86Subtarget &Subtarget) {
46798   EVT OutVT = N->getValueType(0);
46799   if (!OutVT.isVector())
46800     return SDValue();
46801 
46802   SDValue In = N->getOperand(0);
46803   if (!In.getValueType().isSimple())
46804     return SDValue();
46805 
46806   EVT InVT = In.getValueType();
46807   unsigned NumElems = OutVT.getVectorNumElements();
46808 
46809   // AVX512 provides fast truncate ops.
46810   if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
46811     return SDValue();
46812 
46813   EVT OutSVT = OutVT.getVectorElementType();
46814   EVT InSVT = InVT.getVectorElementType();
46815   if (!((InSVT == MVT::i16 || InSVT == MVT::i32 || InSVT == MVT::i64) &&
46816         (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
46817         NumElems >= 8))
46818     return SDValue();
46819 
46820   // SSSE3's pshufb results in fewer instructions in the cases below.
46821   if (Subtarget.hasSSSE3() && NumElems == 8 && InSVT != MVT::i64)
46822     return SDValue();
46823 
46824   SDLoc DL(N);
46825   // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
46826   // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
46827   // truncate 2 x v4i32 to v8i16.
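  // (Illustrative sketch: with SSE2 only, v8i32 -> v8i16 is roughly a
  //  SIGN_EXTEND_INREG followed by a PACKSSDW of the two 128-bit halves.)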
46828   if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
46829     return combineVectorTruncationWithPACKUS(N, DL, Subtarget, DAG);
46830   if (InSVT == MVT::i32)
46831     return combineVectorTruncationWithPACKSS(N, DL, Subtarget, DAG);
46832 
46833   return SDValue();
46834 }
46835 
46836 /// This function transforms vector truncation of 'extended sign-bits' or
46837 /// 'extended zero-bits' values (vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32) into
46838 /// X86ISD::PACKSS/PACKUS operations.
46839 static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
46840                                                SelectionDAG &DAG,
46841                                                const X86Subtarget &Subtarget) {
46842   // Requires SSE2.
46843   if (!Subtarget.hasSSE2())
46844     return SDValue();
46845 
46846   if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
46847     return SDValue();
46848 
46849   SDValue In = N->getOperand(0);
46850   if (!In.getValueType().isSimple())
46851     return SDValue();
46852 
46853   MVT VT = N->getValueType(0).getSimpleVT();
46854   MVT SVT = VT.getScalarType();
46855 
46856   MVT InVT = In.getValueType().getSimpleVT();
46857   MVT InSVT = InVT.getScalarType();
46858 
46859   // Check we have a truncation suited for PACKSS/PACKUS.
46860   if (!isPowerOf2_32(VT.getVectorNumElements()))
46861     return SDValue();
46862   if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
46863     return SDValue();
46864   if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
46865     return SDValue();
46866 
46867   // Truncation to sub-128bit vXi32 can be better handled with shuffles.
46868   if (SVT == MVT::i32 && VT.getSizeInBits() < 128)
46869     return SDValue();
46870 
46871   // AVX512 has fast truncate, but if the input is already going to be split,
46872   // there's no harm in trying pack.
46873   if (Subtarget.hasAVX512() &&
46874       !(!Subtarget.useAVX512Regs() && VT.is256BitVector() &&
46875         InVT.is512BitVector())) {
46876     // PACK should still be worth it for 128-bit vectors if the sources were
46877     // originally concatenated from subvectors.
46878     SmallVector<SDValue> ConcatOps;
46879     if (VT.getSizeInBits() > 128 || !collectConcatOps(In.getNode(), ConcatOps))
46880       return SDValue();
46881   }
46882 
46883   unsigned NumPackedSignBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
46884   unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
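  // (Note: without SSE4.1 the only unsigned pack is PACKUSWB, so the PACKUS
  //  path below requires the value to fit in 8 bits; SSE4.1's PACKUSDW
  //  relaxes this to the full packed width.)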
46885 
46886   // Use PACKUS if the input has zero-bits that extend all the way to the
46887   // packed/truncated value. e.g. masks, zext_in_reg, etc.
46888   KnownBits Known = DAG.computeKnownBits(In);
46889   unsigned NumLeadingZeroBits = Known.countMinLeadingZeros();
46890   if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedZeroBits))
46891     return truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget);
46892 
46893   // Use PACKSS if the input has sign-bits that extend all the way to the
46894   // packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
46895   unsigned NumSignBits = DAG.ComputeNumSignBits(In);
46896 
46897   // Don't use PACKSS for vXi64 -> vXi32 truncations unless we're dealing with
46898   // a sign splat. ComputeNumSignBits struggles to see through BITCASTs later
46899   // on and combines/simplifications can't then use it.
46900   if (SVT == MVT::i32 && NumSignBits != InSVT.getSizeInBits())
46901     return SDValue();
46902 
46903   unsigned MinSignBits = InSVT.getSizeInBits() - NumPackedSignBits;
46904   if (NumSignBits > MinSignBits)
46905     return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);
46906 
46907   // If we have a srl that only generates signbits that we will discard in
46908   // the truncation then we can use PACKSS by converting the srl to a sra.
46909   // SimplifyDemandedBits often relaxes sra to srl so we need to reverse it.
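  // (Illustrative, pre-SSE4.1: for a v4i32 -> v4i16 truncate, MinSignBits is
  //  16, so (trunc (srl X, 16)) is rewritten to (sra X, 16) and then packed
  //  with PACKSS.)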
46910   if (In.getOpcode() == ISD::SRL && N->isOnlyUserOf(In.getNode()))
46911     if (const APInt *ShAmt = DAG.getValidShiftAmountConstant(
46912             In, APInt::getAllOnesValue(VT.getVectorNumElements()))) {
46913       if (*ShAmt == MinSignBits) {
46914         SDValue NewIn = DAG.getNode(ISD::SRA, DL, InVT, In->ops());
46915         return truncateVectorWithPACK(X86ISD::PACKSS, VT, NewIn, DL, DAG,
46916                                       Subtarget);
46917       }
46918     }
46919 
46920   return SDValue();
46921 }
46922 
46923 // Try to form a MULHU or MULHS node by looking for
46924 // (trunc (srl (mul ext, ext), 16))
46925 // TODO: This is X86 specific because we want to be able to handle wide types
46926 // before type legalization. But we can only do it if the vector will be
46927 // legalized via widening/splitting. Type legalization can't handle promotion
46928 // of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
46929 // combiner.
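// For example (illustrative):
//   (v8i16 (trunc (srl (mul (zext X), (zext Y)), 16))) --> (v8i16 (mulhu X, Y))
// where X and Y are v8i16 values zero-extended to v8i32.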
46930 static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
46931                             SelectionDAG &DAG, const X86Subtarget &Subtarget) {
46932   // First instruction should be a right shift of a multiply.
46933   if (Src.getOpcode() != ISD::SRL ||
46934       Src.getOperand(0).getOpcode() != ISD::MUL)
46935     return SDValue();
46936 
46937   if (!Subtarget.hasSSE2())
46938     return SDValue();
46939 
46940   // Only handle vXi16 types that are at least 128 bits wide unless they will
46941   // be widened.
46942   if (!VT.isVector() || VT.getVectorElementType() != MVT::i16)
46943     return SDValue();
46944 
46945   // Input type should be at least vXi32.
46946   EVT InVT = Src.getValueType();
46947   if (InVT.getVectorElementType().getSizeInBits() < 32)
46948     return SDValue();
46949 
46950   // Need a shift by 16.
46951   APInt ShiftAmt;
46952   if (!ISD::isConstantSplatVector(Src.getOperand(1).getNode(), ShiftAmt) ||
46953       ShiftAmt != 16)
46954     return SDValue();
46955 
46956   SDValue LHS = Src.getOperand(0).getOperand(0);
46957   SDValue RHS = Src.getOperand(0).getOperand(1);
46958 
46959   unsigned ExtOpc = LHS.getOpcode();
46960   if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
46961       RHS.getOpcode() != ExtOpc)
46962     return SDValue();
46963 
46964   // Peek through the extends.
46965   LHS = LHS.getOperand(0);
46966   RHS = RHS.getOperand(0);
46967 
46968   // Ensure the input types match.
46969   if (LHS.getValueType() != VT || RHS.getValueType() != VT)
46970     return SDValue();
46971 
46972   unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
46973   return DAG.getNode(Opc, DL, VT, LHS, RHS);
46974 }
46975 
46976 // Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes
46977 // from one vector with signed bytes from another vector, adds together
46978 // adjacent pairs of 16-bit products, and saturates the result before
46979 // truncating to 16-bits.
46980 //
46981 // Which looks something like this:
46982 // (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))),
46983 //                 (mul (zext (odd elts (i8 A)), (sext (odd elts (i8 B))))))))
46984 static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
46985                                const X86Subtarget &Subtarget,
46986                                const SDLoc &DL) {
46987   if (!VT.isVector() || !Subtarget.hasSSSE3())
46988     return SDValue();
46989 
46990   unsigned NumElems = VT.getVectorNumElements();
46991   EVT ScalarVT = VT.getVectorElementType();
46992   if (ScalarVT != MVT::i16 || NumElems < 8 || !isPowerOf2_32(NumElems))
46993     return SDValue();
46994 
46995   SDValue SSatVal = detectSSatPattern(In, VT);
46996   if (!SSatVal || SSatVal.getOpcode() != ISD::ADD)
46997     return SDValue();
46998 
46999   // Ok this is a signed saturation of an ADD. See if this ADD is adding pairs
47000   // of multiplies from even/odd elements.
47001   SDValue N0 = SSatVal.getOperand(0);
47002   SDValue N1 = SSatVal.getOperand(1);
47003 
47004   if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
47005     return SDValue();
47006 
47007   SDValue N00 = N0.getOperand(0);
47008   SDValue N01 = N0.getOperand(1);
47009   SDValue N10 = N1.getOperand(0);
47010   SDValue N11 = N1.getOperand(1);
47011 
47012   // TODO: Handle constant vectors and use knownbits/computenumsignbits?
47013   // Canonicalize zero_extend to LHS.
47014   if (N01.getOpcode() == ISD::ZERO_EXTEND)
47015     std::swap(N00, N01);
47016   if (N11.getOpcode() == ISD::ZERO_EXTEND)
47017     std::swap(N10, N11);
47018 
47019   // Ensure we have a zero_extend and a sign_extend.
47020   if (N00.getOpcode() != ISD::ZERO_EXTEND ||
47021       N01.getOpcode() != ISD::SIGN_EXTEND ||
47022       N10.getOpcode() != ISD::ZERO_EXTEND ||
47023       N11.getOpcode() != ISD::SIGN_EXTEND)
47024     return SDValue();
47025 
47026   // Peek through the extends.
47027   N00 = N00.getOperand(0);
47028   N01 = N01.getOperand(0);
47029   N10 = N10.getOperand(0);
47030   N11 = N11.getOperand(0);
47031 
47032   // Ensure the extend is from vXi8.
47033   if (N00.getValueType().getVectorElementType() != MVT::i8 ||
47034       N01.getValueType().getVectorElementType() != MVT::i8 ||
47035       N10.getValueType().getVectorElementType() != MVT::i8 ||
47036       N11.getValueType().getVectorElementType() != MVT::i8)
47037     return SDValue();
47038 
47039   // All inputs should be build_vectors.
47040   if (N00.getOpcode() != ISD::BUILD_VECTOR ||
47041       N01.getOpcode() != ISD::BUILD_VECTOR ||
47042       N10.getOpcode() != ISD::BUILD_VECTOR ||
47043       N11.getOpcode() != ISD::BUILD_VECTOR)
47044     return SDValue();
47045 
47046   // N00/N10 are zero extended. N01/N11 are sign extended.
47047 
47048   // For each element, we need to ensure we have an odd element from one vector
47049   // multiplied by the odd element of another vector and the even element from
47050   // one of the same vectors being multiplied by the even element from the
47051   // other vector. So we need to make sure for each element i, this operation
47052   // is being performed:
47053   //  A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
47054   SDValue ZExtIn, SExtIn;
47055   for (unsigned i = 0; i != NumElems; ++i) {
47056     SDValue N00Elt = N00.getOperand(i);
47057     SDValue N01Elt = N01.getOperand(i);
47058     SDValue N10Elt = N10.getOperand(i);
47059     SDValue N11Elt = N11.getOperand(i);
47060     // TODO: Be more tolerant to undefs.
47061     if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
47062         N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
47063         N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
47064         N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
47065       return SDValue();
47066     auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
47067     auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
47068     auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
47069     auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
47070     if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
47071       return SDValue();
47072     unsigned IdxN00 = ConstN00Elt->getZExtValue();
47073     unsigned IdxN01 = ConstN01Elt->getZExtValue();
47074     unsigned IdxN10 = ConstN10Elt->getZExtValue();
47075     unsigned IdxN11 = ConstN11Elt->getZExtValue();
47076     // Add is commutative so indices can be reordered.
47077     if (IdxN00 > IdxN10) {
47078       std::swap(IdxN00, IdxN10);
47079       std::swap(IdxN01, IdxN11);
47080     }
47081     // N0 indices must be the even element. N1 indices must be the next odd element.
47082     if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
47083         IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
47084       return SDValue();
47085     SDValue N00In = N00Elt.getOperand(0);
47086     SDValue N01In = N01Elt.getOperand(0);
47087     SDValue N10In = N10Elt.getOperand(0);
47088     SDValue N11In = N11Elt.getOperand(0);
47089     // The first time we find an input, capture it.
47090     if (!ZExtIn) {
47091       ZExtIn = N00In;
47092       SExtIn = N01In;
47093     }
47094     if (ZExtIn != N00In || SExtIn != N01In ||
47095         ZExtIn != N10In || SExtIn != N11In)
47096       return SDValue();
47097   }
47098 
47099   auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
47100                          ArrayRef<SDValue> Ops) {
47101     // Shrink by adding truncate nodes and let DAGCombine fold with the
47102     // sources.
47103     EVT InVT = Ops[0].getValueType();
47104     assert(InVT.getScalarType() == MVT::i8 &&
47105            "Unexpected scalar element type");
47106     assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
47107     EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
47108                                  InVT.getVectorNumElements() / 2);
47109     return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]);
47110   };
47111   return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn },
47112                           PMADDBuilder);
47113 }
47114 
47115 static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
47116                                const X86Subtarget &Subtarget) {
47117   EVT VT = N->getValueType(0);
47118   SDValue Src = N->getOperand(0);
47119   SDLoc DL(N);
47120 
47121   // Attempt to pre-truncate inputs to arithmetic ops instead.
47122   if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
47123     return V;
47124 
47125   // Try to detect AVG pattern first.
47126   if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
47127     return Avg;
47128 
47129   // Try to detect PMADDUBSW.
47130   if (SDValue PMAdd = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL))
47131     return PMAdd;
47132 
47133   // Try to combine truncation with signed/unsigned saturation.
47134   if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
47135     return Val;
47136 
47137   // Try to combine PMULHUW/PMULHW for vXi16.
47138   if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
47139     return V;
47140 
47141   // The bitcast source is a direct mmx result.
47142   // Detect a truncation to i32 of a bitcast from x86mmx.
47143   if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
47144     SDValue BCSrc = Src.getOperand(0);
47145     if (BCSrc.getValueType() == MVT::x86mmx)
47146       return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
47147   }
47148 
47149   // Try to truncate extended sign/zero bits with PACKSS/PACKUS.
47150   if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
47151     return V;
47152 
47153   return combineVectorTruncation(N, DAG, Subtarget);
47154 }
47155 
47156 static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG,
47157                              TargetLowering::DAGCombinerInfo &DCI) {
47158   EVT VT = N->getValueType(0);
47159   SDValue In = N->getOperand(0);
47160   SDLoc DL(N);
47161 
47162   if (auto SSatVal = detectSSatPattern(In, VT))
47163     return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
47164   if (auto USatVal = detectUSatPattern(In, VT, DAG, DL))
47165     return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
47166 
47167   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47168   APInt DemandedMask(APInt::getAllOnesValue(VT.getScalarSizeInBits()));
47169   if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
47170     return SDValue(N, 0);
47171 
47172   return SDValue();
47173 }
47174 
47175 /// Returns the negated value if the node \p N flips sign of FP value.
47176 ///
47177 /// FP-negation node may have different forms: FNEG(x), FXOR (x, 0x80000000)
47178 /// or FSUB(0, x)
47179 /// AVX512F does not have FXOR, so FNEG is lowered as
47180 /// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
47181 /// In this case we go through all bitcasts.
47182 /// This also recognizes splat of a negated value and returns the splat of that
47183 /// value.
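/// For example (illustrative), this matches (fneg X), (fsub -0.0, X) and
/// (bitcast (xor (bitcast X), splat 0x80000000)) for f32 elements.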
47184 static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
47185   if (N->getOpcode() == ISD::FNEG)
47186     return N->getOperand(0);
47187 
47188   // Don't recurse exponentially.
47189   if (Depth > SelectionDAG::MaxRecursionDepth)
47190     return SDValue();
47191 
47192   unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits();
47193 
47194   SDValue Op = peekThroughBitcasts(SDValue(N, 0));
47195   EVT VT = Op->getValueType(0);
47196 
47197   // Make sure the element size doesn't change.
47198   if (VT.getScalarSizeInBits() != ScalarSize)
47199     return SDValue();
47200 
47201   unsigned Opc = Op.getOpcode();
47202   switch (Opc) {
47203   case ISD::VECTOR_SHUFFLE: {
47204     // For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate
47205     // of this is VECTOR_SHUFFLE(-VEC1, UNDEF).  The mask can be anything here.
47206     if (!Op.getOperand(1).isUndef())
47207       return SDValue();
47208     if (SDValue NegOp0 = isFNEG(DAG, Op.getOperand(0).getNode(), Depth + 1))
47209       if (NegOp0.getValueType() == VT) // FIXME: Can we do better?
47210         return DAG.getVectorShuffle(VT, SDLoc(Op), NegOp0, DAG.getUNDEF(VT),
47211                                     cast<ShuffleVectorSDNode>(Op)->getMask());
47212     break;
47213   }
47214   case ISD::INSERT_VECTOR_ELT: {
47215     // Negate of INSERT_VECTOR_ELT(UNDEF, V, INDEX) is INSERT_VECTOR_ELT(UNDEF,
47216     // -V, INDEX).
47217     SDValue InsVector = Op.getOperand(0);
47218     SDValue InsVal = Op.getOperand(1);
47219     if (!InsVector.isUndef())
47220       return SDValue();
47221     if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode(), Depth + 1))
47222       if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME
47223         return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
47224                            NegInsVal, Op.getOperand(2));
47225     break;
47226   }
47227   case ISD::FSUB:
47228   case ISD::XOR:
47229   case X86ISD::FXOR: {
47230     SDValue Op1 = Op.getOperand(1);
47231     SDValue Op0 = Op.getOperand(0);
47232 
47233     // For XOR and FXOR, we want to check if constant
47234     // bits of Op1 are sign bit masks. For FSUB, we
47235     // have to check if constant bits of Op0 are sign
47236     // bit masks and hence we swap the operands.
47237     if (Opc == ISD::FSUB)
47238       std::swap(Op0, Op1);
47239 
47240     APInt UndefElts;
47241     SmallVector<APInt, 16> EltBits;
47242     // Extract constant bits and see if they are all
47243     // sign bit masks. Ignore the undef elements.
47244     if (getTargetConstantBitsFromNode(Op1, ScalarSize, UndefElts, EltBits,
47245                                       /* AllowWholeUndefs */ true,
47246                                       /* AllowPartialUndefs */ false)) {
47247       for (unsigned I = 0, E = EltBits.size(); I < E; I++)
47248         if (!UndefElts[I] && !EltBits[I].isSignMask())
47249           return SDValue();
47250 
47251       return peekThroughBitcasts(Op0);
47252     }
47253   }
47254   }
47255 
47256   return SDValue();
47257 }
47258 
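// Map an FMA-family opcode to its form with the product, accumulator and/or
// result negated. For example (illustrative):
//   negateFMAOpcode(ISD::FMA, /*NegMul=*/true,  /*NegAcc=*/false, false)
//     yields X86ISD::FNMADD, while
//   negateFMAOpcode(ISD::FMA, /*NegMul=*/false, /*NegAcc=*/true,  false)
//     yields X86ISD::FMSUB.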
47259 static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
47260                                 bool NegRes) {
47261   if (NegMul) {
47262     switch (Opcode) {
47263     default: llvm_unreachable("Unexpected opcode");
47264     case ISD::FMA:              Opcode = X86ISD::FNMADD;        break;
47265     case ISD::STRICT_FMA:       Opcode = X86ISD::STRICT_FNMADD; break;
47266     case X86ISD::FMADD_RND:     Opcode = X86ISD::FNMADD_RND;    break;
47267     case X86ISD::FMSUB:         Opcode = X86ISD::FNMSUB;        break;
47268     case X86ISD::STRICT_FMSUB:  Opcode = X86ISD::STRICT_FNMSUB; break;
47269     case X86ISD::FMSUB_RND:     Opcode = X86ISD::FNMSUB_RND;    break;
47270     case X86ISD::FNMADD:        Opcode = ISD::FMA;              break;
47271     case X86ISD::STRICT_FNMADD: Opcode = ISD::STRICT_FMA;       break;
47272     case X86ISD::FNMADD_RND:    Opcode = X86ISD::FMADD_RND;     break;
47273     case X86ISD::FNMSUB:        Opcode = X86ISD::FMSUB;         break;
47274     case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FMSUB;  break;
47275     case X86ISD::FNMSUB_RND:    Opcode = X86ISD::FMSUB_RND;     break;
47276     }
47277   }
47278 
47279   if (NegAcc) {
47280     switch (Opcode) {
47281     default: llvm_unreachable("Unexpected opcode");
47282     case ISD::FMA:              Opcode = X86ISD::FMSUB;         break;
47283     case ISD::STRICT_FMA:       Opcode = X86ISD::STRICT_FMSUB;  break;
47284     case X86ISD::FMADD_RND:     Opcode = X86ISD::FMSUB_RND;     break;
47285     case X86ISD::FMSUB:         Opcode = ISD::FMA;              break;
47286     case X86ISD::STRICT_FMSUB:  Opcode = ISD::STRICT_FMA;       break;
47287     case X86ISD::FMSUB_RND:     Opcode = X86ISD::FMADD_RND;     break;
47288     case X86ISD::FNMADD:        Opcode = X86ISD::FNMSUB;        break;
47289     case X86ISD::STRICT_FNMADD: Opcode = X86ISD::STRICT_FNMSUB; break;
47290     case X86ISD::FNMADD_RND:    Opcode = X86ISD::FNMSUB_RND;    break;
47291     case X86ISD::FNMSUB:        Opcode = X86ISD::FNMADD;        break;
47292     case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FNMADD; break;
47293     case X86ISD::FNMSUB_RND:    Opcode = X86ISD::FNMADD_RND;    break;
47294     case X86ISD::FMADDSUB:      Opcode = X86ISD::FMSUBADD;      break;
47295     case X86ISD::FMADDSUB_RND:  Opcode = X86ISD::FMSUBADD_RND;  break;
47296     case X86ISD::FMSUBADD:      Opcode = X86ISD::FMADDSUB;      break;
47297     case X86ISD::FMSUBADD_RND:  Opcode = X86ISD::FMADDSUB_RND;  break;
47298     }
47299   }
47300 
47301   if (NegRes) {
47302     switch (Opcode) {
47303     // For accuracy reasons, we never combine fneg and fma under strict FP.
47304     default: llvm_unreachable("Unexpected opcode");
47305     case ISD::FMA:             Opcode = X86ISD::FNMSUB;       break;
47306     case X86ISD::FMADD_RND:    Opcode = X86ISD::FNMSUB_RND;   break;
47307     case X86ISD::FMSUB:        Opcode = X86ISD::FNMADD;       break;
47308     case X86ISD::FMSUB_RND:    Opcode = X86ISD::FNMADD_RND;   break;
47309     case X86ISD::FNMADD:       Opcode = X86ISD::FMSUB;        break;
47310     case X86ISD::FNMADD_RND:   Opcode = X86ISD::FMSUB_RND;    break;
47311     case X86ISD::FNMSUB:       Opcode = ISD::FMA;             break;
47312     case X86ISD::FNMSUB_RND:   Opcode = X86ISD::FMADD_RND;    break;
47313     }
47314   }
47315 
47316   return Opcode;
47317 }
47318 
47319 /// Do target-specific dag combines on floating point negations.
47320 static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
47321                            TargetLowering::DAGCombinerInfo &DCI,
47322                            const X86Subtarget &Subtarget) {
47323   EVT OrigVT = N->getValueType(0);
47324   SDValue Arg = isFNEG(DAG, N);
47325   if (!Arg)
47326     return SDValue();
47327 
47328   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47329   EVT VT = Arg.getValueType();
47330   EVT SVT = VT.getScalarType();
47331   SDLoc DL(N);
47332 
47333   // Let legalize expand this if it isn't a legal type yet.
47334   if (!TLI.isTypeLegal(VT))
47335     return SDValue();
47336 
47337   // If we're negating a FMUL node on a target with FMA, then we can avoid the
47338   // use of a constant by performing (-0 - A*B) instead.
47339   // FIXME: Check rounding control flags as well once it becomes available.
47340   if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
47341       Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
47342     SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
47343     SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
47344                                   Arg.getOperand(1), Zero);
47345     return DAG.getBitcast(OrigVT, NewNode);
47346   }
47347 
47348   bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
47349   bool LegalOperations = !DCI.isBeforeLegalizeOps();
47350   if (SDValue NegArg =
47351           TLI.getNegatedExpression(Arg, DAG, LegalOperations, CodeSize))
47352     return DAG.getBitcast(OrigVT, NegArg);
47353 
47354   return SDValue();
47355 }
47356 
47357 SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
47358                                                 bool LegalOperations,
47359                                                 bool ForCodeSize,
47360                                                 NegatibleCost &Cost,
47361                                                 unsigned Depth) const {
47362   // fneg patterns are removable even if they have multiple uses.
47363   if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth)) {
47364     Cost = NegatibleCost::Cheaper;
47365     return DAG.getBitcast(Op.getValueType(), Arg);
47366   }
47367 
47368   EVT VT = Op.getValueType();
47369   EVT SVT = VT.getScalarType();
47370   unsigned Opc = Op.getOpcode();
47371   SDNodeFlags Flags = Op.getNode()->getFlags();
47372   switch (Opc) {
47373   case ISD::FMA:
47374   case X86ISD::FMSUB:
47375   case X86ISD::FNMADD:
47376   case X86ISD::FNMSUB:
47377   case X86ISD::FMADD_RND:
47378   case X86ISD::FMSUB_RND:
47379   case X86ISD::FNMADD_RND:
47380   case X86ISD::FNMSUB_RND: {
47381     if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) ||
47382         !(SVT == MVT::f32 || SVT == MVT::f64) ||
47383         !isOperationLegal(ISD::FMA, VT))
47384       break;
47385 
47386     // Don't fold (fneg (fma (fneg x), y, (fneg z))) to (fma x, y, z)
47387     // if it may have signed zeros.
47388     if (!Flags.hasNoSignedZeros())
47389       break;
47390 
47391     // This is always negatible for free but we might be able to remove some
47392     // extra operand negations as well.
47393     SmallVector<SDValue, 4> NewOps(Op.getNumOperands(), SDValue());
47394     for (int i = 0; i != 3; ++i)
47395       NewOps[i] = getCheaperNegatedExpression(
47396           Op.getOperand(i), DAG, LegalOperations, ForCodeSize, Depth + 1);
47397 
47398     bool NegA = !!NewOps[0];
47399     bool NegB = !!NewOps[1];
47400     bool NegC = !!NewOps[2];
47401     unsigned NewOpc = negateFMAOpcode(Opc, NegA != NegB, NegC, true);
47402 
47403     Cost = (NegA || NegB || NegC) ? NegatibleCost::Cheaper
47404                                   : NegatibleCost::Neutral;
47405 
47406     // Fill in the non-negated ops with the original values.
47407     for (int i = 0, e = Op.getNumOperands(); i != e; ++i)
47408       if (!NewOps[i])
47409         NewOps[i] = Op.getOperand(i);
47410     return DAG.getNode(NewOpc, SDLoc(Op), VT, NewOps);
47411   }
47412   case X86ISD::FRCP:
47413     if (SDValue NegOp0 =
47414             getNegatedExpression(Op.getOperand(0), DAG, LegalOperations,
47415                                  ForCodeSize, Cost, Depth + 1))
47416       return DAG.getNode(Opc, SDLoc(Op), VT, NegOp0);
47417     break;
47418   }
47419 
47420   return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
47421                                               ForCodeSize, Cost, Depth);
47422 }
47423 
47424 static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
47425                                  const X86Subtarget &Subtarget) {
47426   MVT VT = N->getSimpleValueType(0);
47427   // If we have integer vector types available, use the integer opcodes.
47428   if (!VT.isVector() || !Subtarget.hasSSE2())
47429     return SDValue();
47430 
47431   SDLoc dl(N);
47432 
47433   unsigned IntBits = VT.getScalarSizeInBits();
47434   MVT IntSVT = MVT::getIntegerVT(IntBits);
47435   MVT IntVT = MVT::getVectorVT(IntSVT, VT.getSizeInBits() / IntBits);
47436 
47437   SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
47438   SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
47439   unsigned IntOpcode;
47440   switch (N->getOpcode()) {
47441   default: llvm_unreachable("Unexpected FP logic op");
47442   case X86ISD::FOR:   IntOpcode = ISD::OR; break;
47443   case X86ISD::FXOR:  IntOpcode = ISD::XOR; break;
47444   case X86ISD::FAND:  IntOpcode = ISD::AND; break;
47445   case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
47446   }
47447   SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
47448   return DAG.getBitcast(VT, IntOp);
47449 }
47450 
47451 
47452 /// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
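/// e.g. (xor (x86setcc COND_E, EFLAGS), 1) --> (x86setcc COND_NE, EFLAGS)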
47453 static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
47454   if (N->getOpcode() != ISD::XOR)
47455     return SDValue();
47456 
47457   SDValue LHS = N->getOperand(0);
47458   if (!isOneConstant(N->getOperand(1)) || LHS->getOpcode() != X86ISD::SETCC)
47459     return SDValue();
47460 
47461   X86::CondCode NewCC = X86::GetOppositeBranchCondition(
47462       X86::CondCode(LHS->getConstantOperandVal(0)));
47463   SDLoc DL(N);
47464   return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
47465 }
47466 
47467 static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
47468                           TargetLowering::DAGCombinerInfo &DCI,
47469                           const X86Subtarget &Subtarget) {
47470   SDValue N0 = N->getOperand(0);
47471   SDValue N1 = N->getOperand(1);
47472   EVT VT = N->getValueType(0);
47473 
47474   // If this is SSE1 only convert to FXOR to avoid scalarization.
47475   if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
47476     return DAG.getBitcast(MVT::v4i32,
47477                           DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32,
47478                                       DAG.getBitcast(MVT::v4f32, N0),
47479                                       DAG.getBitcast(MVT::v4f32, N1)));
47480   }
47481 
47482   if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
47483     return Cmp;
47484 
47485   if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
47486     return R;
47487 
47488   if (DCI.isBeforeLegalizeOps())
47489     return SDValue();
47490 
47491   if (SDValue SetCC = foldXor1SetCC(N, DAG))
47492     return SetCC;
47493 
47494   if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
47495     return RV;
47496 
47497   // Fold not(iX bitcast(vXi1)) -> (iX bitcast(not(vec))) for legal boolvecs.
47498   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47499   if (llvm::isAllOnesConstant(N1) && N0.getOpcode() == ISD::BITCAST &&
47500       N0.getOperand(0).getValueType().isVector() &&
47501       N0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
47502       TLI.isTypeLegal(N0.getOperand(0).getValueType()) && N0.hasOneUse()) {
47503     return DAG.getBitcast(VT, DAG.getNOT(SDLoc(N), N0.getOperand(0),
47504                                          N0.getOperand(0).getValueType()));
47505   }
47506 
47507   // Handle AVX512 mask widening.
47508   // Fold not(insert_subvector(undef,sub)) -> insert_subvector(undef,not(sub))
47509   if (ISD::isBuildVectorAllOnes(N1.getNode()) && VT.isVector() &&
47510       VT.getVectorElementType() == MVT::i1 &&
47511       N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.getOperand(0).isUndef() &&
47512       TLI.isTypeLegal(N0.getOperand(1).getValueType())) {
47513     return DAG.getNode(
47514         ISD::INSERT_SUBVECTOR, SDLoc(N), VT, N0.getOperand(0),
47515         DAG.getNOT(SDLoc(N), N0.getOperand(1), N0.getOperand(1).getValueType()),
47516         N0.getOperand(2));
47517   }
47518 
47519   // Fold xor(zext(xor(x,c1)),c2) -> xor(zext(x),xor(zext(c1),c2))
47520   // Fold xor(truncate(xor(x,c1)),c2) -> xor(truncate(x),xor(truncate(c1),c2))
47521   // TODO: Under what circumstances could this be performed in DAGCombine?
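  // (Illustrative: xor (zext (xor i8 X, 0x80)), 0x1 --> xor (zext X), 0x81.)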
47522   if ((N0.getOpcode() == ISD::TRUNCATE || N0.getOpcode() == ISD::ZERO_EXTEND) &&
47523       N0.getOperand(0).getOpcode() == N->getOpcode()) {
47524     SDValue TruncExtSrc = N0.getOperand(0);
47525     auto *N1C = dyn_cast<ConstantSDNode>(N1);
47526     auto *N001C = dyn_cast<ConstantSDNode>(TruncExtSrc.getOperand(1));
47527     if (N1C && !N1C->isOpaque() && N001C && !N001C->isOpaque()) {
47528       SDLoc DL(N);
47529       SDValue LHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(0), DL, VT);
47530       SDValue RHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(1), DL, VT);
47531       return DAG.getNode(ISD::XOR, DL, VT, LHS,
47532                          DAG.getNode(ISD::XOR, DL, VT, RHS, N1));
47533     }
47534   }
47535 
47536   if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
47537     return FPLogic;
47538 
47539   return combineFneg(N, DAG, DCI, Subtarget);
47540 }
47541 
47542 static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG,
47543                             TargetLowering::DAGCombinerInfo &DCI,
47544                             const X86Subtarget &Subtarget) {
47545   EVT VT = N->getValueType(0);
47546   unsigned NumBits = VT.getSizeInBits();
47547 
47548   // TODO - Constant Folding.
47549 
47550   // Simplify the inputs.
47551   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47552   APInt DemandedMask(APInt::getAllOnesValue(NumBits));
47553   if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
47554     return SDValue(N, 0);
47555 
47556   return SDValue();
47557 }
47558 
47559 static bool isNullFPScalarOrVectorConst(SDValue V) {
47560   return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
47561 }
47562 
47563 /// If a value is a scalar FP zero or a vector FP zero (potentially including
47564 /// undefined elements), return a zero constant that may be used to fold away
47565 /// that value. In the case of a vector, the returned constant will not contain
47566 /// undefined elements even if the input parameter does. This makes it suitable
47567 /// to be used as a replacement operand with operations (eg, bitwise-and) where
47568 /// an undef should not propagate.
47569 static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
47570                                         const X86Subtarget &Subtarget) {
47571   if (!isNullFPScalarOrVectorConst(V))
47572     return SDValue();
47573 
47574   if (V.getValueType().isVector())
47575     return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
47576 
47577   return V;
47578 }
47579 
47580 static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
47581                                       const X86Subtarget &Subtarget) {
47582   SDValue N0 = N->getOperand(0);
47583   SDValue N1 = N->getOperand(1);
47584   EVT VT = N->getValueType(0);
47585   SDLoc DL(N);
47586 
47587   // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
47588   if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
47589         (VT == MVT::f64 && Subtarget.hasSSE2()) ||
47590         (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
47591     return SDValue();
47592 
47593   auto isAllOnesConstantFP = [](SDValue V) {
47594     if (V.getSimpleValueType().isVector())
47595       return ISD::isBuildVectorAllOnes(V.getNode());
47596     auto *C = dyn_cast<ConstantFPSDNode>(V);
47597     return C && C->getConstantFPValue()->isAllOnesValue();
47598   };
47599 
47600   // fand (fxor X, -1), Y --> fandn X, Y
47601   if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
47602     return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
47603 
47604   // fand X, (fxor Y, -1) --> fandn Y, X
47605   if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
47606     return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
47607 
47608   return SDValue();
47609 }
47610 
47611 /// Do target-specific dag combines on X86ISD::FAND nodes.
47612 static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
47613                            const X86Subtarget &Subtarget) {
47614   // FAND(0.0, x) -> 0.0
47615   if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
47616     return V;
47617 
47618   // FAND(x, 0.0) -> 0.0
47619   if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
47620     return V;
47621 
47622   if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
47623     return V;
47624 
47625   return lowerX86FPLogicOp(N, DAG, Subtarget);
47626 }
47627 
47628 /// Do target-specific dag combines on X86ISD::FANDN nodes.
47629 static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
47630                             const X86Subtarget &Subtarget) {
47631   // FANDN(0.0, x) -> x
47632   if (isNullFPScalarOrVectorConst(N->getOperand(0)))
47633     return N->getOperand(1);
47634 
47635   // FANDN(x, 0.0) -> 0.0
47636   if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
47637     return V;
47638 
47639   return lowerX86FPLogicOp(N, DAG, Subtarget);
47640 }
47641 
47642 /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
47643 static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
47644                           TargetLowering::DAGCombinerInfo &DCI,
47645                           const X86Subtarget &Subtarget) {
47646   assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
47647 
47648   // F[X]OR(0.0, x) -> x
47649   if (isNullFPScalarOrVectorConst(N->getOperand(0)))
47650     return N->getOperand(1);
47651 
47652   // F[X]OR(x, 0.0) -> x
47653   if (isNullFPScalarOrVectorConst(N->getOperand(1)))
47654     return N->getOperand(0);
47655 
47656   if (SDValue NewVal = combineFneg(N, DAG, DCI, Subtarget))
47657     return NewVal;
47658 
47659   return lowerX86FPLogicOp(N, DAG, Subtarget);
47660 }
47661 
47662 /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
47663 static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
47664   assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
47665 
47666   // FMIN/FMAX are commutative if no NaNs and no negative zeros are allowed.
47667   if (!DAG.getTarget().Options.NoNaNsFPMath ||
47668       !DAG.getTarget().Options.NoSignedZerosFPMath)
47669     return SDValue();
47670 
47671   // With no NaNs and no signed zeros we can convert the FMAX and FMIN nodes
47672   // into FMAXC and FMINC, which are commutative operations.
47673   unsigned NewOp = 0;
47674   switch (N->getOpcode()) {
47675     default: llvm_unreachable("unknown opcode");
47676     case X86ISD::FMIN:  NewOp = X86ISD::FMINC; break;
47677     case X86ISD::FMAX:  NewOp = X86ISD::FMAXC; break;
47678   }
47679 
47680   return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
47681                      N->getOperand(0), N->getOperand(1));
47682 }
47683 
47684 static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
47685                                      const X86Subtarget &Subtarget) {
47686   if (Subtarget.useSoftFloat())
47687     return SDValue();
47688 
47689   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47690 
47691   EVT VT = N->getValueType(0);
47692   if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
47693         (Subtarget.hasSSE2() && VT == MVT::f64) ||
47694         (VT.isVector() && TLI.isTypeLegal(VT))))
47695     return SDValue();
47696 
47697   SDValue Op0 = N->getOperand(0);
47698   SDValue Op1 = N->getOperand(1);
47699   SDLoc DL(N);
47700   auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
47701 
47702   // If we don't have to respect NaN inputs, this is a direct translation to x86
47703   // min/max instructions.
47704   if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
47705     return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
47706 
47707   // If one of the operands is known non-NaN use the native min/max instructions
47708   // with the non-NaN input as second operand.
47709   if (DAG.isKnownNeverNaN(Op1))
47710     return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
47711   if (DAG.isKnownNeverNaN(Op0))
47712     return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags());
47713 
47714   // If we have to respect NaN inputs, this takes at least 3 instructions.
47715   // Favor a library call when operating on a scalar and minimizing code size.
47716   if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize())
47717     return SDValue();
47718 
47719   EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
47720                                          VT);
47721 
47722   // There are 4 possibilities involving NaN inputs, and these are the required
47723   // outputs:
47724   //                   Op1
47725   //               Num     NaN
47726   //            ----------------
47727   //       Num  |  Max  |  Op0 |
47728   // Op0        ----------------
47729   //       NaN  |  Op1  |  NaN |
47730   //            ----------------
47731   //
47732   // The SSE FP max/min instructions were not designed for this case, but rather
47733   // to implement:
47734   //   Min = Op1 < Op0 ? Op1 : Op0
47735   //   Max = Op1 > Op0 ? Op1 : Op0
47736   //
47737   // So they always return Op0 if either input is a NaN. However, we can still
47738   // use those instructions for fmaxnum by selecting away a NaN input.
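  // As an illustrative sketch (not the exact ISel output), an ISD::FMAXNUM
  // handled below ends up roughly as:
  //   %minmax = (X86ISD::FMAX %op1, %op0)   ; passes %op0 through on any NaN
  //   %isnan  = (setcc %op0, %op0, setuo)   ; true iff %op0 is NaN
  //   %result = (select %isnan, %op1, %minmax)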
47739 
47740   // If either operand is NaN, the 2nd source operand (Op0) is passed through.
47741   SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
47742   SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
47743 
47744   // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
47745   // are NaN, the NaN value of Op1 is the result.
47746   return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
47747 }
47748 
47749 static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
47750                                    TargetLowering::DAGCombinerInfo &DCI) {
47751   EVT VT = N->getValueType(0);
47752   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47753 
47754   APInt KnownUndef, KnownZero;
47755   APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
47756   if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
47757                                      KnownZero, DCI))
47758     return SDValue(N, 0);
47759 
47760   // Convert a full vector load into vzload when not all bits are needed.
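  // For example (a hedged sketch): (X86ISD::CVTSI2P (v4i32 load %p)) producing
  // v2f64 only reads the low 64 bits of the load, so the 128-bit load can be
  // replaced with a 64-bit VZEXT_LOAD from the same address.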
47761   SDValue In = N->getOperand(0);
47762   MVT InVT = In.getSimpleValueType();
47763   if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
47764       ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
47765     assert(InVT.is128BitVector() && "Expected 128-bit input vector");
47766     LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
47767     unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
47768     MVT MemVT = MVT::getIntegerVT(NumBits);
47769     MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
47770     if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
47771       SDLoc dl(N);
47772       SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
47773                                     DAG.getBitcast(InVT, VZLoad));
47774       DCI.CombineTo(N, Convert);
47775       DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
47776       DCI.recursivelyDeleteUnusedNodes(LN);
47777       return SDValue(N, 0);
47778     }
47779   }
47780 
47781   return SDValue();
47782 }
47783 
47784 static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
47785                                      TargetLowering::DAGCombinerInfo &DCI) {
47786   bool IsStrict = N->isTargetStrictFPOpcode();
47787   EVT VT = N->getValueType(0);
47788 
47789   // Convert a full vector load into vzload when not all bits are needed.
47790   SDValue In = N->getOperand(IsStrict ? 1 : 0);
47791   MVT InVT = In.getSimpleValueType();
47792   if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
47793       ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
47794     assert(InVT.is128BitVector() && "Expected 128-bit input vector");
47795     LoadSDNode *LN = cast<LoadSDNode>(In);
47796     unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
47797     MVT MemVT = MVT::getFloatingPointVT(NumBits);
47798     MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
47799     if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
47800       SDLoc dl(N);
47801       if (IsStrict) {
47802         SDValue Convert =
47803             DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
47804                         {N->getOperand(0), DAG.getBitcast(InVT, VZLoad)});
47805         DCI.CombineTo(N, Convert, Convert.getValue(1));
47806       } else {
47807         SDValue Convert =
47808             DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad));
47809         DCI.CombineTo(N, Convert);
47810       }
47811       DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
47812       DCI.recursivelyDeleteUnusedNodes(LN);
47813       return SDValue(N, 0);
47814     }
47815   }
47816 
47817   return SDValue();
47818 }
47819 
47820 /// Do target-specific dag combines on X86ISD::ANDNP nodes.
47821 static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
47822                             TargetLowering::DAGCombinerInfo &DCI,
47823                             const X86Subtarget &Subtarget) {
47824   MVT VT = N->getSimpleValueType(0);
47825 
47826   // ANDNP(0, x) -> x
47827   if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
47828     return N->getOperand(1);
47829 
47830   // ANDNP(x, 0) -> 0
47831   if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode()))
47832     return DAG.getConstant(0, SDLoc(N), VT);
47833 
47834   // Turn ANDNP back to AND if input is inverted.
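  // For example: (X86ISD::ANDNP (xor x, -1), y) --> (and x, y).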
47835   if (SDValue Not = IsNOT(N->getOperand(0), DAG))
47836     return DAG.getNode(ISD::AND, SDLoc(N), VT, DAG.getBitcast(VT, Not),
47837                        N->getOperand(1));
47838 
47839   // Attempt to recursively combine a bitmask ANDNP with shuffles.
47840   if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
47841     SDValue Op(N, 0);
47842     if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
47843       return Res;
47844   }
47845 
47846   return SDValue();
47847 }
47848 
47849 static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
47850                          TargetLowering::DAGCombinerInfo &DCI) {
47851   SDValue N1 = N->getOperand(1);
47852 
47853   // BT ignores high bits in the bit index operand.
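  // For a 32-bit index only the low 5 bits are demanded here, so e.g. an
  // explicit (and %idx, 31) feeding the index can be simplified away.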
47854   unsigned BitWidth = N1.getValueSizeInBits();
47855   APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
47856   if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(N1, DemandedMask, DCI)) {
47857     if (N->getOpcode() != ISD::DELETED_NODE)
47858       DCI.AddToWorklist(N);
47859     return SDValue(N, 0);
47860   }
47861 
47862   return SDValue();
47863 }
47864 
47865 static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG,
47866                                TargetLowering::DAGCombinerInfo &DCI) {
47867   bool IsStrict = N->getOpcode() == X86ISD::STRICT_CVTPH2PS;
47868   SDValue Src = N->getOperand(IsStrict ? 1 : 0);
47869 
47870   if (N->getValueType(0) == MVT::v4f32 && Src.getValueType() == MVT::v8i16) {
47871     APInt KnownUndef, KnownZero;
47872     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47873     APInt DemandedElts = APInt::getLowBitsSet(8, 4);
47874     if (TLI.SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
47875                                        DCI)) {
47876       if (N->getOpcode() != ISD::DELETED_NODE)
47877         DCI.AddToWorklist(N);
47878       return SDValue(N, 0);
47879     }
47880 
47881     // Convert a full vector load into vzload when not all bits are needed.
47882     if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
47883       LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(IsStrict ? 1 : 0));
47884       if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::i64, MVT::v2i64, DAG)) {
47885         SDLoc dl(N);
47886         if (IsStrict) {
47887           SDValue Convert = DAG.getNode(
47888               N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
47889               {N->getOperand(0), DAG.getBitcast(MVT::v8i16, VZLoad)});
47890           DCI.CombineTo(N, Convert, Convert.getValue(1));
47891         } else {
47892           SDValue Convert = DAG.getNode(N->getOpcode(), dl, MVT::v4f32,
47893                                         DAG.getBitcast(MVT::v8i16, VZLoad));
47894           DCI.CombineTo(N, Convert);
47895         }
47896 
47897         DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
47898         DCI.recursivelyDeleteUnusedNodes(LN);
47899         return SDValue(N, 0);
47900       }
47901     }
47902   }
47903 
47904   return SDValue();
47905 }
47906 
47907 // Try to combine sext_in_reg of a cmov of constants by extending the constants.
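// For example (illustrative):
//   (sext_in_reg (cmov C1, C2, cc, flags), i8)
//     --> (cmov (sext_in_reg C1, i8), (sext_in_reg C2, i8), cc, flags)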
47908 static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
47909   assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
47910 
47911   EVT DstVT = N->getValueType(0);
47912 
47913   SDValue N0 = N->getOperand(0);
47914   SDValue N1 = N->getOperand(1);
47915   EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
47916 
47917   if (ExtraVT != MVT::i8 && ExtraVT != MVT::i16)
47918     return SDValue();
47919 
47920   // Look through single use any_extends / truncs.
47921   SDValue IntermediateBitwidthOp;
47922   if ((N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::TRUNCATE) &&
47923       N0.hasOneUse()) {
47924     IntermediateBitwidthOp = N0;
47925     N0 = N0.getOperand(0);
47926   }
47927 
47928   // See if we have a single use cmov.
47929   if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
47930     return SDValue();
47931 
47932   SDValue CMovOp0 = N0.getOperand(0);
47933   SDValue CMovOp1 = N0.getOperand(1);
47934 
47935   // Make sure both operands are constants.
47936   if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
47937       !isa<ConstantSDNode>(CMovOp1.getNode()))
47938     return SDValue();
47939 
47940   SDLoc DL(N);
47941 
47942   // If we looked through an any_extend/trunc above, apply the same operation
47943   // to the constants.
47943   if (IntermediateBitwidthOp) {
47944     unsigned IntermediateOpc = IntermediateBitwidthOp.getOpcode();
47945     CMovOp0 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp0);
47946     CMovOp1 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp1);
47947   }
47948 
47949   CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp0, N1);
47950   CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp1, N1);
47951 
47952   EVT CMovVT = DstVT;
47953   // We do not want i16 CMOV's. Promote to i32 and truncate afterwards.
47954   if (DstVT == MVT::i16) {
47955     CMovVT = MVT::i32;
47956     CMovOp0 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp0);
47957     CMovOp1 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp1);
47958   }
47959 
47960   SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, CMovVT, CMovOp0, CMovOp1,
47961                              N0.getOperand(2), N0.getOperand(3));
47962 
47963   if (CMovVT != DstVT)
47964     CMov = DAG.getNode(ISD::TRUNCATE, DL, DstVT, CMov);
47965 
47966   return CMov;
47967 }
47968 
47969 static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
47970                                       const X86Subtarget &Subtarget) {
47971   assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
47972 
47973   if (SDValue V = combineSextInRegCmov(N, DAG))
47974     return V;
47975 
47976   EVT VT = N->getValueType(0);
47977   SDValue N0 = N->getOperand(0);
47978   SDValue N1 = N->getOperand(1);
47979   EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
47980   SDLoc dl(N);
47981 
47982   // SIGN_EXTEND_INREG to v4i64 is an expensive operation on both SSE and
47983   // AVX2 since there is no sign-extended shift right operation on a vector
47984   // with 64-bit elements.
47985   // (sext_in_reg (v4i64 anyext (v4i32 x)), ExtraVT) ->
47986   //   (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))
47987   if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
47988                            N0.getOpcode() == ISD::SIGN_EXTEND)) {
47989     SDValue N00 = N0.getOperand(0);
47990 
47991     // An EXTLOAD has a better lowering on AVX2: it may be replaced with an
47992     // X86ISD::VSEXT node.
47993     if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
47994       if (!ISD::isNormalLoad(N00.getNode()))
47995         return SDValue();
47996 
47997     // Attempt to promote any comparison mask ops before the SIGN_EXTEND_INREG
47998     // gets in the way.
47999     if (SDValue Promote = PromoteMaskArithmetic(N0.getNode(), DAG, Subtarget))
48000       return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Promote, N1);
48001 
48002     if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
48003       SDValue Tmp =
48004           DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1);
48005       return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
48006     }
48007   }
48008   return SDValue();
48009 }
48010 
48011 /// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
48012 /// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
48013 /// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
48014 /// opportunities to combine math ops, use an LEA, or use a complex addressing
48015 /// mode. This can eliminate extend, add, and shift instructions.
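/// For example, (i64 sext (add nsw i32 %x, 42)) becomes
/// (add nsw i64 (sext %x), 42), which can then feed an LEA or fold into a
/// complex addressing mode.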
48016 static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
48017                                    const X86Subtarget &Subtarget) {
48018   if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
48019       Ext->getOpcode() != ISD::ZERO_EXTEND)
48020     return SDValue();
48021 
48022   // TODO: This should be valid for other integer types.
48023   EVT VT = Ext->getValueType(0);
48024   if (VT != MVT::i64)
48025     return SDValue();
48026 
48027   SDValue Add = Ext->getOperand(0);
48028   if (Add.getOpcode() != ISD::ADD)
48029     return SDValue();
48030 
48031   bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
48032   bool NSW = Add->getFlags().hasNoSignedWrap();
48033   bool NUW = Add->getFlags().hasNoUnsignedWrap();
48034 
48035   // We need an 'add nsw' feeding into the 'sext' or an 'add nuw' feeding
48036   // into the 'zext'.
48037   if ((Sext && !NSW) || (!Sext && !NUW))
48038     return SDValue();
48039 
48040   // Having a constant operand to the 'add' ensures that we are not increasing
48041   // the instruction count because the constant is extended for free below.
48042   // A constant operand can also become the displacement field of an LEA.
48043   auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
48044   if (!AddOp1)
48045     return SDValue();
48046 
48047   // Don't make the 'add' bigger if there's no hope of combining it with some
48048   // other 'add' or 'shl' instruction.
48049   // TODO: It may be profitable to generate simpler LEA instructions in place
48050   // of single 'add' instructions, but the cost model for selecting an LEA
48051   // currently has a high threshold.
48052   bool HasLEAPotential = false;
48053   for (auto *User : Ext->uses()) {
48054     if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
48055       HasLEAPotential = true;
48056       break;
48057     }
48058   }
48059   if (!HasLEAPotential)
48060     return SDValue();
48061 
48062   // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
48063   int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
48064   SDValue AddOp0 = Add.getOperand(0);
48065   SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
48066   SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
48067 
48068   // The wider add is guaranteed not to wrap because both operands are
48069   // extended (sign-extended for the sext case, zero-extended for zext).
48070   SDNodeFlags Flags;
48071   Flags.setNoSignedWrap(NSW);
48072   Flags.setNoUnsignedWrap(NUW);
48073   return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
48074 }
48075 
48076 // If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
48077 // operands and the result of CMOV is not used anywhere else - promote CMOV
48078 // itself instead of promoting its result. This could be beneficial, because:
48079 //     1) X86TargetLowering::EmitLoweredSelect later can do merging of two
48080 //        (or more) pseudo-CMOVs only when they go one-after-another and
48081 //        getting rid of result extension code after CMOV will help that.
48082 //     2) Promotion of constant CMOV arguments is free, hence the
48083 //        {ANY,SIGN,ZERO}_EXTEND will just be deleted.
48084 //     3) A 16-bit CMOV encoding is 4 bytes and a 32-bit CMOV is 3 bytes, so
48085 //        this promotion is also good in terms of code-size.
48086 //        (64-bit CMOV is 4 bytes, which is why we don't do 32-bit => 64-bit
48087 //         promotion).
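// For example (illustrative): (i32 zext (i16 cmov C1, C2, cc, flags))
// --> (i32 cmov (zext C1), (zext C2), cc, flags), leaving no extend to lower.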
48088 static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
48089   SDValue CMovN = Extend->getOperand(0);
48090   if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse())
48091     return SDValue();
48092 
48093   EVT TargetVT = Extend->getValueType(0);
48094   unsigned ExtendOpcode = Extend->getOpcode();
48095   SDLoc DL(Extend);
48096 
48097   EVT VT = CMovN.getValueType();
48098   SDValue CMovOp0 = CMovN.getOperand(0);
48099   SDValue CMovOp1 = CMovN.getOperand(1);
48100 
48101   if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
48102       !isa<ConstantSDNode>(CMovOp1.getNode()))
48103     return SDValue();
48104 
48105   // Only extend to i32 or i64.
48106   if (TargetVT != MVT::i32 && TargetVT != MVT::i64)
48107     return SDValue();
48108 
48109   // Only extend from i16 unless it's a sign_extend from i32. Zext/aext from i32
48110   // are free.
48111   if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32))
48112     return SDValue();
48113 
48114   // If this is a zero extend to i64, we should only extend to i32 and use a free
48115   // zero extend to finish.
48116   EVT ExtendVT = TargetVT;
48117   if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND)
48118     ExtendVT = MVT::i32;
48119 
48120   CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0);
48121   CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1);
48122 
48123   SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1,
48124                             CMovN.getOperand(2), CMovN.getOperand(3));
48125 
48126   // Finish extending if needed.
48127   if (ExtendVT != TargetVT)
48128     Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res);
48129 
48130   return Res;
48131 }
48132 
48133 // Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
48134 // This is more or less the reverse of combineBitcastvxi1.
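// Illustrative sketch for (v8i16 zext (v8i1 bitcast (i8 x))): broadcast x to
// every lane, AND each lane with its bit mask (1 << lane), setcc-eq against
// that mask to get an all-ones/all-zeros lane, then sext (plus srl for zext)
// to produce the final 0/1 elements.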
48135 static SDValue
48136 combineToExtendBoolVectorInReg(SDNode *N, SelectionDAG &DAG,
48137                                TargetLowering::DAGCombinerInfo &DCI,
48138                                const X86Subtarget &Subtarget) {
48139   unsigned Opcode = N->getOpcode();
48140   if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
48141       Opcode != ISD::ANY_EXTEND)
48142     return SDValue();
48143   if (!DCI.isBeforeLegalizeOps())
48144     return SDValue();
48145   if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
48146     return SDValue();
48147 
48148   SDValue N0 = N->getOperand(0);
48149   EVT VT = N->getValueType(0);
48150   EVT SVT = VT.getScalarType();
48151   EVT InSVT = N0.getValueType().getScalarType();
48152   unsigned EltSizeInBits = SVT.getSizeInBits();
48153 
48154   // Input type must be extending a bool vector (bit-casted from a scalar
48155   // integer) to legal integer types.
48156   if (!VT.isVector())
48157     return SDValue();
48158   if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
48159     return SDValue();
48160   if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
48161     return SDValue();
48162 
48163   SDValue N00 = N0.getOperand(0);
48164   EVT SclVT = N0.getOperand(0).getValueType();
48165   if (!SclVT.isScalarInteger())
48166     return SDValue();
48167 
48168   SDLoc DL(N);
48169   SDValue Vec;
48170   SmallVector<int, 32> ShuffleMask;
48171   unsigned NumElts = VT.getVectorNumElements();
48172   assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
48173 
48174   // Broadcast the scalar integer to the vector elements.
48175   if (NumElts > EltSizeInBits) {
48176     // If the scalar integer is greater than the vector element size, then we
48177     // must split it down into sub-sections for broadcasting. For example:
48178     //   i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
48179     //   i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
48180     assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
48181     unsigned Scale = NumElts / EltSizeInBits;
48182     EVT BroadcastVT =
48183         EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
48184     Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
48185     Vec = DAG.getBitcast(VT, Vec);
48186 
48187     for (unsigned i = 0; i != Scale; ++i)
48188       ShuffleMask.append(EltSizeInBits, i);
48189     Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
48190   } else if (Subtarget.hasAVX2() && NumElts < EltSizeInBits &&
48191              (SclVT == MVT::i8 || SclVT == MVT::i16 || SclVT == MVT::i32)) {
48192     // If we have register broadcast instructions, use the scalar size as the
48193     // element type for the shuffle. Then cast to the wider element type. The
48194     // widened bits won't be used, and this might allow the use of a broadcast
48195     // load.
48196     assert((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale");
48197     unsigned Scale = EltSizeInBits / NumElts;
48198     EVT BroadcastVT =
48199         EVT::getVectorVT(*DAG.getContext(), SclVT, NumElts * Scale);
48200     Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
48201     ShuffleMask.append(NumElts * Scale, 0);
48202     Vec = DAG.getVectorShuffle(BroadcastVT, DL, Vec, Vec, ShuffleMask);
48203     Vec = DAG.getBitcast(VT, Vec);
48204   } else {
48205     // For a smaller scalar integer, we can simply any-extend it to the vector
48206     // element size (we don't care about the upper bits) and broadcast it to all
48207     // elements.
48208     SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
48209     Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
48210     ShuffleMask.append(NumElts, 0);
48211     Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
48212   }
48213 
48214   // Now, mask the relevant bit in each element.
48215   SmallVector<SDValue, 32> Bits;
48216   for (unsigned i = 0; i != NumElts; ++i) {
48217     int BitIdx = (i % EltSizeInBits);
48218     APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
48219     Bits.push_back(DAG.getConstant(Bit, DL, SVT));
48220   }
48221   SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
48222   Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
48223 
48224   // Compare against the bitmask and extend the result.
48225   EVT CCVT = VT.changeVectorElementType(MVT::i1);
48226   Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
48227   Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
48228 
48229   // For SEXT, this is now done, otherwise shift the result down for
48230   // zero-extension.
48231   if (Opcode == ISD::SIGN_EXTEND)
48232     return Vec;
48233   return DAG.getNode(ISD::SRL, DL, VT, Vec,
48234                      DAG.getConstant(EltSizeInBits - 1, DL, VT));
48235 }
48236 
48237 // Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm
48238 // result type.
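// For example (illustrative): with AVX512VL, (v8i32 sext (setcc v8i32 a, b,
// setgt)) can instead be emitted as (setcc v8i32 a, b, setgt), which lowering
// can turn directly into a full-width vector compare such as VPCMPGTD.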
48239 static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
48240                                const X86Subtarget &Subtarget) {
48241   SDValue N0 = N->getOperand(0);
48242   EVT VT = N->getValueType(0);
48243   SDLoc dl(N);
48244 
48245   // Only do this combine with AVX512 for vector extends.
48246   if (!Subtarget.hasAVX512() || !VT.isVector() || N0.getOpcode() != ISD::SETCC)
48247     return SDValue();
48248 
48249   // Only combine legal element types.
48250   EVT SVT = VT.getVectorElementType();
48251   if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
48252       SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
48253     return SDValue();
48254 
48255   // We can only do this if the vector size is 256 bits or less.
48256   unsigned Size = VT.getSizeInBits();
48257   if (Size > 256 && Subtarget.useAVX512Regs())
48258     return SDValue();
48259 
48260   // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
48261   // those are the only integer compares we have.
48262   ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
48263   if (ISD::isUnsignedIntSetCC(CC))
48264     return SDValue();
48265 
48266   // Only do this combine if the extension will be fully consumed by the setcc.
48267   EVT N00VT = N0.getOperand(0).getValueType();
48268   EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
48269   if (Size != MatchingVecType.getSizeInBits())
48270     return SDValue();
48271 
48272   SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
48273 
48274   if (N->getOpcode() == ISD::ZERO_EXTEND)
48275     Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType());
48276 
48277   return Res;
48278 }
48279 
48280 static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
48281                            TargetLowering::DAGCombinerInfo &DCI,
48282                            const X86Subtarget &Subtarget) {
48283   SDValue N0 = N->getOperand(0);
48284   EVT VT = N->getValueType(0);
48285   SDLoc DL(N);
48286 
48287   // (i32 (sext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
48288   if (!DCI.isBeforeLegalizeOps() &&
48289       N0.getOpcode() == X86ISD::SETCC_CARRY) {
48290     SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, N0->getOperand(0),
48291                                  N0->getOperand(1));
48292     bool ReplaceOtherUses = !N0.hasOneUse();
48293     DCI.CombineTo(N, Setcc);
48294     // Replace other uses with a truncate of the widened setcc_carry.
48295     if (ReplaceOtherUses) {
48296       SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
48297                                   N0.getValueType(), Setcc);
48298       DCI.CombineTo(N0.getNode(), Trunc);
48299     }
48300 
48301     return SDValue(N, 0);
48302   }
48303 
48304   if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
48305     return NewCMov;
48306 
48307   if (!DCI.isBeforeLegalizeOps())
48308     return SDValue();
48309 
48310   if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
48311     return V;
48312 
48313   if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
48314     return V;
48315 
48316   if (VT.isVector()) {
48317     if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
48318       return R;
48319 
48320     if (N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG)
48321       return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0));
48322   }
48323 
48324   if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
48325     return NewAdd;
48326 
48327   return SDValue();
48328 }
48329 
48330 static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
48331                           TargetLowering::DAGCombinerInfo &DCI,
48332                           const X86Subtarget &Subtarget) {
48333   SDLoc dl(N);
48334   EVT VT = N->getValueType(0);
48335   bool IsStrict = N->isStrictFPOpcode() || N->isTargetStrictFPOpcode();
48336 
48337   // Let legalize expand this if it isn't a legal type yet.
48338   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48339   if (!TLI.isTypeLegal(VT))
48340     return SDValue();
48341 
48342   SDValue A = N->getOperand(IsStrict ? 1 : 0);
48343   SDValue B = N->getOperand(IsStrict ? 2 : 1);
48344   SDValue C = N->getOperand(IsStrict ? 3 : 2);
48345 
48346   // If the operation allows fast-math and the target does not support FMA,
48347   // split this into mul+add to avoid libcall(s).
48348   SDNodeFlags Flags = N->getFlags();
48349   if (!IsStrict && Flags.hasAllowReassociation() &&
48350       TLI.isOperationExpand(ISD::FMA, VT)) {
48351     SDValue Fmul = DAG.getNode(ISD::FMUL, dl, VT, A, B, Flags);
48352     return DAG.getNode(ISD::FADD, dl, VT, Fmul, C, Flags);
48353   }
48354 
48355   EVT ScalarVT = VT.getScalarType();
48356   if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA())
48357     return SDValue();
48358 
48359   auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) {
48360     bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
48361     bool LegalOperations = !DCI.isBeforeLegalizeOps();
48362     if (SDValue NegV = TLI.getCheaperNegatedExpression(V, DAG, LegalOperations,
48363                                                        CodeSize)) {
48364       V = NegV;
48365       return true;
48366     }
48367     // Look through extract_vector_elts. If it comes from an FNEG, create a
48368     // new extract from the FNEG input.
48369     if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
48370         isNullConstant(V.getOperand(1))) {
48371       SDValue Vec = V.getOperand(0);
48372       if (SDValue NegV = TLI.getCheaperNegatedExpression(
48373               Vec, DAG, LegalOperations, CodeSize)) {
48374         V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
48375                         NegV, V.getOperand(1));
48376         return true;
48377       }
48378     }
48379 
48380     return false;
48381   };
48382 
48383   // Do not convert the passthru input of scalar intrinsics.
48384   // FIXME: We could allow negations of the lower element only.
48385   bool NegA = invertIfNegative(A);
48386   bool NegB = invertIfNegative(B);
48387   bool NegC = invertIfNegative(C);
48388 
48389   if (!NegA && !NegB && !NegC)
48390     return SDValue();
48391 
48392   unsigned NewOpcode =
48393       negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false);
48394 
48395   // Propagate fast-math-flags to new FMA node.
48396   SelectionDAG::FlagInserter FlagsInserter(DAG, Flags);
48397   if (IsStrict) {
48398     assert(N->getNumOperands() == 4 && "Shouldn't be greater than 4");
48399     return DAG.getNode(NewOpcode, dl, {VT, MVT::Other},
48400                        {N->getOperand(0), A, B, C});
48401   } else {
48402     if (N->getNumOperands() == 4)
48403       return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
48404     return DAG.getNode(NewOpcode, dl, VT, A, B, C);
48405   }
48406 }
48407 
48408 // Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
48409 // Combine FMSUBADD(A, B, FNEG(C)) -> FMADDSUB(A, B, C)
48410 static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
48411                                TargetLowering::DAGCombinerInfo &DCI) {
48412   SDLoc dl(N);
48413   EVT VT = N->getValueType(0);
48414   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48415   bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
48416   bool LegalOperations = !DCI.isBeforeLegalizeOps();
48417 
48418   SDValue N2 = N->getOperand(2);
48419 
48420   SDValue NegN2 =
48421       TLI.getCheaperNegatedExpression(N2, DAG, LegalOperations, CodeSize);
48422   if (!NegN2)
48423     return SDValue();
48424   unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false);
48425 
48426   if (N->getNumOperands() == 4)
48427     return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
48428                        NegN2, N->getOperand(3));
48429   return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
48430                      NegN2);
48431 }
48432 
48433 static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
48434                            TargetLowering::DAGCombinerInfo &DCI,
48435                            const X86Subtarget &Subtarget) {
48436   SDLoc dl(N);
48437   SDValue N0 = N->getOperand(0);
48438   EVT VT = N->getValueType(0);
48439 
48440   // (i32 (aext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
48441   // FIXME: Is this needed? We don't seem to have any tests for it.
48442   if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ANY_EXTEND &&
48443       N0.getOpcode() == X86ISD::SETCC_CARRY) {
48444     SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N0->getOperand(0),
48445                                  N0->getOperand(1));
48446     bool ReplaceOtherUses = !N0.hasOneUse();
48447     DCI.CombineTo(N, Setcc);
48448     // Replace other uses with a truncate of the widened setcc_carry.
48449     if (ReplaceOtherUses) {
48450       SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
48451                                   N0.getValueType(), Setcc);
48452       DCI.CombineTo(N0.getNode(), Trunc);
48453     }
48454 
48455     return SDValue(N, 0);
48456   }
48457 
48458   if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
48459     return NewCMov;
48460 
48461   if (DCI.isBeforeLegalizeOps())
48462     if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
48463       return V;
48464 
48465   if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
48466     return V;
48467 
48468   if (VT.isVector())
48469     if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
48470       return R;
48471 
48472   if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
48473     return NewAdd;
48474 
48475   if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
48476     return R;
48477 
48478   // TODO: Combine with any target/faux shuffle.
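  // For example (illustrative): (v8i32 zext (X86ISD::PACKUS (v4i32 a),
  // (v4i32 b))) where the upper 16 bits of every element of a and b are known
  // zero is simply concat_vectors(a, b).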
48479   if (N0.getOpcode() == X86ISD::PACKUS && N0.getValueSizeInBits() == 128 &&
48480       VT.getScalarSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits()) {
48481     SDValue N00 = N0.getOperand(0);
48482     SDValue N01 = N0.getOperand(1);
48483     unsigned NumSrcEltBits = N00.getScalarValueSizeInBits();
48484     APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2);
48485     if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) &&
48486         (N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) {
48487       return concatSubVectors(N00, N01, DAG, dl);
48488     }
48489   }
48490 
48491   return SDValue();
48492 }
48493 
48494 /// Recursive helper for combineVectorSizedSetCCEquality() to see if we have a
48495 /// recognizable memcmp expansion.
48496 static bool isOrXorXorTree(SDValue X, bool Root = true) {
48497   if (X.getOpcode() == ISD::OR)
48498     return isOrXorXorTree(X.getOperand(0), false) &&
48499            isOrXorXorTree(X.getOperand(1), false);
48500   if (Root)
48501     return false;
48502   return X.getOpcode() == ISD::XOR;
48503 }
48504 
48505 /// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp
48506 /// expansion.
48507 template<typename F>
48508 static SDValue emitOrXorXorTree(SDValue X, SDLoc &DL, SelectionDAG &DAG,
48509                                 EVT VecVT, EVT CmpVT, bool HasPT, F SToV) {
48510   SDValue Op0 = X.getOperand(0);
48511   SDValue Op1 = X.getOperand(1);
48512   if (X.getOpcode() == ISD::OR) {
48513     SDValue A = emitOrXorXorTree(Op0, DL, DAG, VecVT, CmpVT, HasPT, SToV);
48514     SDValue B = emitOrXorXorTree(Op1, DL, DAG, VecVT, CmpVT, HasPT, SToV);
48515     if (VecVT != CmpVT)
48516       return DAG.getNode(ISD::OR, DL, CmpVT, A, B);
48517     if (HasPT)
48518       return DAG.getNode(ISD::OR, DL, VecVT, A, B);
48519     return DAG.getNode(ISD::AND, DL, CmpVT, A, B);
48520   } else if (X.getOpcode() == ISD::XOR) {
48521     SDValue A = SToV(Op0);
48522     SDValue B = SToV(Op1);
48523     if (VecVT != CmpVT)
48524       return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETNE);
48525     if (HasPT)
48526       return DAG.getNode(ISD::XOR, DL, VecVT, A, B);
48527     return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
48528   }
48529   llvm_unreachable("Impossible");
48530 }
48531 
48532 /// Try to map a 128-bit or larger integer comparison to vector instructions
48533 /// before type legalization splits it up into chunks.
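/// For example (illustrative), a 128-bit equality such as
/// (setcc (i128 load %p), (i128 load %q), eq) can be emitted as a v16i8
/// PCMPEQB followed by a MOVMSK/PTEST-style test of the result, instead of a
/// chain of scalar compares after type legalization.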
48534 static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
48535                                                const X86Subtarget &Subtarget) {
48536   ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
48537   assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
48538 
48539   // We're looking for an oversized integer equality comparison.
48540   SDValue X = SetCC->getOperand(0);
48541   SDValue Y = SetCC->getOperand(1);
48542   EVT OpVT = X.getValueType();
48543   unsigned OpSize = OpVT.getSizeInBits();
48544   if (!OpVT.isScalarInteger() || OpSize < 128)
48545     return SDValue();
48546 
48547   // Ignore a comparison with zero because that gets special treatment in
48548   // EmitTest(). But make an exception for the special case of a pair of
48549   // logically-combined vector-sized operands compared to zero. This pattern may
48550   // be generated by the memcmp expansion pass with oversized integer compares
48551   // (see PR33325).
48552   bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X);
48553   if (isNullConstant(Y) && !IsOrXorXorTreeCCZero)
48554     return SDValue();
48555 
48556   // Don't perform this combine if constructing the vector will be expensive.
48557   auto IsVectorBitCastCheap = [](SDValue X) {
48558     X = peekThroughBitcasts(X);
48559     return isa<ConstantSDNode>(X) || X.getValueType().isVector() ||
48560            X.getOpcode() == ISD::LOAD;
48561   };
48562   if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) &&
48563       !IsOrXorXorTreeCCZero)
48564     return SDValue();
48565 
48566   EVT VT = SetCC->getValueType(0);
48567   SDLoc DL(SetCC);
48568 
48569   // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
48570   // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
48571   // Otherwise use PCMPEQ (plus AND) and mask testing.
48572   if ((OpSize == 128 && Subtarget.hasSSE2()) ||
48573       (OpSize == 256 && Subtarget.hasAVX()) ||
48574       (OpSize == 512 && Subtarget.useAVX512Regs())) {
48575     bool HasPT = Subtarget.hasSSE41();
48576 
48577     // PTEST and MOVMSK are slow on Knights Landing and Knights Mill and widened
48578     // vector registers are essentially free. (Technically, widening registers
48579     // prevents load folding, but the tradeoff is worth it.)
48580     bool PreferKOT = Subtarget.preferMaskRegisters();
48581     bool NeedZExt = PreferKOT && !Subtarget.hasVLX() && OpSize != 512;
48582 
48583     EVT VecVT = MVT::v16i8;
48584     EVT CmpVT = PreferKOT ? MVT::v16i1 : VecVT;
48585     if (OpSize == 256) {
48586       VecVT = MVT::v32i8;
48587       CmpVT = PreferKOT ? MVT::v32i1 : VecVT;
48588     }
48589     EVT CastVT = VecVT;
48590     bool NeedsAVX512FCast = false;
48591     if (OpSize == 512 || NeedZExt) {
48592       if (Subtarget.hasBWI()) {
48593         VecVT = MVT::v64i8;
48594         CmpVT = MVT::v64i1;
48595         if (OpSize == 512)
48596           CastVT = VecVT;
48597       } else {
48598         VecVT = MVT::v16i32;
48599         CmpVT = MVT::v16i1;
48600         CastVT = OpSize == 512 ? VecVT :
48601                  OpSize == 256 ? MVT::v8i32 : MVT::v4i32;
48602         NeedsAVX512FCast = true;
48603       }
48604     }
48605 
48606     auto ScalarToVector = [&](SDValue X) -> SDValue {
48607       bool TmpZext = false;
48608       EVT TmpCastVT = CastVT;
48609       if (X.getOpcode() == ISD::ZERO_EXTEND) {
48610         SDValue OrigX = X.getOperand(0);
48611         unsigned OrigSize = OrigX.getScalarValueSizeInBits();
48612         if (OrigSize < OpSize) {
48613           if (OrigSize == 128) {
48614             TmpCastVT = NeedsAVX512FCast ? MVT::v4i32 : MVT::v16i8;
48615             X = OrigX;
48616             TmpZext = true;
48617           } else if (OrigSize == 256) {
48618             TmpCastVT = NeedsAVX512FCast ? MVT::v8i32 : MVT::v32i8;
48619             X = OrigX;
48620             TmpZext = true;
48621           }
48622         }
48623       }
48624       X = DAG.getBitcast(TmpCastVT, X);
48625       if (!NeedZExt && !TmpZext)
48626         return X;
48627       return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT,
48628                          DAG.getConstant(0, DL, VecVT), X,
48629                          DAG.getVectorIdxConstant(0, DL));
48630     };
48631 
48632     SDValue Cmp;
48633     if (IsOrXorXorTreeCCZero) {
48634       // This is a bitwise-combined equality comparison of 2 pairs of vectors:
48635       // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
48636       // Use 2 vector equality compares and 'and' the results before doing a
48637       // MOVMSK.
48638       Cmp = emitOrXorXorTree(X, DL, DAG, VecVT, CmpVT, HasPT, ScalarToVector);
48639     } else {
48640       SDValue VecX = ScalarToVector(X);
48641       SDValue VecY = ScalarToVector(Y);
48642       if (VecVT != CmpVT) {
48643         Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETNE);
48644       } else if (HasPT) {
48645         Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY);
48646       } else {
48647         Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
48648       }
48649     }
48650     // AVX512 should emit a setcc that will lower to kortest.
48651     if (VecVT != CmpVT) {
48652       EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64 :
48653                    CmpVT == MVT::v32i1 ? MVT::i32 : MVT::i16;
48654       return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp),
48655                           DAG.getConstant(0, DL, KRegVT), CC);
48656     }
48657     if (HasPT) {
48658       SDValue BCCmp = DAG.getBitcast(OpSize == 256 ? MVT::v4i64 : MVT::v2i64,
48659                                      Cmp);
48660       SDValue PT = DAG.getNode(X86ISD::PTEST, DL, MVT::i32, BCCmp, BCCmp);
48661       X86::CondCode X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
48662       SDValue X86SetCC = getSETCC(X86CC, PT, DL, DAG);
48663       return DAG.getNode(ISD::TRUNCATE, DL, VT, X86SetCC.getValue(0));
48664     }
48665     // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
48666     // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
48667     // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
48668     assert(Cmp.getValueType() == MVT::v16i8 &&
48669            "Non 128-bit vector on pre-SSE41 target");
48670     SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
48671     SDValue FFFFs = DAG.getConstant(0xFFFF, DL, MVT::i32);
48672     return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
48673   }
48674 
48675   return SDValue();
48676 }
48677 
48678 static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
48679                             TargetLowering::DAGCombinerInfo &DCI,
48680                             const X86Subtarget &Subtarget) {
48681   const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
48682   const SDValue LHS = N->getOperand(0);
48683   const SDValue RHS = N->getOperand(1);
48684   EVT VT = N->getValueType(0);
48685   EVT OpVT = LHS.getValueType();
48686   SDLoc DL(N);
48687 
48688   if (CC == ISD::SETNE || CC == ISD::SETEQ) {
48689     if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget))
48690       return V;
48691 
48692     if (VT == MVT::i1 && isNullConstant(RHS)) {
48693       SDValue X86CC;
48694       if (SDValue V =
48695               MatchVectorAllZeroTest(LHS, CC, DL, Subtarget, DAG, X86CC))
48696         return DAG.getNode(ISD::TRUNCATE, DL, VT,
48697                            DAG.getNode(X86ISD::SETCC, DL, MVT::i8, X86CC, V));
48698     }
48699 
48700     if (OpVT.isScalarInteger()) {
48701       // cmpeq(or(X,Y),X) --> cmpeq(and(~X,Y),0)
48702       // cmpne(or(X,Y),X) --> cmpne(and(~X,Y),0)
48703       auto MatchOrCmpEq = [&](SDValue N0, SDValue N1) {
48704         if (N0.getOpcode() == ISD::OR && N0->hasOneUse()) {
48705           if (N0.getOperand(0) == N1)
48706             return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
48707                                N0.getOperand(1));
48708           if (N0.getOperand(1) == N1)
48709             return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
48710                                N0.getOperand(0));
48711         }
48712         return SDValue();
48713       };
48714       if (SDValue AndN = MatchOrCmpEq(LHS, RHS))
48715         return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
48716       if (SDValue AndN = MatchOrCmpEq(RHS, LHS))
48717         return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
48718 
48719       // cmpeq(and(X,Y),Y) --> cmpeq(and(~X,Y),0)
48720       // cmpne(and(X,Y),Y) --> cmpne(and(~X,Y),0)
48721       auto MatchAndCmpEq = [&](SDValue N0, SDValue N1) {
48722         if (N0.getOpcode() == ISD::AND && N0->hasOneUse()) {
48723           if (N0.getOperand(0) == N1)
48724             return DAG.getNode(ISD::AND, DL, OpVT, N1,
48725                                DAG.getNOT(DL, N0.getOperand(1), OpVT));
48726           if (N0.getOperand(1) == N1)
48727             return DAG.getNode(ISD::AND, DL, OpVT, N1,
48728                                DAG.getNOT(DL, N0.getOperand(0), OpVT));
48729         }
48730         return SDValue();
48731       };
48732       if (SDValue AndN = MatchAndCmpEq(LHS, RHS))
48733         return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
48734       if (SDValue AndN = MatchAndCmpEq(RHS, LHS))
48735         return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
48736 
48737       // cmpeq(trunc(x),0) --> cmpeq(x,0)
48738       // cmpne(trunc(x),0) --> cmpne(x,0)
48739       // iff x upper bits are zero.
48740       // TODO: Add support for RHS to be truncate as well?
48741       if (LHS.getOpcode() == ISD::TRUNCATE &&
48742           LHS.getOperand(0).getScalarValueSizeInBits() >= 32 &&
48743           isNullConstant(RHS) && !DCI.isBeforeLegalize()) {
48744         EVT SrcVT = LHS.getOperand(0).getValueType();
48745         APInt UpperBits = APInt::getBitsSetFrom(SrcVT.getScalarSizeInBits(),
48746                                                 OpVT.getScalarSizeInBits());
48747         const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48748         if (DAG.MaskedValueIsZero(LHS.getOperand(0), UpperBits) &&
48749             TLI.isTypeLegal(LHS.getOperand(0).getValueType()))
48750           return DAG.getSetCC(DL, VT, LHS.getOperand(0),
48751                               DAG.getConstant(0, DL, SrcVT), CC);
48752       }
48753     }
48754   }
48755 
48756   if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
48757       (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
48758     // Using temporaries to avoid messing up operand ordering for later
48759     // transformations if this doesn't work.
48760     SDValue Op0 = LHS;
48761     SDValue Op1 = RHS;
48762     ISD::CondCode TmpCC = CC;
48763     // Put build_vector on the right.
48764     if (Op0.getOpcode() == ISD::BUILD_VECTOR) {
48765       std::swap(Op0, Op1);
48766       TmpCC = ISD::getSetCCSwappedOperands(TmpCC);
48767     }
48768 
48769     bool IsSEXT0 =
48770         (Op0.getOpcode() == ISD::SIGN_EXTEND) &&
48771         (Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
48772     bool IsVZero1 = ISD::isBuildVectorAllZeros(Op1.getNode());
48773 
48774     if (IsSEXT0 && IsVZero1) {
48775       assert(VT == Op0.getOperand(0).getValueType() &&
48776              "Unexpected operand type");
48777       if (TmpCC == ISD::SETGT)
48778         return DAG.getConstant(0, DL, VT);
48779       if (TmpCC == ISD::SETLE)
48780         return DAG.getConstant(1, DL, VT);
48781       if (TmpCC == ISD::SETEQ || TmpCC == ISD::SETGE)
48782         return DAG.getNOT(DL, Op0.getOperand(0), VT);
48783 
48784       assert((TmpCC == ISD::SETNE || TmpCC == ISD::SETLT) &&
48785              "Unexpected condition code!");
48786       return Op0.getOperand(0);
48787     }
48788   }
48789 
48790   // If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just
48791   // pre-promote its result type since vXi1 vectors don't get promoted
48792   // during type legalization.
48793   // NOTE: The element count check is to ignore operand types that need to
48794   // go through type promotion to a 128-bit vector.
48795   if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
48796       VT.getVectorElementType() == MVT::i1 &&
48797       (OpVT.getVectorElementType() == MVT::i8 ||
48798        OpVT.getVectorElementType() == MVT::i16)) {
48799     SDValue Setcc = DAG.getSetCC(DL, OpVT, LHS, RHS, CC);
48800     return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
48801   }
48802 
48803   // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
48804   // to avoid scalarization via legalization because v4i32 is not a legal type.
48805   if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
48806       LHS.getValueType() == MVT::v4f32)
48807     return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
48808 
48809   return SDValue();
48810 }
48811 
48812 static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
48813                              TargetLowering::DAGCombinerInfo &DCI,
48814                              const X86Subtarget &Subtarget) {
48815   SDValue Src = N->getOperand(0);
48816   MVT SrcVT = Src.getSimpleValueType();
48817   MVT VT = N->getSimpleValueType(0);
48818   unsigned NumBits = VT.getScalarSizeInBits();
48819   unsigned NumElts = SrcVT.getVectorNumElements();
48820 
48821   // Perform constant folding.
48822   if (ISD::isBuildVectorOfConstantSDNodes(Src.getNode())) {
48823     assert(VT == MVT::i32 && "Unexpected result type");
48824     APInt Imm(32, 0);
48825     for (unsigned Idx = 0, e = Src.getNumOperands(); Idx < e; ++Idx) {
48826       if (!Src.getOperand(Idx).isUndef() &&
48827           Src.getConstantOperandAPInt(Idx).isNegative())
48828         Imm.setBit(Idx);
48829     }
48830     return DAG.getConstant(Imm, SDLoc(N), VT);
48831   }
48832 
48833   // Look through int->fp bitcasts that don't change the element width.
48834   unsigned EltWidth = SrcVT.getScalarSizeInBits();
48835   if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::BITCAST &&
48836       Src.getOperand(0).getScalarValueSizeInBits() == EltWidth)
48837     return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0));
48838 
48839   // Fold movmsk(not(x)) -> not(movmsk(x)) to improve folding of movmsk results
48840   // with scalar comparisons.
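  // For example, for v4i32: (movmsk (xor x, -1)) --> (xor (movmsk x), 0xF).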
48841   if (SDValue NotSrc = IsNOT(Src, DAG)) {
48842     SDLoc DL(N);
48843     APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
48844     NotSrc = DAG.getBitcast(SrcVT, NotSrc);
48845     return DAG.getNode(ISD::XOR, DL, VT,
48846                        DAG.getNode(X86ISD::MOVMSK, DL, VT, NotSrc),
48847                        DAG.getConstant(NotMask, DL, VT));
48848   }
48849 
48850   // Fold movmsk(icmp_sgt(x,-1)) -> not(movmsk(x)) to improve folding of movmsk
48851   // results with scalar comparisons.
48852   if (Src.getOpcode() == X86ISD::PCMPGT &&
48853       ISD::isBuildVectorAllOnes(Src.getOperand(1).getNode())) {
48854     SDLoc DL(N);
48855     APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
48856     return DAG.getNode(ISD::XOR, DL, VT,
48857                        DAG.getNode(X86ISD::MOVMSK, DL, VT, Src.getOperand(0)),
48858                        DAG.getConstant(NotMask, DL, VT));
48859   }
48860 
48861   // Simplify the inputs.
48862   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48863   APInt DemandedMask(APInt::getAllOnesValue(NumBits));
48864   if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
48865     return SDValue(N, 0);
48866 
48867   return SDValue();
48868 }
48869 
48870 static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG,
48871                                        TargetLowering::DAGCombinerInfo &DCI) {
48872   // With vector masks we only demand the upper bit of the mask.
48873   SDValue Mask = cast<X86MaskedGatherScatterSDNode>(N)->getMask();
48874   if (Mask.getScalarValueSizeInBits() != 1) {
48875     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48876     APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
48877     if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
48878       if (N->getOpcode() != ISD::DELETED_NODE)
48879         DCI.AddToWorklist(N);
48880       return SDValue(N, 0);
48881     }
48882   }
48883 
48884   return SDValue();
48885 }
48886 
48887 static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS,
48888                                     SDValue Index, SDValue Base, SDValue Scale,
48889                                     SelectionDAG &DAG) {
48890   SDLoc DL(GorS);
48891 
48892   if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
48893     SDValue Ops[] = { Gather->getChain(), Gather->getPassThru(),
48894                       Gather->getMask(), Base, Index, Scale } ;
48895     return DAG.getMaskedGather(Gather->getVTList(),
48896                                Gather->getMemoryVT(), DL, Ops,
48897                                Gather->getMemOperand(),
48898                                Gather->getIndexType(),
48899                                Gather->getExtensionType());
48900   }
48901   auto *Scatter = cast<MaskedScatterSDNode>(GorS);
48902   SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(),
48903                     Scatter->getMask(), Base, Index, Scale };
48904   return DAG.getMaskedScatter(Scatter->getVTList(),
48905                               Scatter->getMemoryVT(), DL,
48906                               Ops, Scatter->getMemOperand(),
48907                               Scatter->getIndexType(),
48908                               Scatter->isTruncatingStore());
48909 }
48910 
48911 static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
48912                                     TargetLowering::DAGCombinerInfo &DCI) {
48913   SDLoc DL(N);
48914   auto *GorS = cast<MaskedGatherScatterSDNode>(N);
48915   SDValue Index = GorS->getIndex();
48916   SDValue Base = GorS->getBasePtr();
48917   SDValue Scale = GorS->getScale();
48918 
48919   if (DCI.isBeforeLegalize()) {
48920     unsigned IndexWidth = Index.getScalarValueSizeInBits();
48921 
48922     // Shrink constant indices if they are larger than 32-bits.
48923     // Only do this before legalize types since v2i64 could become v2i32.
48924     // FIXME: We could check that the type is legal if we're after legalize
48925     // types, but then we would need to construct test cases where that happens.
48926     // FIXME: We could support more than just constant vectors, but we need to
48927     // be careful with costing. A truncate that can be optimized out would be fine.
48928     // Otherwise we might only want to create a truncate if it avoids a split.
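    // For example (illustrative): a v2i64 constant index <16, 48> has well over
    // 32 sign bits per element, so it can be truncated to v2i32 and still
    // address the same elements.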
    if (auto *BV = dyn_cast<BuildVectorSDNode>(Index)) {
      if (BV->isConstant() && IndexWidth > 32 &&
          DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
        EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
        Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
        return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
      }
    }

    // Shrink any sign/zero extends from 32 bits or smaller to types larger
    // than 32 bits if there are sufficient sign bits. Only do this before
    // legalize types to avoid creating illegal types in truncate.
    if ((Index.getOpcode() == ISD::SIGN_EXTEND ||
         Index.getOpcode() == ISD::ZERO_EXTEND) &&
        IndexWidth > 32 &&
        Index.getOperand(0).getScalarValueSizeInBits() <= 32 &&
        DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
      EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
      Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
      return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
    }
  }

  if (DCI.isBeforeLegalizeOps()) {
    unsigned IndexWidth = Index.getScalarValueSizeInBits();

    // Make sure the index is either i32 or i64.
    if (IndexWidth != 32 && IndexWidth != 64) {
      MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32;
      EVT IndexVT = Index.getValueType().changeVectorElementType(EltVT);
      Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
      return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
    }
  }

  // With vector masks we only demand the upper bit of the mask.
  SDValue Mask = GorS->getMask();
  if (Mask.getScalarValueSizeInBits() != 1) {
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
    if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
      if (N->getOpcode() != ISD::DELETED_NODE)
        DCI.AddToWorklist(N);
      return SDValue(N, 0);
    }
  }

  return SDValue();
}

// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget) {
  SDLoc DL(N);
  X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
  SDValue EFLAGS = N->getOperand(1);

  // Try to simplify the EFLAGS and condition code operands.
  if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
    return getSETCC(CC, Flags, DL, DAG);

  return SDValue();
}

/// Optimize branch condition evaluation.
static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
                             const X86Subtarget &Subtarget) {
  SDLoc DL(N);
  SDValue EFLAGS = N->getOperand(3);
  X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));

  // Try to simplify the EFLAGS and condition code operands.
  // Make sure to not keep references to operands, as combineSetCCEFLAGS can
  // RAUW them under us.
  if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
    SDValue Cond = DAG.getTargetConstant(CC, DL, MVT::i8);
    return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
                       N->getOperand(1), Cond, Flags);
  }

  return SDValue();
}

// TODO: Could we move this to DAGCombine?
static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
                                                  SelectionDAG &DAG) {
  // Take advantage of vector comparisons (etc.) producing 0 or -1 in each lane
  // to optimize away operation when it's from a constant.
  //
  // The general transformation is:
  //    UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
  //       AND(VECTOR_CMP(x,y), constant2)
  //    constant2 = UNARYOP(constant)

  // Early exit if this isn't a vector operation, the operand of the
  // unary operation isn't a bitwise AND, or if the sizes of the operations
  // aren't the same.
  EVT VT = N->getValueType(0);
  bool IsStrict = N->isStrictFPOpcode();
  unsigned NumEltBits = VT.getScalarSizeInBits();
  SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
  if (!VT.isVector() || Op0.getOpcode() != ISD::AND ||
      DAG.ComputeNumSignBits(Op0.getOperand(0)) != NumEltBits ||
      VT.getSizeInBits() != Op0.getValueSizeInBits())
    return SDValue();

  // Now check that the other operand of the AND is a constant. We could
  // make the transformation for non-constant splats as well, but it's unclear
  // that would be a benefit as it would not eliminate any operations, just
  // perform one more step in scalar code before moving to the vector unit.
  if (auto *BV = dyn_cast<BuildVectorSDNode>(Op0.getOperand(1))) {
    // Bail out if the vector isn't a constant.
    if (!BV->isConstant())
      return SDValue();

    // Everything checks out. Build up the new and improved node.
    SDLoc DL(N);
    EVT IntVT = BV->getValueType(0);
    // Create a new constant of the appropriate type for the transformed
    // DAG.
    SDValue SourceConst;
    if (IsStrict)
      SourceConst = DAG.getNode(N->getOpcode(), DL, {VT, MVT::Other},
                                {N->getOperand(0), SDValue(BV, 0)});
    else
      SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
    // The AND node needs bitcasts to/from an integer vector type around it.
    SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
    SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0),
                                 MaskConst);
    SDValue Res = DAG.getBitcast(VT, NewAnd);
    if (IsStrict)
      return DAG.getMergeValues({Res, SourceConst.getValue(1)}, DL);
    return Res;
  }

  return SDValue();
}

/// If we are converting a value to floating-point, try to replace scalar
/// truncate of an extracted vector element with a bitcast. This tries to keep
/// the sequence on XMM registers rather than moving between vector and GPRs.
static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG) {
  // TODO: This is currently only used by combineSIntToFP, but it is generalized
  //       to allow being called by any similar cast opcode.
  // TODO: Consider merging this into lowering: vectorizeExtractedCast().
  SDValue Trunc = N->getOperand(0);
  if (!Trunc.hasOneUse() || Trunc.getOpcode() != ISD::TRUNCATE)
    return SDValue();

  SDValue ExtElt = Trunc.getOperand(0);
  if (!ExtElt.hasOneUse() || ExtElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
      !isNullConstant(ExtElt.getOperand(1)))
    return SDValue();

  EVT TruncVT = Trunc.getValueType();
  EVT SrcVT = ExtElt.getValueType();
  unsigned DestWidth = TruncVT.getSizeInBits();
  unsigned SrcWidth = SrcVT.getSizeInBits();
  if (SrcWidth % DestWidth != 0)
    return SDValue();

  // inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0)
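  // e.g. (sint_to_fp (trunc i64->i32 (extractelt v2i64 X, 0))) can become
  // (sint_to_fp (extractelt (bitcast X to v4i32), 0)); on little-endian x86,
  // element 0 of the bitcast vector overlaps the low 32 bits of the original
  // element 0 (illustrative).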
  EVT SrcVecVT = ExtElt.getOperand(0).getValueType();
  unsigned VecWidth = SrcVecVT.getSizeInBits();
  unsigned NumElts = VecWidth / DestWidth;
  EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, NumElts);
  SDValue BitcastVec = DAG.getBitcast(BitcastVT, ExtElt.getOperand(0));
  SDLoc DL(N);
  SDValue NewExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TruncVT,
                                  BitcastVec, ExtElt.getOperand(1));
  return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtElt);
}

static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget) {
  bool IsStrict = N->isStrictFPOpcode();
  SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
  EVT VT = N->getValueType(0);
  EVT InVT = Op0.getValueType();

  // UINT_TO_FP(vXi1) -> SINT_TO_FP(ZEXT(vXi1 to vXi32))
  // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
  // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
  if (InVT.isVector() && InVT.getScalarSizeInBits() < 32) {
    SDLoc dl(N);
    EVT DstVT = InVT.changeVectorElementType(MVT::i32);
    SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);

    // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
    if (IsStrict)
      return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
                         {N->getOperand(0), P});
    return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
  }

  // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
  // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
  // the optimization here.
  if (DAG.SignBitIsZero(Op0)) {
    if (IsStrict)
      return DAG.getNode(ISD::STRICT_SINT_TO_FP, SDLoc(N), {VT, MVT::Other},
                         {N->getOperand(0), Op0});
    return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
  }

  return SDValue();
}

static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
                               TargetLowering::DAGCombinerInfo &DCI,
                               const X86Subtarget &Subtarget) {
  // First try to optimize away the conversion entirely when it's
  // conditionally from a constant. Vectors only.
  bool IsStrict = N->isStrictFPOpcode();
  if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
    return Res;

  // Now move on to more general possibilities.
  SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
  EVT VT = N->getValueType(0);
  EVT InVT = Op0.getValueType();

  // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
  // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
  // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
  if (InVT.isVector() && InVT.getScalarSizeInBits() < 32) {
    SDLoc dl(N);
    EVT DstVT = InVT.changeVectorElementType(MVT::i32);
    SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
    if (IsStrict)
      return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
                         {N->getOperand(0), P});
    return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
  }

  // Without AVX512DQ we only support i64 to float scalar conversion. For both
  // vectors and scalars, see if we know that the upper bits are all the sign
  // bit, in which case we can truncate the input to i32 and convert from that.
  if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
    unsigned BitWidth = InVT.getScalarSizeInBits();
    unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
    if (NumSignBits >= (BitWidth - 31)) {
      EVT TruncVT = MVT::i32;
      if (InVT.isVector())
        TruncVT = InVT.changeVectorElementType(TruncVT);
      SDLoc dl(N);
      if (DCI.isBeforeLegalize() || TruncVT != MVT::v2i32) {
        SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
        if (IsStrict)
          return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
                             {N->getOperand(0), Trunc});
        return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
      }
      // If we're after legalize and the type is v2i32 we need to shuffle and
      // use CVTSI2P.
      assert(InVT == MVT::v2i64 && "Unexpected VT!");
      SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0);
      SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast,
                                          { 0, 2, -1, -1 });
      if (IsStrict)
        return DAG.getNode(X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
                           {N->getOperand(0), Shuf});
      return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Shuf);
    }
  }

  // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
  // a 32-bit target where SSE doesn't support i64->FP operations.
  if (!Subtarget.useSoftFloat() && Subtarget.hasX87() &&
      Op0.getOpcode() == ISD::LOAD) {
    LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());

    // This transformation is not supported if the result type is f16 or f128.
    if (VT == MVT::f16 || VT == MVT::f128)
      return SDValue();

    // If we have AVX512DQ we can use packed conversion instructions unless
    // the VT is f80.
    if (Subtarget.hasDQI() && VT != MVT::f80)
      return SDValue();

    if (Ld->isSimple() && !VT.isVector() && ISD::isNormalLoad(Op0.getNode()) &&
        Op0.hasOneUse() && !Subtarget.is64Bit() && InVT == MVT::i64) {
      std::pair<SDValue, SDValue> Tmp =
          Subtarget.getTargetLowering()->BuildFILD(
              VT, InVT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(),
              Ld->getPointerInfo(), Ld->getOriginalAlign(), DAG);
      DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Tmp.second);
      return Tmp.first;
    }
  }

  if (IsStrict)
    return SDValue();

  if (SDValue V = combineToFPTruncExtElt(N, DAG))
    return V;

  return SDValue();
}

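// Illustrative note: the condition codes checked below map onto EFLAGS bits
// roughly as COND_B/COND_AE/COND_A/COND_BE -> CF (plus ZF for A/BE),
// COND_O/COND_NO -> OF, and COND_G/GE/L/LE -> SF/OF, so any user with one of
// these codes keeps the carry/overflow flags live.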
static bool needCarryOrOverflowFlag(SDValue Flags) {
  assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");

  for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
         UI != UE; ++UI) {
    SDNode *User = *UI;

    X86::CondCode CC;
    switch (User->getOpcode()) {
    default:
      // Be conservative.
      return true;
    case X86ISD::SETCC:
    case X86ISD::SETCC_CARRY:
      CC = (X86::CondCode)User->getConstantOperandVal(0);
      break;
    case X86ISD::BRCOND:
      CC = (X86::CondCode)User->getConstantOperandVal(2);
      break;
    case X86ISD::CMOV:
      CC = (X86::CondCode)User->getConstantOperandVal(2);
      break;
    }

    switch (CC) {
    default: break;
    case X86::COND_A: case X86::COND_AE:
    case X86::COND_B: case X86::COND_BE:
    case X86::COND_O: case X86::COND_NO:
    case X86::COND_G: case X86::COND_GE:
    case X86::COND_L: case X86::COND_LE:
      return true;
    }
  }

  return false;
}

static bool onlyZeroFlagUsed(SDValue Flags) {
  assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");

  for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
         UI != UE; ++UI) {
    SDNode *User = *UI;

    unsigned CCOpNo;
    switch (User->getOpcode()) {
    default:
      // Be conservative.
      return false;
    case X86ISD::SETCC:       CCOpNo = 0; break;
    case X86ISD::SETCC_CARRY: CCOpNo = 0; break;
    case X86ISD::BRCOND:      CCOpNo = 2; break;
    case X86ISD::CMOV:        CCOpNo = 2; break;
    }

    X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
    if (CC != X86::COND_E && CC != X86::COND_NE)
      return false;
  }

  return true;
}

static SDValue combineCMP(SDNode *N, SelectionDAG &DAG) {
  // Only handle test patterns.
  if (!isNullConstant(N->getOperand(1)))
    return SDValue();

  // If we have a CMP of a truncated binop, see if we can make a smaller binop
  // and use its flags directly.
  // TODO: Maybe we should try promoting compares that only use the zero flag
  // first if we can prove the upper bits with computeKnownBits?
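  // Illustrative sketch: (cmp (trunc i64->i32 (xor A, B)), 0) can instead
  // build (X86ISD::XOR (trunc A), (trunc B)) and use its EFLAGS result
  // directly, avoiding the wider xor.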
  SDLoc dl(N);
  SDValue Op = N->getOperand(0);
  EVT VT = Op.getValueType();

  // If we have a constant logical shift that's only used in a comparison
  // against zero turn it into an equivalent AND. This allows turning it into
  // a TEST instruction later.
  if ((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) &&
      Op.hasOneUse() && isa<ConstantSDNode>(Op.getOperand(1)) &&
      onlyZeroFlagUsed(SDValue(N, 0))) {
    unsigned BitWidth = VT.getSizeInBits();
    const APInt &ShAmt = Op.getConstantOperandAPInt(1);
    if (ShAmt.ult(BitWidth)) { // Avoid undefined shifts.
      unsigned MaskBits = BitWidth - ShAmt.getZExtValue();
      APInt Mask = Op.getOpcode() == ISD::SRL
                       ? APInt::getHighBitsSet(BitWidth, MaskBits)
                       : APInt::getLowBitsSet(BitWidth, MaskBits);
      if (Mask.isSignedIntN(32)) {
        Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
                         DAG.getConstant(Mask, dl, VT));
        return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
                           DAG.getConstant(0, dl, VT));
      }
    }
  }

  // Look for a truncate.
  if (Op.getOpcode() != ISD::TRUNCATE)
    return SDValue();

  SDValue Trunc = Op;
  Op = Op.getOperand(0);

  // See if we can compare with zero against the truncation source,
  // which should help using the Z flag from many ops. Only do this for
  // i32 truncated op to prevent partial-reg compares of promoted ops.
  EVT OpVT = Op.getValueType();
  APInt UpperBits =
      APInt::getBitsSetFrom(OpVT.getSizeInBits(), VT.getSizeInBits());
  if (OpVT == MVT::i32 && DAG.MaskedValueIsZero(Op, UpperBits) &&
      onlyZeroFlagUsed(SDValue(N, 0))) {
    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
                       DAG.getConstant(0, dl, OpVT));
  }

  // After this the truncate and arithmetic op must have a single use.
  if (!Trunc.hasOneUse() || !Op.hasOneUse())
    return SDValue();

  unsigned NewOpc;
  switch (Op.getOpcode()) {
  default: return SDValue();
  case ISD::AND:
    // Skip and with constant. We have special handling for and with immediate
    // during isel to generate test instructions.
    if (isa<ConstantSDNode>(Op.getOperand(1)))
      return SDValue();
    NewOpc = X86ISD::AND;
    break;
  case ISD::OR:  NewOpc = X86ISD::OR;  break;
  case ISD::XOR: NewOpc = X86ISD::XOR; break;
  case ISD::ADD:
    // If the carry or overflow flag is used, we can't truncate.
    if (needCarryOrOverflowFlag(SDValue(N, 0)))
      return SDValue();
    NewOpc = X86ISD::ADD;
    break;
  case ISD::SUB:
    // If the carry or overflow flag is used, we can't truncate.
    if (needCarryOrOverflowFlag(SDValue(N, 0)))
      return SDValue();
    NewOpc = X86ISD::SUB;
    break;
  }

  // We found an op we can narrow. Truncate its inputs.
  SDValue Op0 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(0));
  SDValue Op1 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(1));

  // Use an X86-specific opcode to avoid DAG combine messing with it.
  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
  Op = DAG.getNode(NewOpc, dl, VTs, Op0, Op1);

  // For AND, keep a CMP so that we can match the test pattern.
  if (NewOpc == X86ISD::AND)
    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
                       DAG.getConstant(0, dl, VT));

  // Return the flags.
  return Op.getValue(1);
}

static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG,
                                TargetLowering::DAGCombinerInfo &DCI) {
  assert((X86ISD::ADD == N->getOpcode() || X86ISD::SUB == N->getOpcode()) &&
         "Expected X86ISD::ADD or X86ISD::SUB");

  SDLoc DL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  MVT VT = LHS.getSimpleValueType();
  unsigned GenericOpc = X86ISD::ADD == N->getOpcode() ? ISD::ADD : ISD::SUB;

  // If we don't use the flag result, simplify back to a generic ADD/SUB.
  if (!N->hasAnyUseOfValue(1)) {
    SDValue Res = DAG.getNode(GenericOpc, DL, VT, LHS, RHS);
    return DAG.getMergeValues({Res, DAG.getConstant(0, DL, MVT::i32)}, DL);
  }

  // Fold any similar generic ADD/SUB opcodes to reuse this node.
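  // e.g. if this node is (X86ISD::SUB x, y) and a generic (ISD::SUB x, y)
  // also exists, the generic node can reuse value #0 of this node; for the
  // commuted (ISD::SUB y, x) we substitute its negation (sketch of intent).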
  auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) {
    SDValue Ops[] = {N0, N1};
    SDVTList VTs = DAG.getVTList(N->getValueType(0));
    if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) {
      SDValue Op(N, 0);
      if (Negate)
        Op = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Op);
      DCI.CombineTo(GenericAddSub, Op);
    }
  };
  MatchGeneric(LHS, RHS, false);
  MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());

  return SDValue();
}

static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
  if (SDValue Flags = combineCarryThroughADD(N->getOperand(2), DAG)) {
    MVT VT = N->getSimpleValueType(0);
    SDVTList VTs = DAG.getVTList(VT, MVT::i32);
    return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs,
                       N->getOperand(0), N->getOperand(1),
                       Flags);
  }

  // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry)
  // iff the flag result is dead.
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op1) &&
      !N->hasAnyUseOfValue(1))
    return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), Op0.getOperand(0),
                       Op0.getOperand(1), N->getOperand(2));

  return SDValue();
}

// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
                          TargetLowering::DAGCombinerInfo &DCI) {
  // If the LHS and RHS of the ADC node are zero, then it can't overflow and
  // the result is either zero or one (depending on the input carry bit).
  // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
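  // e.g. (adc 0, 0, EFLAGS) is just the incoming carry bit, i.e.
  // (and (setcc_carry COND_B, EFLAGS), 1), with a known-zero carry-out
  // (illustrative).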
  if (X86::isZeroNode(N->getOperand(0)) &&
      X86::isZeroNode(N->getOperand(1)) &&
      // We don't have a good way to replace an EFLAGS use, so only do this when
      // dead right now.
      SDValue(N, 1).use_empty()) {
    SDLoc DL(N);
    EVT VT = N->getValueType(0);
    SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
    SDValue Res1 =
        DAG.getNode(ISD::AND, DL, VT,
                    DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
                                DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
                                N->getOperand(2)),
                    DAG.getConstant(1, DL, VT));
    return DCI.CombineTo(N, Res1, CarryOut);
  }

  if (SDValue Flags = combineCarryThroughADD(N->getOperand(2), DAG)) {
    MVT VT = N->getSimpleValueType(0);
    SDVTList VTs = DAG.getVTList(VT, MVT::i32);
    return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs,
                       N->getOperand(0), N->getOperand(1),
                       Flags);
  }

  return SDValue();
}

/// If this is an add or subtract where one operand is produced by a cmp+setcc,
/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
/// with CMP+{ADC, SBB}.
static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
  bool IsSub = N->getOpcode() == ISD::SUB;
  SDValue X = N->getOperand(0);
  SDValue Y = N->getOperand(1);

  // If this is an add, canonicalize a zext operand to the RHS.
  // TODO: Incomplete? What if both sides are zexts?
  if (!IsSub && X.getOpcode() == ISD::ZERO_EXTEND &&
      Y.getOpcode() != ISD::ZERO_EXTEND)
    std::swap(X, Y);

  // Look through a one-use zext.
  bool PeekedThroughZext = false;
  if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) {
    Y = Y.getOperand(0);
    PeekedThroughZext = true;
  }

  // If this is an add, canonicalize a setcc operand to the RHS.
  // TODO: Incomplete? What if both sides are setcc?
  // TODO: Should we allow peeking through a zext of the other operand?
  if (!IsSub && !PeekedThroughZext && X.getOpcode() == X86ISD::SETCC &&
      Y.getOpcode() != X86ISD::SETCC)
    std::swap(X, Y);

  if (Y.getOpcode() != X86ISD::SETCC || !Y.hasOneUse())
    return SDValue();

  SDLoc DL(N);
  EVT VT = N->getValueType(0);
  X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0);

  // If X is -1 or 0, then we have an opportunity to avoid constants required in
  // the general case below.
  auto *ConstantX = dyn_cast<ConstantSDNode>(X);
  if (ConstantX) {
    if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnesValue()) ||
        (IsSub && CC == X86::COND_B && ConstantX->isNullValue())) {
      // This is a complicated way to get -1 or 0 from the carry flag:
      // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
      //  0 - SETB  -->  0 -  (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
      return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
                         DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
                         Y.getOperand(1));
    }

    if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnesValue()) ||
        (IsSub && CC == X86::COND_A && ConstantX->isNullValue())) {
      SDValue EFLAGS = Y->getOperand(1);
      if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
          EFLAGS.getValueType().isInteger() &&
          !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
        // Swap the operands of a SUB, and we have the same pattern as above.
        // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
        //  0 - SETA  (SUB A, B) -->  0 - SETB  (SUB B, A) --> SUB + SBB
        SDValue NewSub = DAG.getNode(
            X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
            EFLAGS.getOperand(1), EFLAGS.getOperand(0));
        SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
        return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
                           DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
                           NewEFLAGS);
      }
    }
  }

  if (CC == X86::COND_B) {
    // X + SETB Z --> adc X, 0
    // X - SETB Z --> sbb X, 0
    return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
                       DAG.getVTList(VT, MVT::i32), X,
                       DAG.getConstant(0, DL, VT), Y.getOperand(1));
  }

  if (CC == X86::COND_A) {
    SDValue EFLAGS = Y.getOperand(1);
    // Try to convert COND_A into COND_B in an attempt to facilitate
    // materializing "setb reg".
    //
    // Do not flip "e > c", where "c" is a constant, because Cmp instruction
    // cannot take an immediate as its first operand.
    //
    if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
        EFLAGS.getValueType().isInteger() &&
        !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
      SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
                                   EFLAGS.getNode()->getVTList(),
                                   EFLAGS.getOperand(1), EFLAGS.getOperand(0));
      SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
      return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
                         DAG.getVTList(VT, MVT::i32), X,
                         DAG.getConstant(0, DL, VT), NewEFLAGS);
    }
  }

  if (CC == X86::COND_AE) {
    // X + SETAE --> sbb X, -1
    // X - SETAE --> adc X, -1
    return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
                       DAG.getVTList(VT, MVT::i32), X,
                       DAG.getConstant(-1, DL, VT), Y.getOperand(1));
  }

  if (CC == X86::COND_BE) {
    // X + SETBE --> sbb X, -1
    // X - SETBE --> adc X, -1
    SDValue EFLAGS = Y.getOperand(1);
    // Try to convert COND_BE into COND_AE in an attempt to facilitate
    // materializing "setae reg".
    //
    // Do not flip "e <= c", where "c" is a constant, because Cmp instruction
    // cannot take an immediate as its first operand.
    //
    if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
        EFLAGS.getValueType().isInteger() &&
        !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
      SDValue NewSub = DAG.getNode(
          X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
          EFLAGS.getOperand(1), EFLAGS.getOperand(0));
      SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
      return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
                         DAG.getVTList(VT, MVT::i32), X,
                         DAG.getConstant(-1, DL, VT), NewEFLAGS);
    }
  }

  if (CC != X86::COND_E && CC != X86::COND_NE)
    return SDValue();

  SDValue Cmp = Y.getOperand(1);
  if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
      !X86::isZeroNode(Cmp.getOperand(1)) ||
      !Cmp.getOperand(0).getValueType().isInteger())
    return SDValue();

  SDValue Z = Cmp.getOperand(0);
  EVT ZVT = Z.getValueType();

  // If X is -1 or 0, then we have an opportunity to avoid constants required in
  // the general case below.
  if (ConstantX) {
    // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
    // fake operands:
    //  0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
    // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
    if ((IsSub && CC == X86::COND_NE && ConstantX->isNullValue()) ||
        (!IsSub && CC == X86::COND_E && ConstantX->isAllOnesValue())) {
      SDValue Zero = DAG.getConstant(0, DL, ZVT);
      SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
      SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
      return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
                         DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
                         SDValue(Neg.getNode(), 1));
    }

    // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
    // with fake operands:
    //  0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
    // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
    if ((IsSub && CC == X86::COND_E && ConstantX->isNullValue()) ||
        (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnesValue())) {
      SDValue One = DAG.getConstant(1, DL, ZVT);
      SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
      SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
      return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
                         DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
                         Cmp1.getValue(1));
    }
  }

  // (cmp Z, 1) sets the carry flag if Z is 0.
  SDValue One = DAG.getConstant(1, DL, ZVT);
  SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
  SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);

  // Add the flags type for ADC/SBB nodes.
  SDVTList VTs = DAG.getVTList(VT, MVT::i32);

  // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
  // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
  if (CC == X86::COND_NE)
    return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
                       DAG.getConstant(-1ULL, DL, VT), Cmp1.getValue(1));

  // X - (Z == 0) --> sub X, (zext(sete  Z, 0)) --> sbb X, 0, (cmp Z, 1)
  // X + (Z == 0) --> add X, (zext(sete  Z, 0)) --> adc X, 0, (cmp Z, 1)
  return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
                     DAG.getConstant(0, DL, VT), Cmp1.getValue(1));
}

static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
                            const SDLoc &DL, EVT VT,
                            const X86Subtarget &Subtarget) {
  // Example of pattern we try to detect:
  // t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
  //(add (build_vector (extract_elt t, 0),
  //                   (extract_elt t, 2),
  //                   (extract_elt t, 4),
  //                   (extract_elt t, 6)),
  //     (build_vector (extract_elt t, 1),
  //                   (extract_elt t, 3),
  //                   (extract_elt t, 5),
  //                   (extract_elt t, 7)))
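  // If the pattern matches and the multiply can safely be narrowed to 16 bits,
  // the whole expression becomes a single (vpmaddwd x0, x1), split per 128-bit
  // chunk as needed (illustrative summary).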

  if (!Subtarget.hasSSE2())
    return SDValue();

  if (Op0.getOpcode() != ISD::BUILD_VECTOR ||
      Op1.getOpcode() != ISD::BUILD_VECTOR)
    return SDValue();

  if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
      VT.getVectorNumElements() < 4 ||
      !isPowerOf2_32(VT.getVectorNumElements()))
    return SDValue();

  // Check if one of Op0,Op1 is of the form:
  // (build_vector (extract_elt Mul, 0),
  //               (extract_elt Mul, 2),
  //               (extract_elt Mul, 4),
  //                   ...
  // the other is of the form:
  // (build_vector (extract_elt Mul, 1),
  //               (extract_elt Mul, 3),
  //               (extract_elt Mul, 5),
  //                   ...
  // and identify Mul.
  SDValue Mul;
  for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
    SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
            Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
    // TODO: Be more tolerant to undefs.
    if (Op0L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        Op1L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        Op0H.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        Op1H.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return SDValue();
    auto *Const0L = dyn_cast<ConstantSDNode>(Op0L->getOperand(1));
    auto *Const1L = dyn_cast<ConstantSDNode>(Op1L->getOperand(1));
    auto *Const0H = dyn_cast<ConstantSDNode>(Op0H->getOperand(1));
    auto *Const1H = dyn_cast<ConstantSDNode>(Op1H->getOperand(1));
    if (!Const0L || !Const1L || !Const0H || !Const1H)
      return SDValue();
    unsigned Idx0L = Const0L->getZExtValue(), Idx1L = Const1L->getZExtValue(),
             Idx0H = Const0H->getZExtValue(), Idx1H = Const1H->getZExtValue();
    // Commutativity of mul allows factors of a product to reorder.
    if (Idx0L > Idx1L)
      std::swap(Idx0L, Idx1L);
    if (Idx0H > Idx1H)
      std::swap(Idx0H, Idx1H);
    // Commutativity of add allows pairs of factors to reorder.
    if (Idx0L > Idx0H) {
      std::swap(Idx0L, Idx0H);
      std::swap(Idx1L, Idx1H);
    }
    if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 ||
        Idx1H != 2 * i + 3)
      return SDValue();
    if (!Mul) {
      // First time an extract_elt's source vector is visited. Must be a MUL
      // with 2X number of vector elements than the BUILD_VECTOR.
      // Both extracts must be from same MUL.
      Mul = Op0L->getOperand(0);
      if (Mul->getOpcode() != ISD::MUL ||
          Mul.getValueType().getVectorNumElements() != 2 * e)
        return SDValue();
    }
    // Check that the extract is from the same MUL previously seen.
    if (Mul != Op0L->getOperand(0) || Mul != Op1L->getOperand(0) ||
        Mul != Op0H->getOperand(0) || Mul != Op1H->getOperand(0))
      return SDValue();
  }

  // Check if the Mul source can be safely shrunk.
  ShrinkMode Mode;
  if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) ||
      Mode == ShrinkMode::MULU16)
    return SDValue();

  EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
                                 VT.getVectorNumElements() * 2);
  SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(0));
  SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(1));

  auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
                         ArrayRef<SDValue> Ops) {
    EVT InVT = Ops[0].getValueType();
    assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
    EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
                                 InVT.getVectorNumElements() / 2);
    return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
  };
  return SplitOpsAndApply(DAG, Subtarget, DL, VT, { N0, N1 }, PMADDBuilder);
}

// Attempt to turn this pattern into PMADDWD.
// (add (mul (sext (build_vector)), (sext (build_vector))),
//      (mul (sext (build_vector)), (sext (build_vector)))
static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
                              const SDLoc &DL, EVT VT,
                              const X86Subtarget &Subtarget) {
  if (!Subtarget.hasSSE2())
    return SDValue();

  if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
    return SDValue();

  if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
      VT.getVectorNumElements() < 4 ||
      !isPowerOf2_32(VT.getVectorNumElements()))
    return SDValue();

  SDValue N00 = N0.getOperand(0);
  SDValue N01 = N0.getOperand(1);
  SDValue N10 = N1.getOperand(0);
  SDValue N11 = N1.getOperand(1);

  // All inputs need to be sign extends.
  // TODO: Support ZERO_EXTEND from known positive?
  if (N00.getOpcode() != ISD::SIGN_EXTEND ||
      N01.getOpcode() != ISD::SIGN_EXTEND ||
      N10.getOpcode() != ISD::SIGN_EXTEND ||
      N11.getOpcode() != ISD::SIGN_EXTEND)
    return SDValue();

  // Peek through the extends.
  N00 = N00.getOperand(0);
  N01 = N01.getOperand(0);
  N10 = N10.getOperand(0);
  N11 = N11.getOperand(0);

  // Must be extending from vXi16.
  EVT InVT = N00.getValueType();
  if (InVT.getVectorElementType() != MVT::i16 || N01.getValueType() != InVT ||
      N10.getValueType() != InVT || N11.getValueType() != InVT)
    return SDValue();

  // All inputs should be build_vectors.
  if (N00.getOpcode() != ISD::BUILD_VECTOR ||
      N01.getOpcode() != ISD::BUILD_VECTOR ||
      N10.getOpcode() != ISD::BUILD_VECTOR ||
      N11.getOpcode() != ISD::BUILD_VECTOR)
    return SDValue();

  // For each element, we need to ensure we have an odd element from one vector
  // multiplied by the odd element of another vector and the even element from
  // one of the same vectors being multiplied by the even element from the
  // other vector. So we need to make sure for each element i, this operator
  // is being performed:
  //  A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
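  // This per-lane sum is exactly what vpmaddwd computes, so if every element
  // matches, the add-of-muls can be emitted as (X86ISD::VPMADDWD A, B)
  // (illustrative).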
  SDValue In0, In1;
  for (unsigned i = 0; i != N00.getNumOperands(); ++i) {
    SDValue N00Elt = N00.getOperand(i);
    SDValue N01Elt = N01.getOperand(i);
    SDValue N10Elt = N10.getOperand(i);
    SDValue N11Elt = N11.getOperand(i);
    // TODO: Be more tolerant to undefs.
    if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return SDValue();
    auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
    auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
    auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
    auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
    if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
      return SDValue();
    unsigned IdxN00 = ConstN00Elt->getZExtValue();
    unsigned IdxN01 = ConstN01Elt->getZExtValue();
    unsigned IdxN10 = ConstN10Elt->getZExtValue();
    unsigned IdxN11 = ConstN11Elt->getZExtValue();
    // Add is commutative so indices can be reordered.
    if (IdxN00 > IdxN10) {
      std::swap(IdxN00, IdxN10);
      std::swap(IdxN01, IdxN11);
    }
    // N0 indices must be the even element. N1 indices must be the next odd
    // element.
    if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
        IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
      return SDValue();
    SDValue N00In = N00Elt.getOperand(0);
    SDValue N01In = N01Elt.getOperand(0);
    SDValue N10In = N10Elt.getOperand(0);
    SDValue N11In = N11Elt.getOperand(0);

    // First time we find an input capture it.
    if (!In0) {
      In0 = N00In;
      In1 = N01In;

      // The input vectors must be at least as wide as the output.
      // If they are larger than the output, we extract subvector below.
      if (In0.getValueSizeInBits() < VT.getSizeInBits() ||
          In1.getValueSizeInBits() < VT.getSizeInBits())
        return SDValue();
    }
    // Mul is commutative so the input vectors can be in any order.
    // Canonicalize to make the compares easier.
    if (In0 != N00In)
      std::swap(N00In, N01In);
    if (In0 != N10In)
      std::swap(N10In, N11In);
    if (In0 != N00In || In1 != N01In || In0 != N10In || In1 != N11In)
      return SDValue();
  }

  auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
                         ArrayRef<SDValue> Ops) {
    EVT OpVT = Ops[0].getValueType();
    assert(OpVT.getScalarType() == MVT::i16 &&
           "Unexpected scalar element type");
    assert(OpVT == Ops[1].getValueType() && "Operands' types mismatch");
    EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
                                 OpVT.getVectorNumElements() / 2);
    return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
  };

  // If the output is narrower than an input, extract the low part of the input
  // vector.
  EVT OutVT16 = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
                                 VT.getVectorNumElements() * 2);
  if (OutVT16.bitsLT(In0.getValueType())) {
    In0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In0,
                      DAG.getIntPtrConstant(0, DL));
  }
  if (OutVT16.bitsLT(In1.getValueType())) {
    In1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In1,
                      DAG.getIntPtrConstant(0, DL));
  }
  return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
                          PMADDBuilder);
}

/// CMOV of constants requires materializing constant operands in registers.
/// Try to fold those constants into an 'add' instruction to reduce instruction
/// count. We do this with CMOV rather than the generic 'select' because there
/// are earlier folds that may be used to turn select-of-constants into logic
/// hacks.
static SDValue pushAddIntoCmovOfConsts(SDNode *N, SelectionDAG &DAG) {
  // If an operand is zero, add-of-0 gets simplified away, so that's clearly
  // better because we eliminate 1-2 instructions. This transform is still
  // an improvement without zero operands because we trade 2 move constants and
  // 1 add for 2 adds (LEA) as long as the constants can be represented as
  // immediate asm operands (fit in 32-bits).
  auto isSuitableCmov = [](SDValue V) {
    if (V.getOpcode() != X86ISD::CMOV || !V.hasOneUse())
      return false;
    if (!isa<ConstantSDNode>(V.getOperand(0)) ||
        !isa<ConstantSDNode>(V.getOperand(1)))
      return false;
    return isNullConstant(V.getOperand(0)) || isNullConstant(V.getOperand(1)) ||
           (V.getConstantOperandAPInt(0).isSignedIntN(32) &&
            V.getConstantOperandAPInt(1).isSignedIntN(32));
  };

  // Match an appropriate CMOV as the first operand of the add.
  SDValue Cmov = N->getOperand(0);
  SDValue OtherOp = N->getOperand(1);
  if (!isSuitableCmov(Cmov))
    std::swap(Cmov, OtherOp);
  if (!isSuitableCmov(Cmov))
    return SDValue();

  // add (cmov C1, C2), OtherOp --> cmov (add OtherOp, C1), (add OtherOp, C2)
  EVT VT = N->getValueType(0);
  SDLoc DL(N);
  SDValue FalseOp = Cmov.getOperand(0);
  SDValue TrueOp = Cmov.getOperand(1);
  FalseOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, FalseOp);
  TrueOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, TrueOp);
  return DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp, Cmov.getOperand(2),
                     Cmov.getOperand(3));
}

static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
                          TargetLowering::DAGCombinerInfo &DCI,
                          const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);

  if (SDValue Select = pushAddIntoCmovOfConsts(N, DAG))
    return Select;

  if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, SDLoc(N), VT, Subtarget))
    return MAdd;
  if (SDValue MAdd = matchPMADDWD_2(DAG, Op0, Op1, SDLoc(N), VT, Subtarget))
    return MAdd;

  // Try to synthesize horizontal adds from adds of shuffles.
  if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
    return V;

  // If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into
  // (sub Y, (sext (vXi1 X))).
  // FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y) in
  // generic DAG combine without a legal type check, but adding this there
  // caused regressions.
  if (VT.isVector()) {
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    if (Op0.getOpcode() == ISD::ZERO_EXTEND &&
        Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
        TLI.isTypeLegal(Op0.getOperand(0).getValueType())) {
      SDLoc DL(N);
      SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op0.getOperand(0));
      return DAG.getNode(ISD::SUB, DL, VT, Op1, SExt);
    }

    if (Op1.getOpcode() == ISD::ZERO_EXTEND &&
        Op1.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
        TLI.isTypeLegal(Op1.getOperand(0).getValueType())) {
      SDLoc DL(N);
      SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op1.getOperand(0));
      return DAG.getNode(ISD::SUB, DL, VT, Op0, SExt);
    }
  }

  return combineAddOrSubToADCOrSBB(N, DAG);
}

static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
                          TargetLowering::DAGCombinerInfo &DCI,
                          const X86Subtarget &Subtarget) {
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);

  // TODO: Add NoOpaque handling to isConstantIntBuildVectorOrConstantInt.
  auto IsNonOpaqueConstant = [&](SDValue Op) {
    if (SDNode *C = DAG.isConstantIntBuildVectorOrConstantInt(Op)) {
      if (auto *Cst = dyn_cast<ConstantSDNode>(C))
        return !Cst->isOpaque();
      return true;
    }
    return false;
  };

  // X86 can't encode an immediate LHS of a sub. See if we can push the
  // negation into a preceding instruction. If the RHS of the sub is a XOR with
  // one use and a constant, invert the immediate, saving one register.
  // sub(C1, xor(X, C2)) -> add(xor(X, ~C2), C1+1)
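  // (Derivation sketch) C1 - (X ^ C2) = C1 + (~(X ^ C2) + 1)
  //                                   = (X ^ ~C2) + (C1 + 1).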
  if (Op1.getOpcode() == ISD::XOR && IsNonOpaqueConstant(Op0) &&
      IsNonOpaqueConstant(Op1.getOperand(1)) && Op1->hasOneUse()) {
    SDLoc DL(N);
    EVT VT = Op0.getValueType();
    SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT, Op1.getOperand(0),
                                 DAG.getNOT(SDLoc(Op1), Op1.getOperand(1), VT));
    SDValue NewAdd =
        DAG.getNode(ISD::ADD, DL, VT, Op0, DAG.getConstant(1, DL, VT));
    return DAG.getNode(ISD::ADD, DL, VT, NewXor, NewAdd);
  }

  // Try to synthesize horizontal subs from subs of shuffles.
  if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
    return V;

  return combineAddOrSubToADCOrSBB(N, DAG);
}

static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
                                    const X86Subtarget &Subtarget) {
  MVT VT = N->getSimpleValueType(0);
  SDLoc DL(N);

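  // e.g. with identical operands, pcmpeq x, x is all-ones (every lane compares
  // equal) and pcmpgt x, x is all-zeros (no lane is strictly greater than
  // itself); these are integer compares, so there is no NaN concern
  // (illustrative).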
  if (N->getOperand(0) == N->getOperand(1)) {
    if (N->getOpcode() == X86ISD::PCMPEQ)
      return DAG.getConstant(-1, DL, VT);
    if (N->getOpcode() == X86ISD::PCMPGT)
      return DAG.getConstant(0, DL, VT);
  }

  return SDValue();
}

/// Helper that combines an array of subvector ops as if they were the operands
/// of an ISD::CONCAT_VECTORS node, but may have come from another source (e.g.
/// ISD::INSERT_SUBVECTOR). The ops are assumed to be of the same type.
static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
                                      ArrayRef<SDValue> Ops, SelectionDAG &DAG,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const X86Subtarget &Subtarget) {
  assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");
  unsigned EltSizeInBits = VT.getScalarSizeInBits();

  if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
    return DAG.getUNDEF(VT);

  if (llvm::all_of(Ops, [](SDValue Op) {
        return ISD::isBuildVectorAllZeros(Op.getNode());
      }))
    return getZeroVector(VT, Subtarget, DAG, DL);

  SDValue Op0 = Ops[0];
  bool IsSplat = llvm::all_of(Ops, [&Op0](SDValue Op) { return Op == Op0; });

  // Repeated subvectors.
  if (IsSplat &&
      (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
    // If this broadcast is inserted into both halves, use a larger broadcast.
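    // e.g. concat(vbroadcast x (v4f32), vbroadcast x (v4f32)) can simply be
    // rebuilt as a single vbroadcast of x with the wider v8f32 type
    // (illustrative).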
50086     if (Op0.getOpcode() == X86ISD::VBROADCAST)
50087       return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));
50088 
50089     // If this scalar/subvector broadcast_load is inserted into both halves, use
50090     // a larger broadcast_load. Update other uses to use an extracted subvector.
50091     if (Op0.getOpcode() == X86ISD::VBROADCAST_LOAD ||
50092         Op0.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
50093       auto *MemIntr = cast<MemIntrinsicSDNode>(Op0);
50094       SDVTList Tys = DAG.getVTList(VT, MVT::Other);
50095       SDValue Ops[] = {MemIntr->getChain(), MemIntr->getBasePtr()};
50096       SDValue BcastLd = DAG.getMemIntrinsicNode(Op0.getOpcode(), DL, Tys, Ops,
50097                                                 MemIntr->getMemoryVT(),
50098                                                 MemIntr->getMemOperand());
50099       DAG.ReplaceAllUsesOfValueWith(
50100           Op0, extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits()));
50101       DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
50102       return BcastLd;
50103     }
50104 
50105     // If this is a simple subvector load repeated across multiple lanes, then
50106     // broadcast the load. Update other uses to use an extracted subvector.
50107     if (auto *Ld = dyn_cast<LoadSDNode>(Op0)) {
50108       if (Ld->isSimple() && !Ld->isNonTemporal() &&
50109           Ld->getExtensionType() == ISD::NON_EXTLOAD) {
50110         SDVTList Tys = DAG.getVTList(VT, MVT::Other);
50111         SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
50112         SDValue BcastLd =
50113             DAG.getMemIntrinsicNode(X86ISD::SUBV_BROADCAST_LOAD, DL, Tys, Ops,
50114                                     Ld->getMemoryVT(), Ld->getMemOperand());
50115         DAG.ReplaceAllUsesOfValueWith(
50116             Op0,
50117             extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits()));
50118         DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), BcastLd.getValue(1));
50119         return BcastLd;
50120       }
50121     }
50122 
50123     // concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
50124     if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
50125         (Subtarget.hasAVX2() || MayFoldLoad(Op0.getOperand(0))))
50126       return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
50127                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
50128                                      Op0.getOperand(0),
50129                                      DAG.getIntPtrConstant(0, DL)));
50130 
50131     // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
50132     if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
50133         (Subtarget.hasAVX2() ||
50134          (EltSizeInBits >= 32 && MayFoldLoad(Op0.getOperand(0)))) &&
50135         Op0.getOperand(0).getValueType() == VT.getScalarType())
50136       return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
50137 
50138     // concat_vectors(extract_subvector(broadcast(x)),
50139     //                extract_subvector(broadcast(x))) -> broadcast(x)
50140     if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
50141         Op0.getOperand(0).getValueType() == VT) {
50142       if (Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST ||
50143           Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST_LOAD)
50144         return Op0.getOperand(0);
50145     }
50146   }
50147 
50148   // concat(extract_subvector(v0,c0), extract_subvector(v1,c1)) -> vperm2x128.
50149   // Only concat of subvector high halves, which vperm2x128 is best at.
50150   // TODO: This should go in combineX86ShufflesRecursively eventually.
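  // e.g. v8f32 concat(extract_subvector(x, 4), extract_subvector(y, 4))
  //        -> vperm2x128(x, y, 0x31), taking the high 128-bit half of each.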
50151   if (VT.is256BitVector() && Ops.size() == 2) {
50152     SDValue Src0 = peekThroughBitcasts(Ops[0]);
50153     SDValue Src1 = peekThroughBitcasts(Ops[1]);
50154     if (Src0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
50155         Src1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
50156       EVT SrcVT0 = Src0.getOperand(0).getValueType();
50157       EVT SrcVT1 = Src1.getOperand(0).getValueType();
50158       unsigned NumSrcElts0 = SrcVT0.getVectorNumElements();
50159       unsigned NumSrcElts1 = SrcVT1.getVectorNumElements();
50160       if (SrcVT0.is256BitVector() && SrcVT1.is256BitVector() &&
50161           Src0.getConstantOperandAPInt(1) == (NumSrcElts0 / 2) &&
50162           Src1.getConstantOperandAPInt(1) == (NumSrcElts1 / 2)) {
50163         return DAG.getNode(X86ISD::VPERM2X128, DL, VT,
50164                            DAG.getBitcast(VT, Src0.getOperand(0)),
50165                            DAG.getBitcast(VT, Src1.getOperand(0)),
50166                            DAG.getTargetConstant(0x31, DL, MVT::i8));
50167       }
50168     }
50169   }
50170 
50171   // Repeated opcode.
50172   // TODO - combineX86ShufflesRecursively should handle shuffle concatenation
50173   // but it currently struggles with different vector widths.
50174   if (llvm::all_of(Ops, [Op0](SDValue Op) {
50175         return Op.getOpcode() == Op0.getOpcode();
50176       })) {
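    // Concatenate operand index I from every subvector op into a single wide
    // vector of type VT.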
50177     auto ConcatSubOperand = [&](MVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
50178       SmallVector<SDValue> Subs;
50179       for (SDValue SubOp : SubOps)
50180         Subs.push_back(SubOp.getOperand(I));
50181       return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
50182     };
50183 
50184     unsigned NumOps = Ops.size();
50185     switch (Op0.getOpcode()) {
50186     case X86ISD::SHUFP: {
50187       // Add SHUFPD support if/when necessary.
50188       if (!IsSplat && VT.getScalarType() == MVT::f32 &&
50189           llvm::all_of(Ops, [Op0](SDValue Op) {
50190             return Op.getOperand(2) == Op0.getOperand(2);
50191           })) {
50192         return DAG.getNode(Op0.getOpcode(), DL, VT,
50193                            ConcatSubOperand(VT, Ops, 0),
50194                            ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
50195       }
50196       break;
50197     }
50198     case X86ISD::PSHUFHW:
50199     case X86ISD::PSHUFLW:
50200     case X86ISD::PSHUFD:
50201       if (!IsSplat && NumOps == 2 && VT.is256BitVector() &&
50202           Subtarget.hasInt256() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
50203         return DAG.getNode(Op0.getOpcode(), DL, VT,
50204                            ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
50205       }
50206       LLVM_FALLTHROUGH;
50207     case X86ISD::VPERMILPI:
50208       // TODO - add support for vXf64/vXi64 shuffles.
50209       if (!IsSplat && NumOps == 2 && (VT == MVT::v8f32 || VT == MVT::v8i32) &&
50210           Subtarget.hasAVX() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
50211         SDValue Res = DAG.getBitcast(MVT::v8f32, ConcatSubOperand(VT, Ops, 0));
50212         Res = DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, Res,
50213                           Op0.getOperand(1));
50214         return DAG.getBitcast(VT, Res);
50215       }
50216       break;
50217     case X86ISD::VPERMV3:
50218       if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
50219         MVT OpVT = Op0.getSimpleValueType();
50220         int NumSrcElts = OpVT.getVectorNumElements();
50221         SmallVector<int, 64> ConcatMask;
50222         for (unsigned i = 0; i != NumOps; ++i) {
50223           SmallVector<int, 64> SubMask;
50224           SmallVector<SDValue, 2> SubOps;
50225           if (!getTargetShuffleMask(Ops[i].getNode(), OpVT, false, SubOps,
50226                                     SubMask))
50227             break;
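          // Remap the subvector mask into the concatenated domain: indices
          // referring to the op's second source are bumped past the widened
          // first source, and all indices are offset by this op's position.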
50228           for (int M : SubMask) {
50229             if (0 <= M) {
50230               M += M < NumSrcElts ? 0 : NumSrcElts;
50231               M += i * NumSrcElts;
50232             }
50233             ConcatMask.push_back(M);
50234           }
50235         }
50236         if (ConcatMask.size() == (NumOps * NumSrcElts)) {
50237           SDValue Src0 = concatSubVectors(Ops[0].getOperand(0),
50238                                           Ops[1].getOperand(0), DAG, DL);
50239           SDValue Src1 = concatSubVectors(Ops[0].getOperand(2),
50240                                           Ops[1].getOperand(2), DAG, DL);
50241           MVT IntMaskSVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
50242           MVT IntMaskVT = MVT::getVectorVT(IntMaskSVT, NumOps * NumSrcElts);
50243           SDValue Mask = getConstVector(ConcatMask, IntMaskVT, DAG, DL, true);
50244           return DAG.getNode(X86ISD::VPERMV3, DL, VT, Src0, Mask, Src1);
50245         }
50246       }
50247       break;
50248     case X86ISD::VSHLI:
50249     case X86ISD::VSRLI:
50250       // Special case: SHL/SRL AVX1 V4i64 by 32-bits can lower as a shuffle.
50251       // TODO: Move this to LowerScalarImmediateShift?
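      // e.g. (v4i64 shl X, 32) becomes a v8i32 shuffle that moves each low
      // dword into the adjacent high dword and zeroes the freed low dword.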
50252       if (VT == MVT::v4i64 && !Subtarget.hasInt256() &&
50253           llvm::all_of(Ops, [](SDValue Op) {
50254             return Op.getConstantOperandAPInt(1) == 32;
50255           })) {
50256         SDValue Res = DAG.getBitcast(MVT::v8i32, ConcatSubOperand(VT, Ops, 0));
50257         SDValue Zero = getZeroVector(MVT::v8i32, Subtarget, DAG, DL);
50258         if (Op0.getOpcode() == X86ISD::VSHLI) {
50259           Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
50260                                      {8, 0, 8, 2, 8, 4, 8, 6});
50261         } else {
50262           Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
50263                                      {1, 8, 3, 8, 5, 8, 7, 8});
50264         }
50265         return DAG.getBitcast(VT, Res);
50266       }
50267       LLVM_FALLTHROUGH;
50268     case X86ISD::VSRAI:
50269       if (((VT.is256BitVector() && Subtarget.hasInt256()) ||
50270            (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
50271             (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
50272           llvm::all_of(Ops, [Op0](SDValue Op) {
50273             return Op0.getOperand(1) == Op.getOperand(1);
50274           })) {
50275         return DAG.getNode(Op0.getOpcode(), DL, VT,
50276                            ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
50277       }
50278       break;
50279     case X86ISD::VPERMI:
50280     case X86ISD::VROTLI:
50281     case X86ISD::VROTRI:
50282       if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
50283           llvm::all_of(Ops, [Op0](SDValue Op) {
50284             return Op0.getOperand(1) == Op.getOperand(1);
50285           })) {
50286         return DAG.getNode(Op0.getOpcode(), DL, VT,
50287                            ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
50288       }
50289       break;
50290     case ISD::AND:
50291     case ISD::OR:
50292     case ISD::XOR:
50293     case X86ISD::ANDNP:
50294       // TODO: Add 256-bit support.
50295       if (!IsSplat && VT.is512BitVector()) {
50296         MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
50297         SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
50298                                  NumOps * SrcVT.getVectorNumElements());
50299         return DAG.getNode(Op0.getOpcode(), DL, VT,
50300                            ConcatSubOperand(SrcVT, Ops, 0),
50301                            ConcatSubOperand(SrcVT, Ops, 1));
50302       }
50303       break;
50304     case X86ISD::HADD:
50305     case X86ISD::HSUB:
50306     case X86ISD::FHADD:
50307     case X86ISD::FHSUB:
50308     case X86ISD::PACKSS:
50309     case X86ISD::PACKUS:
50310       if (!IsSplat && VT.is256BitVector() &&
50311           (VT.isFloatingPoint() || Subtarget.hasInt256())) {
50312         MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
50313         SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
50314                                  NumOps * SrcVT.getVectorNumElements());
50315         return DAG.getNode(Op0.getOpcode(), DL, VT,
50316                            ConcatSubOperand(SrcVT, Ops, 0),
50317                            ConcatSubOperand(SrcVT, Ops, 1));
50318       }
50319       break;
50320     case X86ISD::PALIGNR:
50321       if (!IsSplat &&
50322           ((VT.is256BitVector() && Subtarget.hasInt256()) ||
50323            (VT.is512BitVector() && Subtarget.useBWIRegs())) &&
50324           llvm::all_of(Ops, [Op0](SDValue Op) {
50325             return Op0.getOperand(2) == Op.getOperand(2);
50326           })) {
50327         return DAG.getNode(Op0.getOpcode(), DL, VT,
50328                            ConcatSubOperand(VT, Ops, 0),
50329                            ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
50330       }
50331       break;
50332     }
50333   }
50334 
50335   // Fold subvector loads into one.
50336   // If needed, look through bitcasts to get to the load.
50337   if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) {
50338     bool Fast;
50339     const X86TargetLowering *TLI = Subtarget.getTargetLowering();
50340     if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
50341                                 *FirstLd->getMemOperand(), &Fast) &&
50342         Fast) {
50343       if (SDValue Ld =
50344               EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
50345         return Ld;
50346     }
50347   }
50348 
50349   return SDValue();
50350 }
50351 
50352 static SDValue combineConcatVectors(SDNode *N, SelectionDAG &DAG,
50353                                     TargetLowering::DAGCombinerInfo &DCI,
50354                                     const X86Subtarget &Subtarget) {
50355   EVT VT = N->getValueType(0);
50356   EVT SrcVT = N->getOperand(0).getValueType();
50357   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50358 
50359   // Don't do anything for i1 vectors.
50360   if (VT.getVectorElementType() == MVT::i1)
50361     return SDValue();
50362 
50363   if (Subtarget.hasAVX() && TLI.isTypeLegal(VT) && TLI.isTypeLegal(SrcVT)) {
50364     SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());
50365     if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops, DAG,
50366                                            DCI, Subtarget))
50367       return R;
50368   }
50369 
50370   return SDValue();
50371 }
50372 
50373 static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
50374                                       TargetLowering::DAGCombinerInfo &DCI,
50375                                       const X86Subtarget &Subtarget) {
50376   if (DCI.isBeforeLegalizeOps())
50377     return SDValue();
50378 
50379   MVT OpVT = N->getSimpleValueType(0);
50380 
50381   bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;
50382 
50383   SDLoc dl(N);
50384   SDValue Vec = N->getOperand(0);
50385   SDValue SubVec = N->getOperand(1);
50386 
50387   uint64_t IdxVal = N->getConstantOperandVal(2);
50388   MVT SubVecVT = SubVec.getSimpleValueType();
50389 
50390   if (Vec.isUndef() && SubVec.isUndef())
50391     return DAG.getUNDEF(OpVT);
50392 
50393   // Inserting undefs/zeros into zeros/undefs is a zero vector.
50394   if ((Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())) &&
50395       (SubVec.isUndef() || ISD::isBuildVectorAllZeros(SubVec.getNode())))
50396     return getZeroVector(OpVT, Subtarget, DAG, dl);
50397 
50398   if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
50399     // If we're inserting into a zero vector and then into a larger zero vector,
50400     // just insert into the larger zero vector directly.
50401     if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
50402         ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
50403       uint64_t Idx2Val = SubVec.getConstantOperandVal(2);
50404       return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
50405                          getZeroVector(OpVT, Subtarget, DAG, dl),
50406                          SubVec.getOperand(1),
50407                          DAG.getIntPtrConstant(IdxVal + Idx2Val, dl));
50408     }
50409 
50410     // If we're inserting into a zero vector and our input was extracted from
50411     // an insert into a zero vector of the same type, and the extraction was
50412     // at least as large as the original insertion, just insert the original
50413     // subvector into a zero vector.
50414     if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
50415         isNullConstant(SubVec.getOperand(1)) &&
50416         SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) {
50417       SDValue Ins = SubVec.getOperand(0);
50418       if (isNullConstant(Ins.getOperand(2)) &&
50419           ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
50420           Ins.getOperand(1).getValueSizeInBits().getFixedSize() <=
50421               SubVecVT.getFixedSizeInBits())
50422         return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
50423                            getZeroVector(OpVT, Subtarget, DAG, dl),
50424                            Ins.getOperand(1), N->getOperand(2));
50425     }
50426   }
50427 
50428   // Stop here if this is an i1 vector.
50429   if (IsI1Vector)
50430     return SDValue();
50431 
50432   // If this is an insert of an extract, combine to a shuffle. Don't do this
50433   // if the insert or extract can be represented with a subregister operation.
50434   if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
50435       SubVec.getOperand(0).getSimpleValueType() == OpVT &&
50436       (IdxVal != 0 ||
50437        !(Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())))) {
50438     int ExtIdxVal = SubVec.getConstantOperandVal(1);
50439     if (ExtIdxVal != 0) {
50440       int VecNumElts = OpVT.getVectorNumElements();
50441       int SubVecNumElts = SubVecVT.getVectorNumElements();
50442       SmallVector<int, 64> Mask(VecNumElts);
50443       // First create an identity shuffle mask.
50444       for (int i = 0; i != VecNumElts; ++i)
50445         Mask[i] = i;
50446       // Now insert the extracted portion.
50447       for (int i = 0; i != SubVecNumElts; ++i)
50448         Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
50449 
50450       return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
50451     }
50452   }
50453 
50454   // Match concat_vector style patterns.
50455   SmallVector<SDValue, 2> SubVectorOps;
50456   if (collectConcatOps(N, SubVectorOps)) {
50457     if (SDValue Fold =
50458             combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, DCI, Subtarget))
50459       return Fold;
50460 
50461     // If we're inserting all zeros into the upper half, change this to
50462     // a concat with zero. We will match this to a move
50463     // with implicit upper bit zeroing during isel.
50464     // We do this here because we don't want combineConcatVectorOps to
50465     // create INSERT_SUBVECTOR from CONCAT_VECTORS.
50466     if (SubVectorOps.size() == 2 &&
50467         ISD::isBuildVectorAllZeros(SubVectorOps[1].getNode()))
50468       return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
50469                          getZeroVector(OpVT, Subtarget, DAG, dl),
50470                          SubVectorOps[0], DAG.getIntPtrConstant(0, dl));
50471   }
50472 
50473   // If this is a broadcast insert into an upper undef, use a larger broadcast.
50474   if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST)
50475     return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0));
50476 
50477   // If this is a broadcast load inserted into an upper undef, use a larger
50478   // broadcast load.
50479   if (Vec.isUndef() && IdxVal != 0 && SubVec.hasOneUse() &&
50480       SubVec.getOpcode() == X86ISD::VBROADCAST_LOAD) {
50481     auto *MemIntr = cast<MemIntrinsicSDNode>(SubVec);
50482     SDVTList Tys = DAG.getVTList(OpVT, MVT::Other);
50483     SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() };
50484     SDValue BcastLd =
50485         DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
50486                                 MemIntr->getMemoryVT(),
50487                                 MemIntr->getMemOperand());
50488     DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
50489     return BcastLd;
50490   }
50491 
50492   // If we're splatting the lower half subvector of a full vector load into the
50493   // upper half, attempt to create a subvector broadcast.
50494   if (IdxVal == (OpVT.getVectorNumElements() / 2) && SubVec.hasOneUse() &&
50495       Vec.getValueSizeInBits() == (2 * SubVec.getValueSizeInBits())) {
50496     auto *VecLd = dyn_cast<LoadSDNode>(Vec);
50497     auto *SubLd = dyn_cast<LoadSDNode>(SubVec);
50498     if (VecLd && SubLd &&
50499         DAG.areNonVolatileConsecutiveLoads(SubLd, VecLd,
50500                                            SubVec.getValueSizeInBits() / 8, 0))
50501       return getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, dl, OpVT, SubVecVT,
50502                                SubLd, 0, DAG);
50503   }
50504 
50505   return SDValue();
50506 }
50507 
50508 /// If we are extracting a subvector of a vector select and the select condition
50509 /// is composed of concatenated vectors, try to narrow the select width. This
50510 /// is a common pattern for AVX1 integer code because 256-bit selects may be
50511 /// legal, but there is almost no integer math/logic available for 256-bit.
50512 /// This function should only be called with legal types (otherwise, the calls
50513 /// to get simple value types will assert).
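/// In essence, extract_subvector(vselect(C, T, F), Idx) is rewritten as a
/// 128-bit vselect(extract(C), extract(T), extract(F)), with the extraction
/// index rescaled for any bitcasts in between.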
50514 static SDValue narrowExtractedVectorSelect(SDNode *Ext, SelectionDAG &DAG) {
50515   SDValue Sel = peekThroughBitcasts(Ext->getOperand(0));
50516   SmallVector<SDValue, 4> CatOps;
50517   if (Sel.getOpcode() != ISD::VSELECT ||
50518       !collectConcatOps(Sel.getOperand(0).getNode(), CatOps))
50519     return SDValue();
50520 
50521   // Note: We assume simple value types because this should only be called with
50522   //       legal operations/types.
50523   // TODO: This can be extended to handle extraction to 256-bits.
50524   MVT VT = Ext->getSimpleValueType(0);
50525   if (!VT.is128BitVector())
50526     return SDValue();
50527 
50528   MVT SelCondVT = Sel.getOperand(0).getSimpleValueType();
50529   if (!SelCondVT.is256BitVector() && !SelCondVT.is512BitVector())
50530     return SDValue();
50531 
50532   MVT WideVT = Ext->getOperand(0).getSimpleValueType();
50533   MVT SelVT = Sel.getSimpleValueType();
50534   assert((SelVT.is256BitVector() || SelVT.is512BitVector()) &&
50535          "Unexpected vector type with legal operations");
50536 
50537   unsigned SelElts = SelVT.getVectorNumElements();
50538   unsigned CastedElts = WideVT.getVectorNumElements();
50539   unsigned ExtIdx = Ext->getConstantOperandVal(1);
50540   if (SelElts % CastedElts == 0) {
50541     // The select has the same or more (narrower) elements than the extract
50542     // operand. The extraction index gets scaled by that factor.
50543     ExtIdx *= (SelElts / CastedElts);
50544   } else if (CastedElts % SelElts == 0) {
50545     // The select has fewer (wider) elements than the extract operand. Make sure
50546     // that the extraction index can be divided evenly.
50547     unsigned IndexDivisor = CastedElts / SelElts;
50548     if (ExtIdx % IndexDivisor != 0)
50549       return SDValue();
50550     ExtIdx /= IndexDivisor;
50551   } else {
50552     llvm_unreachable("Element counts of simple vector types are not divisible?");
50553   }
50554 
50555   unsigned NarrowingFactor = WideVT.getSizeInBits() / VT.getSizeInBits();
50556   unsigned NarrowElts = SelElts / NarrowingFactor;
50557   MVT NarrowSelVT = MVT::getVectorVT(SelVT.getVectorElementType(), NarrowElts);
50558   SDLoc DL(Ext);
50559   SDValue ExtCond = extract128BitVector(Sel.getOperand(0), ExtIdx, DAG, DL);
50560   SDValue ExtT = extract128BitVector(Sel.getOperand(1), ExtIdx, DAG, DL);
50561   SDValue ExtF = extract128BitVector(Sel.getOperand(2), ExtIdx, DAG, DL);
50562   SDValue NarrowSel = DAG.getSelect(DL, NarrowSelVT, ExtCond, ExtT, ExtF);
50563   return DAG.getBitcast(VT, NarrowSel);
50564 }
50565 
50566 static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
50567                                        TargetLowering::DAGCombinerInfo &DCI,
50568                                        const X86Subtarget &Subtarget) {
50569   // For AVX1 only, if we are extracting from a 256-bit and+not (which will
50570   // eventually get combined/lowered into ANDNP) with a concatenated operand,
50571   // split the 'and' into 128-bit ops to avoid the concatenate and extract.
50572   // We let generic combining take over from there to simplify the
50573   // insert/extract and 'not'.
50574   // This pattern emerges during AVX1 legalization. We handle it before lowering
50575   // to avoid complications like splitting constant vector loads.
50576 
50577   // Capture the original wide type in the likely case that we need to bitcast
50578   // back to this type.
50579   if (!N->getValueType(0).isSimple())
50580     return SDValue();
50581 
50582   MVT VT = N->getSimpleValueType(0);
50583   SDValue InVec = N->getOperand(0);
50584   unsigned IdxVal = N->getConstantOperandVal(1);
50585   SDValue InVecBC = peekThroughBitcasts(InVec);
50586   EVT InVecVT = InVec.getValueType();
50587   unsigned SizeInBits = VT.getSizeInBits();
50588   unsigned InSizeInBits = InVecVT.getSizeInBits();
50589   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50590 
50591   if (Subtarget.hasAVX() && !Subtarget.hasAVX2() &&
50592       TLI.isTypeLegal(InVecVT) &&
50593       InSizeInBits == 256 && InVecBC.getOpcode() == ISD::AND) {
50594     auto isConcatenatedNot = [](SDValue V) {
50595       V = peekThroughBitcasts(V);
50596       if (!isBitwiseNot(V))
50597         return false;
50598       SDValue NotOp = V->getOperand(0);
50599       return peekThroughBitcasts(NotOp).getOpcode() == ISD::CONCAT_VECTORS;
50600     };
50601     if (isConcatenatedNot(InVecBC.getOperand(0)) ||
50602         isConcatenatedNot(InVecBC.getOperand(1))) {
50603       // extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
50604       SDValue Concat = splitVectorIntBinary(InVecBC, DAG);
50605       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT,
50606                          DAG.getBitcast(InVecVT, Concat), N->getOperand(1));
50607     }
50608   }
50609 
50610   if (DCI.isBeforeLegalizeOps())
50611     return SDValue();
50612 
50613   if (SDValue V = narrowExtractedVectorSelect(N, DAG))
50614     return V;
50615 
50616   if (ISD::isBuildVectorAllZeros(InVec.getNode()))
50617     return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
50618 
50619   if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
50620     if (VT.getScalarType() == MVT::i1)
50621       return DAG.getConstant(1, SDLoc(N), VT);
50622     return getOnesVector(VT, DAG, SDLoc(N));
50623   }
50624 
50625   if (InVec.getOpcode() == ISD::BUILD_VECTOR)
50626     return DAG.getBuildVector(
50627         VT, SDLoc(N),
50628         InVec.getNode()->ops().slice(IdxVal, VT.getVectorNumElements()));
50629 
50630   // If we are extracting from an insert into a zero vector, replace with a
50631   // smaller insert into zero if we don't access less than the original
50632   // subvector. Don't do this for i1 vectors.
50633   if (VT.getVectorElementType() != MVT::i1 &&
50634       InVec.getOpcode() == ISD::INSERT_SUBVECTOR && IdxVal == 0 &&
50635       InVec.hasOneUse() && isNullConstant(InVec.getOperand(2)) &&
50636       ISD::isBuildVectorAllZeros(InVec.getOperand(0).getNode()) &&
50637       InVec.getOperand(1).getValueSizeInBits() <= SizeInBits) {
50638     SDLoc DL(N);
50639     return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
50640                        getZeroVector(VT, Subtarget, DAG, DL),
50641                        InVec.getOperand(1), InVec.getOperand(2));
50642   }
50643 
50644   // If we're extracting an upper subvector from a broadcast we should just
50645   // extract the lowest subvector instead, which should allow
50646   // SimplifyDemandedVectorElts to do more simplifications.
50647   if (IdxVal != 0 && (InVec.getOpcode() == X86ISD::VBROADCAST ||
50648                       InVec.getOpcode() == X86ISD::VBROADCAST_LOAD ||
50649                       DAG.isSplatValue(InVec, /*AllowUndefs*/ false)))
50650     return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits);
50651 
50652   // If we're extracting a broadcasted subvector, just use the lowest subvector.
50653   if (IdxVal != 0 && InVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
50654       cast<MemIntrinsicSDNode>(InVec)->getMemoryVT() == VT)
50655     return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits);
50656 
50657   // Attempt to extract from the source of a shuffle vector.
50658   if ((InSizeInBits % SizeInBits) == 0 &&
50659       (IdxVal % VT.getVectorNumElements()) == 0) {
50660     SmallVector<int, 32> ShuffleMask;
50661     SmallVector<int, 32> ScaledMask;
50662     SmallVector<SDValue, 2> ShuffleInputs;
50663     unsigned NumSubVecs = InSizeInBits / SizeInBits;
50664     // Decode the shuffle mask and scale it so that it shuffles whole subvectors.
50665     if (getTargetShuffleInputs(InVecBC, ShuffleInputs, ShuffleMask, DAG) &&
50666         scaleShuffleElements(ShuffleMask, NumSubVecs, ScaledMask)) {
50667       unsigned SubVecIdx = IdxVal / VT.getVectorNumElements();
50668       if (ScaledMask[SubVecIdx] == SM_SentinelUndef)
50669         return DAG.getUNDEF(VT);
50670       if (ScaledMask[SubVecIdx] == SM_SentinelZero)
50671         return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
50672       SDValue Src = ShuffleInputs[ScaledMask[SubVecIdx] / NumSubVecs];
50673       if (Src.getValueSizeInBits() == InSizeInBits) {
50674         unsigned SrcSubVecIdx = ScaledMask[SubVecIdx] % NumSubVecs;
50675         unsigned SrcEltIdx = SrcSubVecIdx * VT.getVectorNumElements();
50676         return extractSubVector(DAG.getBitcast(InVecVT, Src), SrcEltIdx, DAG,
50677                                 SDLoc(N), SizeInBits);
50678       }
50679     }
50680   }
50681 
50682   // If we're extracting the lowest subvector and we're the only user,
50683   // we may be able to perform this with a smaller vector width.
50684   unsigned InOpcode = InVec.getOpcode();
50685   if (IdxVal == 0 && InVec.hasOneUse()) {
50686     if (VT == MVT::v2f64 && InVecVT == MVT::v4f64) {
50687       // v2f64 CVTDQ2PD(v4i32).
50688       if (InOpcode == ISD::SINT_TO_FP &&
50689           InVec.getOperand(0).getValueType() == MVT::v4i32) {
50690         return DAG.getNode(X86ISD::CVTSI2P, SDLoc(N), VT, InVec.getOperand(0));
50691       }
50692       // v2f64 CVTUDQ2PD(v4i32).
50693       if (InOpcode == ISD::UINT_TO_FP && Subtarget.hasVLX() &&
50694           InVec.getOperand(0).getValueType() == MVT::v4i32) {
50695         return DAG.getNode(X86ISD::CVTUI2P, SDLoc(N), VT, InVec.getOperand(0));
50696       }
50697       // v2f64 CVTPS2PD(v4f32).
50698       if (InOpcode == ISD::FP_EXTEND &&
50699           InVec.getOperand(0).getValueType() == MVT::v4f32) {
50700         return DAG.getNode(X86ISD::VFPEXT, SDLoc(N), VT, InVec.getOperand(0));
50701       }
50702     }
50703     if ((InOpcode == ISD::ANY_EXTEND ||
50704          InOpcode == ISD::ANY_EXTEND_VECTOR_INREG ||
50705          InOpcode == ISD::ZERO_EXTEND ||
50706          InOpcode == ISD::ZERO_EXTEND_VECTOR_INREG ||
50707          InOpcode == ISD::SIGN_EXTEND ||
50708          InOpcode == ISD::SIGN_EXTEND_VECTOR_INREG) &&
50709         (SizeInBits == 128 || SizeInBits == 256) &&
50710         InVec.getOperand(0).getValueSizeInBits() >= SizeInBits) {
50711       SDLoc DL(N);
50712       SDValue Ext = InVec.getOperand(0);
50713       if (Ext.getValueSizeInBits() > SizeInBits)
50714         Ext = extractSubVector(Ext, 0, DAG, DL, SizeInBits);
50715       unsigned ExtOp = getOpcode_EXTEND_VECTOR_INREG(InOpcode);
50716       return DAG.getNode(ExtOp, DL, VT, Ext);
50717     }
50718     if (InOpcode == ISD::VSELECT &&
50719         InVec.getOperand(0).getValueType().is256BitVector() &&
50720         InVec.getOperand(1).getValueType().is256BitVector() &&
50721         InVec.getOperand(2).getValueType().is256BitVector()) {
50722       SDLoc DL(N);
50723       SDValue Ext0 = extractSubVector(InVec.getOperand(0), 0, DAG, DL, 128);
50724       SDValue Ext1 = extractSubVector(InVec.getOperand(1), 0, DAG, DL, 128);
50725       SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128);
50726       return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2);
50727     }
50728     if (InOpcode == ISD::TRUNCATE && Subtarget.hasVLX() &&
50729         (VT.is128BitVector() || VT.is256BitVector())) {
50730       SDLoc DL(N);
50731       SDValue InVecSrc = InVec.getOperand(0);
50732       unsigned Scale = InVecSrc.getValueSizeInBits() / InSizeInBits;
50733       SDValue Ext = extractSubVector(InVecSrc, 0, DAG, DL, Scale * SizeInBits);
50734       return DAG.getNode(InOpcode, DL, VT, Ext);
50735     }
50736   }
50737 
50738   // Always split vXi64 logical shifts where we're extracting the upper 32 bits,
50739   // as this is very likely to fold into a shuffle/truncation.
50740   if ((InOpcode == X86ISD::VSHLI || InOpcode == X86ISD::VSRLI) &&
50741       InVecVT.getScalarSizeInBits() == 64 &&
50742       InVec.getConstantOperandAPInt(1) == 32) {
50743     SDLoc DL(N);
50744     SDValue Ext =
50745         extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
50746     return DAG.getNode(InOpcode, DL, VT, Ext, InVec.getOperand(1));
50747   }
50748 
50749   return SDValue();
50750 }
50751 
50752 static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
50753   EVT VT = N->getValueType(0);
50754   SDValue Src = N->getOperand(0);
50755   SDLoc DL(N);
50756 
50757   // If this is a scalar to vector to v1i1 from an AND with 1, bypass the and.
50758   // This occurs frequently in our masked scalar intrinsic code and our
50759   // floating point select lowering with AVX512.
50760   // TODO: SimplifyDemandedBits instead?
50761   if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse())
50762     if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
50763       if (C->getAPIntValue().isOneValue())
50764         return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1,
50765                            Src.getOperand(0));
50766 
50767   // Combine scalar_to_vector of an extract_vector_elt into an extract_subvec.
50768   if (VT == MVT::v1i1 && Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
50769       Src.hasOneUse() && Src.getOperand(0).getValueType().isVector() &&
50770       Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1)
50771     if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
50772       if (C->isNullValue())
50773         return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src.getOperand(0),
50774                            Src.getOperand(1));
50775 
50776   // Reduce v2i64 to v4i32 if we don't need the upper bits.
50777   // TODO: Move to DAGCombine/SimplifyDemandedBits?
50778   if (VT == MVT::v2i64 || VT == MVT::v2f64) {
50779     auto IsAnyExt64 = [](SDValue Op) {
50780       if (Op.getValueType() != MVT::i64 || !Op.hasOneUse())
50781         return SDValue();
50782       if (Op.getOpcode() == ISD::ANY_EXTEND &&
50783           Op.getOperand(0).getScalarValueSizeInBits() <= 32)
50784         return Op.getOperand(0);
50785       if (auto *Ld = dyn_cast<LoadSDNode>(Op))
50786         if (Ld->getExtensionType() == ISD::EXTLOAD &&
50787             Ld->getMemoryVT().getScalarSizeInBits() <= 32)
50788           return Op;
50789       return SDValue();
50790     };
50791     if (SDValue ExtSrc = IsAnyExt64(peekThroughOneUseBitcasts(Src)))
50792       return DAG.getBitcast(
50793           VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
50794                           DAG.getAnyExtOrTrunc(ExtSrc, DL, MVT::i32)));
50795   }
50796 
50797   // Combine (v2i64 (scalar_to_vector (i64 (bitconvert (mmx))))) to MOVQ2DQ.
50798   if (VT == MVT::v2i64 && Src.getOpcode() == ISD::BITCAST &&
50799       Src.getOperand(0).getValueType() == MVT::x86mmx)
50800     return DAG.getNode(X86ISD::MOVQ2DQ, DL, VT, Src.getOperand(0));
50801 
50802   // See if we're broadcasting the scalar value, in which case just reuse that.
50803   // Ensure the broadcast is using this exact SDValue, not just the same node.
50804   if (VT.getScalarType() == Src.getValueType())
50805     for (SDNode *User : Src->uses())
50806       if (User->getOpcode() == X86ISD::VBROADCAST &&
50807           Src == User->getOperand(0)) {
50808         unsigned SizeInBits = VT.getFixedSizeInBits();
50809         unsigned BroadcastSizeInBits =
50810             User->getValueSizeInBits(0).getFixedSize();
50811         if (BroadcastSizeInBits == SizeInBits)
50812           return SDValue(User, 0);
50813         if (BroadcastSizeInBits > SizeInBits)
50814           return extractSubVector(SDValue(User, 0), 0, DAG, DL, SizeInBits);
50815         // TODO: Handle BroadcastSizeInBits < SizeInBits when we have test
50816         // coverage.
50817       }
50818 
50819   return SDValue();
50820 }
50821 
50822 // Simplify PMULDQ and PMULUDQ operations.
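// Both opcodes multiply the low 32 bits of each 64-bit vector element and
// produce a full 64-bit product (signed for PMULDQ, unsigned for PMULUDQ).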
50823 static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
50824                              TargetLowering::DAGCombinerInfo &DCI,
50825                              const X86Subtarget &Subtarget) {
50826   SDValue LHS = N->getOperand(0);
50827   SDValue RHS = N->getOperand(1);
50828 
50829   // Canonicalize constant to RHS.
50830   if (DAG.isConstantIntBuildVectorOrConstantInt(LHS) &&
50831       !DAG.isConstantIntBuildVectorOrConstantInt(RHS))
50832     return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS);
50833 
50834   // Multiply by zero.
50835   // Don't return RHS as it may contain UNDEFs.
50836   if (ISD::isBuildVectorAllZeros(RHS.getNode()))
50837     return DAG.getConstant(0, SDLoc(N), N->getValueType(0));
50838 
50839   // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
50840   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50841   if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnesValue(64), DCI))
50842     return SDValue(N, 0);
50843 
50844   // If the input is an extend_invec and the SimplifyDemandedBits call didn't
50845   // convert it to any_extend_invec, due to the LegalOperations check, do the
50846   // conversion directly to a vector shuffle manually. This exposes combine
50847   // opportunities missed by combineEXTEND_VECTOR_INREG not calling
50848   // combineX86ShufflesRecursively on SSE4.1 targets.
50849   // FIXME: This is basically a hack around several other issues related to
50850   // ANY_EXTEND_VECTOR_INREG.
50851   if (N->getValueType(0) == MVT::v2i64 && LHS.hasOneUse() &&
50852       (LHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
50853        LHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
50854       LHS.getOperand(0).getValueType() == MVT::v4i32) {
50855     SDLoc dl(N);
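    // The {0,-1,1,-1} shuffle places the two source dwords in the low halves
    // of the v2i64 lanes; the undef odd dwords are ignored by PMULDQ/PMULUDQ.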
50856     LHS = DAG.getVectorShuffle(MVT::v4i32, dl, LHS.getOperand(0),
50857                                LHS.getOperand(0), { 0, -1, 1, -1 });
50858     LHS = DAG.getBitcast(MVT::v2i64, LHS);
50859     return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
50860   }
50861   if (N->getValueType(0) == MVT::v2i64 && RHS.hasOneUse() &&
50862       (RHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
50863        RHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
50864       RHS.getOperand(0).getValueType() == MVT::v4i32) {
50865     SDLoc dl(N);
50866     RHS = DAG.getVectorShuffle(MVT::v4i32, dl, RHS.getOperand(0),
50867                                RHS.getOperand(0), { 0, -1, 1, -1 });
50868     RHS = DAG.getBitcast(MVT::v2i64, RHS);
50869     return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
50870   }
50871 
50872   return SDValue();
50873 }
50874 
50875 static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG,
50876                                           TargetLowering::DAGCombinerInfo &DCI,
50877                                           const X86Subtarget &Subtarget) {
50878   EVT VT = N->getValueType(0);
50879   SDValue In = N->getOperand(0);
50880   unsigned Opcode = N->getOpcode();
50881   unsigned InOpcode = In.getOpcode();
50882   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50883 
50884   // Try to merge vector loads and extend_inreg to an extload.
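  // e.g. (v4i32 sign_extend_vector_inreg (v16i8 load p)) can become a v4i32
  // sextload of v4i8 from p, when that extload is legal.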
50885   if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) &&
50886       In.hasOneUse()) {
50887     auto *Ld = cast<LoadSDNode>(In);
50888     if (Ld->isSimple()) {
50889       MVT SVT = In.getSimpleValueType().getVectorElementType();
50890       ISD::LoadExtType Ext = Opcode == ISD::SIGN_EXTEND_VECTOR_INREG
50891                                  ? ISD::SEXTLOAD
50892                                  : ISD::ZEXTLOAD;
50893       EVT MemVT = VT.changeVectorElementType(SVT);
50894       if (TLI.isLoadExtLegal(Ext, VT, MemVT)) {
50895         SDValue Load =
50896             DAG.getExtLoad(Ext, SDLoc(N), VT, Ld->getChain(), Ld->getBasePtr(),
50897                            Ld->getPointerInfo(), MemVT, Ld->getOriginalAlign(),
50898                            Ld->getMemOperand()->getFlags());
50899         DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
50900         return Load;
50901       }
50902     }
50903   }
50904 
50905   // Fold EXTEND_VECTOR_INREG(EXTEND_VECTOR_INREG(X)) -> EXTEND_VECTOR_INREG(X).
50906   if (Opcode == InOpcode)
50907     return DAG.getNode(Opcode, SDLoc(N), VT, In.getOperand(0));
50908 
50909   // Fold EXTEND_VECTOR_INREG(EXTRACT_SUBVECTOR(EXTEND(X),0))
50910   // -> EXTEND_VECTOR_INREG(X).
50911   // TODO: Handle non-zero subvector indices.
50912   if (InOpcode == ISD::EXTRACT_SUBVECTOR && In.getConstantOperandVal(1) == 0 &&
50913       In.getOperand(0).getOpcode() == getOpcode_EXTEND(Opcode) &&
50914       In.getOperand(0).getOperand(0).getValueSizeInBits() ==
50915           In.getValueSizeInBits())
50916     return DAG.getNode(Opcode, SDLoc(N), VT, In.getOperand(0).getOperand(0));
50917 
50918   // Attempt to combine as a shuffle.
50919   // TODO: General ZERO_EXTEND_VECTOR_INREG support.
50920   if (Opcode == ISD::ANY_EXTEND_VECTOR_INREG ||
50921       (Opcode == ISD::ZERO_EXTEND_VECTOR_INREG && Subtarget.hasSSE41())) {
50922     SDValue Op(N, 0);
50923     if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType()))
50924       if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50925         return Res;
50926   }
50927 
50928   return SDValue();
50929 }
50930 
50931 static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
50932                              TargetLowering::DAGCombinerInfo &DCI) {
50933   EVT VT = N->getValueType(0);
50934 
50935   if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
50936     return DAG.getConstant(0, SDLoc(N), VT);
50937 
50938   APInt KnownUndef, KnownZero;
50939   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50940   APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
50941   if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
50942                                      KnownZero, DCI))
50943     return SDValue(N, 0);
50944 
50945   return SDValue();
50946 }
50947 
50948 // Optimize (fp16_to_fp (fp_to_fp16 X)) to VCVTPS2PH followed by VCVTPH2PS.
50949 // Done as a combine because the lowerings for fp16_to_fp and fp_to_fp16 produce
50950 // extra instructions between the conversions due to going to scalar and back.
50951 static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG,
50952                                  const X86Subtarget &Subtarget) {
50953   if (Subtarget.useSoftFloat() || !Subtarget.hasF16C())
50954     return SDValue();
50955 
50956   if (N->getOperand(0).getOpcode() != ISD::FP_TO_FP16)
50957     return SDValue();
50958 
50959   if (N->getValueType(0) != MVT::f32 ||
50960       N->getOperand(0).getOperand(0).getValueType() != MVT::f32)
50961     return SDValue();
50962 
50963   SDLoc dl(N);
50964   SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32,
50965                             N->getOperand(0).getOperand(0));
50966   Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
50967                     DAG.getTargetConstant(4, dl, MVT::i32));
50968   Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
50969   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
50970                      DAG.getIntPtrConstant(0, dl));
50971 }
50972 
50973 static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG,
50974                                 const X86Subtarget &Subtarget) {
50975   if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
50976     return SDValue();
50977 
50978   bool IsStrict = N->isStrictFPOpcode();
50979   EVT VT = N->getValueType(0);
50980   SDValue Src = N->getOperand(IsStrict ? 1 : 0);
50981   EVT SrcVT = Src.getValueType();
50982 
50983   if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::f16)
50984     return SDValue();
50985 
50986   if (VT.getVectorElementType() != MVT::f32 &&
50987       VT.getVectorElementType() != MVT::f64)
50988     return SDValue();
50989 
50990   unsigned NumElts = VT.getVectorNumElements();
50991   if (NumElts == 1 || !isPowerOf2_32(NumElts))
50992     return SDValue();
50993 
50994   SDLoc dl(N);
50995 
50996   // Convert the input to vXi16.
50997   EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
50998   Src = DAG.getBitcast(IntVT, Src);
50999 
51000   // Widen to at least 8 input elements.
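  // A v4f16 source can be padded with undef since only the low four i16 lanes
  // feed the conversion; a v2f16 source is padded with zeroes, presumably so
  // the extra converted lanes hold a harmless value.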
51001   if (NumElts < 8) {
51002     unsigned NumConcats = 8 / NumElts;
51003     SDValue Fill = NumElts == 4 ? DAG.getUNDEF(IntVT)
51004                                 : DAG.getConstant(0, dl, IntVT);
51005     SmallVector<SDValue, 4> Ops(NumConcats, Fill);
51006     Ops[0] = Src;
51007     Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, Ops);
51008   }
51009 
51010   // Destination is vXf32 with at least 4 elements.
51011   EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32,
51012                                std::max(4U, NumElts));
51013   SDValue Cvt, Chain;
51014   if (IsStrict) {
51015     Cvt = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {CvtVT, MVT::Other},
51016                       {N->getOperand(0), Src});
51017     Chain = Cvt.getValue(1);
51018   } else {
51019     Cvt = DAG.getNode(X86ISD::CVTPH2PS, dl, CvtVT, Src);
51020   }
51021 
51022   if (NumElts < 4) {
51023     assert(NumElts == 2 && "Unexpected size");
51024     Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Cvt,
51025                       DAG.getIntPtrConstant(0, dl));
51026   }
51027 
51028   if (IsStrict) {
51029     // Extend to the original VT if necessary.
51030     if (Cvt.getValueType() != VT) {
51031       Cvt = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {VT, MVT::Other},
51032                         {Chain, Cvt});
51033       Chain = Cvt.getValue(1);
51034     }
51035     return DAG.getMergeValues({Cvt, Chain}, dl);
51036   }
51037 
51038   // Extend to the original VT if necessary.
51039   return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt);
51040 }
51041 
51042 // Try to find a larger VBROADCAST_LOAD/SUBV_BROADCAST_LOAD that we can extract
51043 // from. Limit this to cases where the loads have the same input chain and the
51044 // output chains are unused. This avoids any memory ordering issues.
51045 static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG,
51046                                      TargetLowering::DAGCombinerInfo &DCI) {
51047   assert((N->getOpcode() == X86ISD::VBROADCAST_LOAD ||
51048           N->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) &&
51049          "Unknown broadcast load type");
51050 
51051   // Only do this if the chain result is unused.
51052   if (N->hasAnyUseOfValue(1))
51053     return SDValue();
51054 
51055   auto *MemIntrin = cast<MemIntrinsicSDNode>(N);
51056 
51057   SDValue Ptr = MemIntrin->getBasePtr();
51058   SDValue Chain = MemIntrin->getChain();
51059   EVT VT = N->getSimpleValueType(0);
51060   EVT MemVT = MemIntrin->getMemoryVT();
51061 
51062   // Look at other users of our base pointer and try to find a wider broadcast.
51063   // The input chain and the size of the memory VT must match.
51064   for (SDNode *User : Ptr->uses())
51065     if (User != N && User->getOpcode() == N->getOpcode() &&
51066         cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
51067         cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
51068         cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
51069             MemVT.getSizeInBits() &&
51070         !User->hasAnyUseOfValue(1) &&
51071         User->getValueSizeInBits(0).getFixedSize() > VT.getFixedSizeInBits()) {
51072       SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
51073                                          VT.getSizeInBits());
51074       Extract = DAG.getBitcast(VT, Extract);
51075       return DCI.CombineTo(N, Extract, SDValue(User, 1));
51076     }
51077 
51078   return SDValue();
51079 }
51080 
51081 static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG,
51082                                const X86Subtarget &Subtarget) {
51083   if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
51084     return SDValue();
51085 
51086   EVT VT = N->getValueType(0);
51087   SDValue Src = N->getOperand(0);
51088   EVT SrcVT = Src.getValueType();
51089 
51090   if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 ||
51091       SrcVT.getVectorElementType() != MVT::f32)
51092     return SDValue();
51093 
51094   unsigned NumElts = VT.getVectorNumElements();
51095   if (NumElts == 1 || !isPowerOf2_32(NumElts))
51096     return SDValue();
51097 
51098   SDLoc dl(N);
51099 
51100   // Widen to at least 4 input elements.
51101   if (NumElts < 4)
51102     Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
51103                       DAG.getConstantFP(0.0, dl, SrcVT));
51104 
51105   // Destination is vXi16 with at least 8 elements.
51106   EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
51107                                std::max(8U, NumElts));
51108   SDValue Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src,
51109                             DAG.getTargetConstant(4, dl, MVT::i32));
51110 
51111   // Extract down to real number of elements.
51112   if (NumElts < 8) {
51113     EVT IntVT = VT.changeVectorElementTypeToInteger();
51114     Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, IntVT, Cvt,
51115                       DAG.getIntPtrConstant(0, dl));
51116   }
51117 
51118   return DAG.getBitcast(VT, Cvt);
51119 }
51120 
51121 static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) {
51122   SDValue Src = N->getOperand(0);
51123 
51124   // Turn MOVDQ2Q+simple_load into an mmx load.
51125   if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
51126     LoadSDNode *LN = cast<LoadSDNode>(Src.getNode());
51127 
51128     if (LN->isSimple()) {
51129       SDValue NewLd = DAG.getLoad(MVT::x86mmx, SDLoc(N), LN->getChain(),
51130                                   LN->getBasePtr(),
51131                                   LN->getPointerInfo(),
51132                                   LN->getOriginalAlign(),
51133                                   LN->getMemOperand()->getFlags());
51134       DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), NewLd.getValue(1));
51135       return NewLd;
51136     }
51137   }
51138 
51139   return SDValue();
51140 }
51141 
51142 static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG,
51143                            TargetLowering::DAGCombinerInfo &DCI) {
51144   unsigned NumBits = N->getSimpleValueType(0).getSizeInBits();
51145   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51146   if (TLI.SimplifyDemandedBits(SDValue(N, 0),
51147                                APInt::getAllOnesValue(NumBits), DCI))
51148     return SDValue(N, 0);
51149 
51150   return SDValue();
51151 }
51152 
51153 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
51154                                              DAGCombinerInfo &DCI) const {
51155   SelectionDAG &DAG = DCI.DAG;
51156   switch (N->getOpcode()) {
51157   default: break;
51158   case ISD::SCALAR_TO_VECTOR:
51159     return combineScalarToVector(N, DAG);
51160   case ISD::EXTRACT_VECTOR_ELT:
51161   case X86ISD::PEXTRW:
51162   case X86ISD::PEXTRB:
51163     return combineExtractVectorElt(N, DAG, DCI, Subtarget);
51164   case ISD::CONCAT_VECTORS:
51165     return combineConcatVectors(N, DAG, DCI, Subtarget);
51166   case ISD::INSERT_SUBVECTOR:
51167     return combineInsertSubvector(N, DAG, DCI, Subtarget);
51168   case ISD::EXTRACT_SUBVECTOR:
51169     return combineExtractSubvector(N, DAG, DCI, Subtarget);
51170   case ISD::VSELECT:
51171   case ISD::SELECT:
51172   case X86ISD::BLENDV:      return combineSelect(N, DAG, DCI, Subtarget);
51173   case ISD::BITCAST:        return combineBitcast(N, DAG, DCI, Subtarget);
51174   case X86ISD::CMOV:        return combineCMov(N, DAG, DCI, Subtarget);
51175   case X86ISD::CMP:         return combineCMP(N, DAG);
51176   case ISD::ADD:            return combineAdd(N, DAG, DCI, Subtarget);
51177   case ISD::SUB:            return combineSub(N, DAG, DCI, Subtarget);
51178   case X86ISD::ADD:
51179   case X86ISD::SUB:         return combineX86AddSub(N, DAG, DCI);
51180   case X86ISD::SBB:         return combineSBB(N, DAG);
51181   case X86ISD::ADC:         return combineADC(N, DAG, DCI);
51182   case ISD::MUL:            return combineMul(N, DAG, DCI, Subtarget);
51183   case ISD::SHL:            return combineShiftLeft(N, DAG);
51184   case ISD::SRA:            return combineShiftRightArithmetic(N, DAG, Subtarget);
51185   case ISD::SRL:            return combineShiftRightLogical(N, DAG, DCI, Subtarget);
51186   case ISD::AND:            return combineAnd(N, DAG, DCI, Subtarget);
51187   case ISD::OR:             return combineOr(N, DAG, DCI, Subtarget);
51188   case ISD::XOR:            return combineXor(N, DAG, DCI, Subtarget);
51189   case X86ISD::BEXTR:
51190   case X86ISD::BEXTRI:      return combineBEXTR(N, DAG, DCI, Subtarget);
51191   case ISD::LOAD:           return combineLoad(N, DAG, DCI, Subtarget);
51192   case ISD::MLOAD:          return combineMaskedLoad(N, DAG, DCI, Subtarget);
51193   case ISD::STORE:          return combineStore(N, DAG, DCI, Subtarget);
51194   case ISD::MSTORE:         return combineMaskedStore(N, DAG, DCI, Subtarget);
51195   case X86ISD::VEXTRACT_STORE:
51196     return combineVEXTRACT_STORE(N, DAG, DCI, Subtarget);
51197   case ISD::SINT_TO_FP:
51198   case ISD::STRICT_SINT_TO_FP:
51199     return combineSIntToFP(N, DAG, DCI, Subtarget);
51200   case ISD::UINT_TO_FP:
51201   case ISD::STRICT_UINT_TO_FP:
51202     return combineUIntToFP(N, DAG, Subtarget);
51203   case ISD::FADD:
51204   case ISD::FSUB:           return combineFaddFsub(N, DAG, Subtarget);
51205   case ISD::FNEG:           return combineFneg(N, DAG, DCI, Subtarget);
51206   case ISD::TRUNCATE:       return combineTruncate(N, DAG, Subtarget);
51207   case X86ISD::VTRUNC:      return combineVTRUNC(N, DAG, DCI);
51208   case X86ISD::ANDNP:       return combineAndnp(N, DAG, DCI, Subtarget);
51209   case X86ISD::FAND:        return combineFAnd(N, DAG, Subtarget);
51210   case X86ISD::FANDN:       return combineFAndn(N, DAG, Subtarget);
51211   case X86ISD::FXOR:
51212   case X86ISD::FOR:         return combineFOr(N, DAG, DCI, Subtarget);
51213   case X86ISD::FMIN:
51214   case X86ISD::FMAX:        return combineFMinFMax(N, DAG);
51215   case ISD::FMINNUM:
51216   case ISD::FMAXNUM:        return combineFMinNumFMaxNum(N, DAG, Subtarget);
51217   case X86ISD::CVTSI2P:
51218   case X86ISD::CVTUI2P:     return combineX86INT_TO_FP(N, DAG, DCI);
51219   case X86ISD::CVTP2SI:
51220   case X86ISD::CVTP2UI:
51221   case X86ISD::STRICT_CVTTP2SI:
51222   case X86ISD::CVTTP2SI:
51223   case X86ISD::STRICT_CVTTP2UI:
51224   case X86ISD::CVTTP2UI:
51225                             return combineCVTP2I_CVTTP2I(N, DAG, DCI);
51226   case X86ISD::STRICT_CVTPH2PS:
51227   case X86ISD::CVTPH2PS:    return combineCVTPH2PS(N, DAG, DCI);
51228   case X86ISD::BT:          return combineBT(N, DAG, DCI);
51229   case ISD::ANY_EXTEND:
51230   case ISD::ZERO_EXTEND:    return combineZext(N, DAG, DCI, Subtarget);
51231   case ISD::SIGN_EXTEND:    return combineSext(N, DAG, DCI, Subtarget);
51232   case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
51233   case ISD::ANY_EXTEND_VECTOR_INREG:
51234   case ISD::SIGN_EXTEND_VECTOR_INREG:
51235   case ISD::ZERO_EXTEND_VECTOR_INREG:
51236     return combineEXTEND_VECTOR_INREG(N, DAG, DCI, Subtarget);
51237   case ISD::SETCC:          return combineSetCC(N, DAG, DCI, Subtarget);
51238   case X86ISD::SETCC:       return combineX86SetCC(N, DAG, Subtarget);
51239   case X86ISD::BRCOND:      return combineBrCond(N, DAG, Subtarget);
51240   case X86ISD::PACKSS:
51241   case X86ISD::PACKUS:      return combineVectorPack(N, DAG, DCI, Subtarget);
51242   case X86ISD::HADD:
51243   case X86ISD::HSUB:
51244   case X86ISD::FHADD:
51245   case X86ISD::FHSUB:       return combineVectorHADDSUB(N, DAG, DCI, Subtarget);
51246   case X86ISD::VSHL:
51247   case X86ISD::VSRA:
51248   case X86ISD::VSRL:
51249     return combineVectorShiftVar(N, DAG, DCI, Subtarget);
51250   case X86ISD::VSHLI:
51251   case X86ISD::VSRAI:
51252   case X86ISD::VSRLI:
51253     return combineVectorShiftImm(N, DAG, DCI, Subtarget);
51254   case ISD::INSERT_VECTOR_ELT:
51255   case X86ISD::PINSRB:
51256   case X86ISD::PINSRW:      return combineVectorInsert(N, DAG, DCI, Subtarget);
51257   case X86ISD::SHUFP:       // Handle all target specific shuffles
51258   case X86ISD::INSERTPS:
51259   case X86ISD::EXTRQI:
51260   case X86ISD::INSERTQI:
51261   case X86ISD::VALIGN:
51262   case X86ISD::PALIGNR:
51263   case X86ISD::VSHLDQ:
51264   case X86ISD::VSRLDQ:
51265   case X86ISD::BLENDI:
51266   case X86ISD::UNPCKH:
51267   case X86ISD::UNPCKL:
51268   case X86ISD::MOVHLPS:
51269   case X86ISD::MOVLHPS:
51270   case X86ISD::PSHUFB:
51271   case X86ISD::PSHUFD:
51272   case X86ISD::PSHUFHW:
51273   case X86ISD::PSHUFLW:
51274   case X86ISD::MOVSHDUP:
51275   case X86ISD::MOVSLDUP:
51276   case X86ISD::MOVDDUP:
51277   case X86ISD::MOVSS:
51278   case X86ISD::MOVSD:
51279   case X86ISD::VBROADCAST:
51280   case X86ISD::VPPERM:
51281   case X86ISD::VPERMI:
51282   case X86ISD::VPERMV:
51283   case X86ISD::VPERMV3:
51284   case X86ISD::VPERMIL2:
51285   case X86ISD::VPERMILPI:
51286   case X86ISD::VPERMILPV:
51287   case X86ISD::VPERM2X128:
51288   case X86ISD::SHUF128:
51289   case X86ISD::VZEXT_MOVL:
51290   case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI, Subtarget);
51291   case X86ISD::FMADD_RND:
51292   case X86ISD::FMSUB:
51293   case X86ISD::STRICT_FMSUB:
51294   case X86ISD::FMSUB_RND:
51295   case X86ISD::FNMADD:
51296   case X86ISD::STRICT_FNMADD:
51297   case X86ISD::FNMADD_RND:
51298   case X86ISD::FNMSUB:
51299   case X86ISD::STRICT_FNMSUB:
51300   case X86ISD::FNMSUB_RND:
51301   case ISD::FMA:
51302   case ISD::STRICT_FMA:     return combineFMA(N, DAG, DCI, Subtarget);
51303   case X86ISD::FMADDSUB_RND:
51304   case X86ISD::FMSUBADD_RND:
51305   case X86ISD::FMADDSUB:
51306   case X86ISD::FMSUBADD:    return combineFMADDSUB(N, DAG, DCI);
51307   case X86ISD::MOVMSK:      return combineMOVMSK(N, DAG, DCI, Subtarget);
51308   case X86ISD::MGATHER:
51309   case X86ISD::MSCATTER:    return combineX86GatherScatter(N, DAG, DCI);
51310   case ISD::MGATHER:
51311   case ISD::MSCATTER:       return combineGatherScatter(N, DAG, DCI);
51312   case X86ISD::PCMPEQ:
51313   case X86ISD::PCMPGT:      return combineVectorCompare(N, DAG, Subtarget);
51314   case X86ISD::PMULDQ:
51315   case X86ISD::PMULUDQ:     return combinePMULDQ(N, DAG, DCI, Subtarget);
51316   case X86ISD::KSHIFTL:
51317   case X86ISD::KSHIFTR:     return combineKSHIFT(N, DAG, DCI);
51318   case ISD::FP16_TO_FP:     return combineFP16_TO_FP(N, DAG, Subtarget);
51319   case ISD::STRICT_FP_EXTEND:
51320   case ISD::FP_EXTEND:      return combineFP_EXTEND(N, DAG, Subtarget);
51321   case ISD::FP_ROUND:       return combineFP_ROUND(N, DAG, Subtarget);
51322   case X86ISD::VBROADCAST_LOAD:
51323   case X86ISD::SUBV_BROADCAST_LOAD: return combineBROADCAST_LOAD(N, DAG, DCI);
51324   case X86ISD::MOVDQ2Q:     return combineMOVDQ2Q(N, DAG);
51325   case X86ISD::PDEP:        return combinePDEP(N, DAG, DCI);
51326   }
51327 
51328   return SDValue();
51329 }
51330 
51331 bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
51332   if (!isTypeLegal(VT))
51333     return false;
51334 
51335   // There are no vXi8 shifts.
51336   if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
51337     return false;
51338 
51339   // TODO: Almost no 8-bit ops are desirable because they have no actual
51340   //       size/speed advantages vs. 32-bit ops, but they do have a major
51341   //       potential disadvantage by causing partial register stalls.
51342   //
51343   // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and
51344   // we have specializations to turn 32-bit multiply/shl into LEA or other ops.
51345   // Also, see the comment in "IsDesirableToPromoteOp" - where we additionally
51346   // check for a constant operand to the multiply.
51347   if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8)
51348     return false;
51349 
51350   // i16 instruction encodings are longer and some i16 instructions are slow,
51351   // so those are not desirable.
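  // For example, a legal (add i16 %x, %y) is reported as undesirable here so
  // that the DAG combiner may promote it to a 32-bit add instead (see
  // IsDesirableToPromoteOp below for the matching promotion logic).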
51352   if (VT == MVT::i16) {
51353     switch (Opc) {
51354     default:
51355       break;
51356     case ISD::LOAD:
51357     case ISD::SIGN_EXTEND:
51358     case ISD::ZERO_EXTEND:
51359     case ISD::ANY_EXTEND:
51360     case ISD::SHL:
51361     case ISD::SRA:
51362     case ISD::SRL:
51363     case ISD::SUB:
51364     case ISD::ADD:
51365     case ISD::MUL:
51366     case ISD::AND:
51367     case ISD::OR:
51368     case ISD::XOR:
51369       return false;
51370     }
51371   }
51372 
51373   // Any legal type not explicitly accounted for above here is desirable.
51374   return true;
51375 }
51376 
51377 SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc& dl,
51378                                                   SDValue Value, SDValue Addr,
51379                                                   SelectionDAG &DAG) const {
51380   const Module *M = DAG.getMachineFunction().getMMI().getModule();
51381   Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
51382   if (IsCFProtectionSupported) {
51383     // If control-flow branch protection is enabled, we need to add a notrack
51384     // prefix to the indirect branch. To do that we create an NT_BRIND SDNode.
51385     // During ISel, the pattern will convert it to a jmp with the NoTrack
51386     // prefix.
51387     return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, Value, Addr);
51388   }
51389 
51390   return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, DAG);
51391 }
51392 
51393 bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
51394   EVT VT = Op.getValueType();
51395   bool Is8BitMulByConstant = VT == MVT::i8 && Op.getOpcode() == ISD::MUL &&
51396                              isa<ConstantSDNode>(Op.getOperand(1));
51397 
51398   // i16 is legal, but undesirable since i16 instruction encodings are longer
51399   // and some i16 instructions are slow.
51400   // 8-bit multiply-by-constant can usually be expanded to something cheaper
51401   // using LEA and/or other ALU ops.
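  // Illustrative outcome: an (add i16 %a, %b) or a (mul i8 %x, 3) is typically
  // promoted, i.e. this returns true with PVT = MVT::i32; the checks below
  // only bail out when promotion would block load folding or a foldable
  // (atomic) read-modify-write pattern.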
51402   if (VT != MVT::i16 && !Is8BitMulByConstant)
51403     return false;
51404 
51405   auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
51406     if (!Op.hasOneUse())
51407       return false;
51408     SDNode *User = *Op->use_begin();
51409     if (!ISD::isNormalStore(User))
51410       return false;
51411     auto *Ld = cast<LoadSDNode>(Load);
51412     auto *St = cast<StoreSDNode>(User);
51413     return Ld->getBasePtr() == St->getBasePtr();
51414   };
51415 
51416   auto IsFoldableAtomicRMW = [](SDValue Load, SDValue Op) {
51417     if (!Load.hasOneUse() || Load.getOpcode() != ISD::ATOMIC_LOAD)
51418       return false;
51419     if (!Op.hasOneUse())
51420       return false;
51421     SDNode *User = *Op->use_begin();
51422     if (User->getOpcode() != ISD::ATOMIC_STORE)
51423       return false;
51424     auto *Ld = cast<AtomicSDNode>(Load);
51425     auto *St = cast<AtomicSDNode>(User);
51426     return Ld->getBasePtr() == St->getBasePtr();
51427   };
51428 
51429   bool Commute = false;
51430   switch (Op.getOpcode()) {
51431   default: return false;
51432   case ISD::SIGN_EXTEND:
51433   case ISD::ZERO_EXTEND:
51434   case ISD::ANY_EXTEND:
51435     break;
51436   case ISD::SHL:
51437   case ISD::SRA:
51438   case ISD::SRL: {
51439     SDValue N0 = Op.getOperand(0);
51440     // Look out for (store (shl (load), x)).
51441     if (MayFoldLoad(N0) && IsFoldableRMW(N0, Op))
51442       return false;
51443     break;
51444   }
51445   case ISD::ADD:
51446   case ISD::MUL:
51447   case ISD::AND:
51448   case ISD::OR:
51449   case ISD::XOR:
51450     Commute = true;
51451     LLVM_FALLTHROUGH;
51452   case ISD::SUB: {
51453     SDValue N0 = Op.getOperand(0);
51454     SDValue N1 = Op.getOperand(1);
51455     // Avoid disabling potential load folding opportunities.
51456     if (MayFoldLoad(N1) &&
51457         (!Commute || !isa<ConstantSDNode>(N0) ||
51458          (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
51459       return false;
51460     if (MayFoldLoad(N0) &&
51461         ((Commute && !isa<ConstantSDNode>(N1)) ||
51462          (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
51463       return false;
51464     if (IsFoldableAtomicRMW(N0, Op) ||
51465         (Commute && IsFoldableAtomicRMW(N1, Op)))
51466       return false;
51467   }
51468   }
51469 
51470   PVT = MVT::i32;
51471   return true;
51472 }
51473 
51474 //===----------------------------------------------------------------------===//
51475 //                           X86 Inline Assembly Support
51476 //===----------------------------------------------------------------------===//
51477 
51478 // Helper to match a string separated by whitespace.
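// Illustrative behavior (derived from the logic below):
//   matchAsm(" bswap  $0", {"bswap", "$0"}) returns true, while
//   matchAsm("bswapl $0", {"bswap", "$0"}) returns false because "bswap" only
//   matches a complete whitespace-separated piece, not a prefix of "bswapl".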
51479 static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
51480   S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
51481 
51482   for (StringRef Piece : Pieces) {
51483     if (!S.startswith(Piece)) // Check if the piece matches.
51484       return false;
51485 
51486     S = S.substr(Piece.size());
51487     StringRef::size_type Pos = S.find_first_not_of(" \t");
51488     if (Pos == 0) // We matched a prefix.
51489       return false;
51490 
51491     S = S.substr(Pos);
51492   }
51493 
51494   return S.empty();
51495 }
51496 
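// Returns true if the constraint pieces (already split on ',' and sorted by
// the caller) clobber the flag registers. Illustrative accepted sets:
//   {"~{cc}", "~{flags}", "~{fpsr}"}
//   {"~{cc}", "~{dirflag}", "~{flags}", "~{fpsr}"}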
51497 static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
51498 
51499   if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
51500     if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
51501         std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
51502         std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
51503 
51504       if (AsmPieces.size() == 3)
51505         return true;
51506       else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
51507         return true;
51508     }
51509   }
51510   return false;
51511 }
51512 
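// Recognize trivial byte-swapping inline asm and replace it with a call to the
// llvm.bswap intrinsic so later passes can optimize it. A minimal illustrative
// C-level source that hits the single-statement case below is (assuming the
// usual front-end lowering to asm string "bswap $0" with constraints "=r,0"):
//   asm("bswap %0" : "+r"(x));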
51513 bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
51514   InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand());
51515 
51516   const std::string &AsmStr = IA->getAsmString();
51517 
51518   IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
51519   if (!Ty || Ty->getBitWidth() % 16 != 0)
51520     return false;
51521 
51522   // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
51523   SmallVector<StringRef, 4> AsmPieces;
51524   SplitString(AsmStr, AsmPieces, ";\n");
51525 
51526   switch (AsmPieces.size()) {
51527   default: return false;
51528   case 1:
51529     // FIXME: this should verify that we are targeting a 486 or better.  If not,
51530     // we will turn this bswap into something that will be lowered to logical
51531     // ops instead of emitting the bswap asm.  For now, we don't support 486 or
51532     // lower so don't worry about this.
51533     // bswap $0
51534     if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
51535         matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
51536         matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
51537         matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
51538         matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
51539         matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
51540       // No need to check constraints, nothing other than the equivalent of
51541       // "=r,0" would be valid here.
51542       return IntrinsicLowering::LowerToByteSwap(CI);
51543     }
51544 
51545     // rorw $$8, ${0:w}  -->  llvm.bswap.i16
51546     if (CI->getType()->isIntegerTy(16) &&
51547         IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
51548         (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
51549          matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
51550       AsmPieces.clear();
51551       StringRef ConstraintsStr = IA->getConstraintString();
51552       SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
51553       array_pod_sort(AsmPieces.begin(), AsmPieces.end());
51554       if (clobbersFlagRegisters(AsmPieces))
51555         return IntrinsicLowering::LowerToByteSwap(CI);
51556     }
51557     break;
51558   case 3:
51559     if (CI->getType()->isIntegerTy(32) &&
51560         IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
51561         matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
51562         matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
51563         matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
51564       AsmPieces.clear();
51565       StringRef ConstraintsStr = IA->getConstraintString();
51566       SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
51567       array_pod_sort(AsmPieces.begin(), AsmPieces.end());
51568       if (clobbersFlagRegisters(AsmPieces))
51569         return IntrinsicLowering::LowerToByteSwap(CI);
51570     }
51571 
51572     if (CI->getType()->isIntegerTy(64)) {
51573       InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
51574       if (Constraints.size() >= 2 &&
51575           Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
51576           Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
51577         // bswap %eax / bswap %edx / xchgl %eax, %edx  -> llvm.bswap.i64
51578         if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
51579             matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
51580             matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
51581           return IntrinsicLowering::LowerToByteSwap(CI);
51582       }
51583     }
51584     break;
51585   }
51586   return false;
51587 }
51588 
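// Map an "@cc<cond>" flag-output constraint (seen here in its braced form,
// e.g. "{@ccc}") to the corresponding X86 condition code. Illustrative C-level
// use of such a constraint (assumed front-end syntax):
//   bool carry;
//   asm("addl %2, %1" : "=@ccc"(carry), "+r"(x) : "r"(y));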
51589 static X86::CondCode parseConstraintCode(llvm::StringRef Constraint) {
51590   X86::CondCode Cond = StringSwitch<X86::CondCode>(Constraint)
51591                            .Case("{@cca}", X86::COND_A)
51592                            .Case("{@ccae}", X86::COND_AE)
51593                            .Case("{@ccb}", X86::COND_B)
51594                            .Case("{@ccbe}", X86::COND_BE)
51595                            .Case("{@ccc}", X86::COND_B)
51596                            .Case("{@cce}", X86::COND_E)
51597                            .Case("{@ccz}", X86::COND_E)
51598                            .Case("{@ccg}", X86::COND_G)
51599                            .Case("{@ccge}", X86::COND_GE)
51600                            .Case("{@ccl}", X86::COND_L)
51601                            .Case("{@ccle}", X86::COND_LE)
51602                            .Case("{@ccna}", X86::COND_BE)
51603                            .Case("{@ccnae}", X86::COND_B)
51604                            .Case("{@ccnb}", X86::COND_AE)
51605                            .Case("{@ccnbe}", X86::COND_A)
51606                            .Case("{@ccnc}", X86::COND_AE)
51607                            .Case("{@ccne}", X86::COND_NE)
51608                            .Case("{@ccnz}", X86::COND_NE)
51609                            .Case("{@ccng}", X86::COND_LE)
51610                            .Case("{@ccnge}", X86::COND_L)
51611                            .Case("{@ccnl}", X86::COND_GE)
51612                            .Case("{@ccnle}", X86::COND_G)
51613                            .Case("{@ccno}", X86::COND_NO)
51614                            .Case("{@ccnp}", X86::COND_NP)
51615                            .Case("{@ccns}", X86::COND_NS)
51616                            .Case("{@cco}", X86::COND_O)
51617                            .Case("{@ccp}", X86::COND_P)
51618                            .Case("{@ccs}", X86::COND_S)
51619                            .Default(X86::COND_INVALID);
51620   return Cond;
51621 }
51622 
51623 /// Given a constraint letter, return the type of constraint for this target.
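/// For example, 'a' names a specific register ([ER]AX) and is C_Register, 'I'
/// is an immediate-range constraint and is C_Immediate, and plain 'r' falls
/// through to the generic TargetLowering handling (C_RegisterClass).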
51624 X86TargetLowering::ConstraintType
51625 X86TargetLowering::getConstraintType(StringRef Constraint) const {
51626   if (Constraint.size() == 1) {
51627     switch (Constraint[0]) {
51628     case 'R':
51629     case 'q':
51630     case 'Q':
51631     case 'f':
51632     case 't':
51633     case 'u':
51634     case 'y':
51635     case 'x':
51636     case 'v':
51637     case 'l':
51638     case 'k': // AVX512 masking registers.
51639       return C_RegisterClass;
51640     case 'a':
51641     case 'b':
51642     case 'c':
51643     case 'd':
51644     case 'S':
51645     case 'D':
51646     case 'A':
51647       return C_Register;
51648     case 'I':
51649     case 'J':
51650     case 'K':
51651     case 'N':
51652     case 'G':
51653     case 'L':
51654     case 'M':
51655       return C_Immediate;
51656     case 'C':
51657     case 'e':
51658     case 'Z':
51659       return C_Other;
51660     default:
51661       break;
51662     }
51663   }
51664   else if (Constraint.size() == 2) {
51665     switch (Constraint[0]) {
51666     default:
51667       break;
51668     case 'Y':
51669       switch (Constraint[1]) {
51670       default:
51671         break;
51672       case 'z':
51673         return C_Register;
51674       case 'i':
51675       case 'm':
51676       case 'k':
51677       case 't':
51678       case '2':
51679         return C_RegisterClass;
51680       }
51681     }
51682   } else if (parseConstraintCode(Constraint) != X86::COND_INVALID)
51683     return C_Other;
51684   return TargetLowering::getConstraintType(Constraint);
51685 }
51686 
51687 /// Examine constraint type and operand type and determine a weight value.
51688 /// This object must already have been set up with the operand type
51689 /// and the current alternative constraint selected.
51690 TargetLowering::ConstraintWeight
51691   X86TargetLowering::getSingleConstraintMatchWeight(
51692     AsmOperandInfo &info, const char *constraint) const {
51693   ConstraintWeight weight = CW_Invalid;
51694   Value *CallOperandVal = info.CallOperandVal;
51695   // If we don't have a value, we can't do a match,
51696   // but allow it at the lowest weight.
51697   if (!CallOperandVal)
51698     return CW_Default;
51699   Type *type = CallOperandVal->getType();
51700   // Look at the constraint type.
51701   switch (*constraint) {
51702   default:
51703     weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
51704     LLVM_FALLTHROUGH;
51705   case 'R':
51706   case 'q':
51707   case 'Q':
51708   case 'a':
51709   case 'b':
51710   case 'c':
51711   case 'd':
51712   case 'S':
51713   case 'D':
51714   case 'A':
51715     if (CallOperandVal->getType()->isIntegerTy())
51716       weight = CW_SpecificReg;
51717     break;
51718   case 'f':
51719   case 't':
51720   case 'u':
51721     if (type->isFloatingPointTy())
51722       weight = CW_SpecificReg;
51723     break;
51724   case 'y':
51725     if (type->isX86_MMXTy() && Subtarget.hasMMX())
51726       weight = CW_SpecificReg;
51727     break;
51728   case 'Y':
51729     if (StringRef(constraint).size() != 2)
51730       break;
51731     switch (constraint[1]) {
51732       default:
51733         return CW_Invalid;
51734       // XMM0
51735       case 'z':
51736         if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
51737             ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()) ||
51738             ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()))
51739           return CW_SpecificReg;
51740         return CW_Invalid;
51741       // Conditional OpMask regs (AVX512)
51742       case 'k':
51743         if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
51744           return CW_Register;
51745         return CW_Invalid;
51746       // Any MMX reg
51747       case 'm':
51748         if (type->isX86_MMXTy() && Subtarget.hasMMX())
51749           return weight;
51750         return CW_Invalid;
51751       // Any SSE reg when ISA >= SSE2, same as 'x'
51752       case 'i':
51753       case 't':
51754       case '2':
51755         if (!Subtarget.hasSSE2())
51756           return CW_Invalid;
51757         break;
51758     }
51759     break;
51760   case 'v':
51761     if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
51762       weight = CW_Register;
51763     LLVM_FALLTHROUGH;
51764   case 'x':
51765     if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
51766         ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
51767       weight = CW_Register;
51768     break;
51769   case 'k':
51770     // Enable conditional vector operations using %k<#> registers.
51771     if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
51772       weight = CW_Register;
51773     break;
51774   case 'I':
51775     if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
51776       if (C->getZExtValue() <= 31)
51777         weight = CW_Constant;
51778     }
51779     break;
51780   case 'J':
51781     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
51782       if (C->getZExtValue() <= 63)
51783         weight = CW_Constant;
51784     }
51785     break;
51786   case 'K':
51787     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
51788       if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
51789         weight = CW_Constant;
51790     }
51791     break;
51792   case 'L':
51793     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
51794       if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
51795         weight = CW_Constant;
51796     }
51797     break;
51798   case 'M':
51799     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
51800       if (C->getZExtValue() <= 3)
51801         weight = CW_Constant;
51802     }
51803     break;
51804   case 'N':
51805     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
51806       if (C->getZExtValue() <= 0xff)
51807         weight = CW_Constant;
51808     }
51809     break;
51810   case 'G':
51811   case 'C':
51812     if (isa<ConstantFP>(CallOperandVal)) {
51813       weight = CW_Constant;
51814     }
51815     break;
51816   case 'e':
51817     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
51818       if ((C->getSExtValue() >= -0x80000000LL) &&
51819           (C->getSExtValue() <= 0x7fffffffLL))
51820         weight = CW_Constant;
51821     }
51822     break;
51823   case 'Z':
51824     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
51825       if (C->getZExtValue() <= 0xffffffff)
51826         weight = CW_Constant;
51827     }
51828     break;
51829   }
51830   return weight;
51831 }
51832 
51833 /// Try to replace an X constraint, which matches anything, with another that
51834 /// has more specific requirements based on the type of the corresponding
51835 /// operand.
51836 const char *X86TargetLowering::
51837 LowerXConstraint(EVT ConstraintVT) const {
51838   // FP X constraints get lowered to SSE1/2 registers if available, otherwise
51839   // 'f' like normal targets.
51840   if (ConstraintVT.isFloatingPoint()) {
51841     if (Subtarget.hasSSE1())
51842       return "x";
51843   }
51844 
51845   return TargetLowering::LowerXConstraint(ConstraintVT);
51846 }
51847 
51848 // Lower @cc targets via setcc.
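// For example, an "=@ccz" output is lowered to a CopyFromReg of EFLAGS, an
// X86ISD::SETCC with COND_E (i.e. a SETE), and a zero-extension to the
// operand's integer type, as built below.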
51849 SDValue X86TargetLowering::LowerAsmOutputForConstraint(
51850     SDValue &Chain, SDValue &Flag, const SDLoc &DL,
51851     const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
51852   X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
51853   if (Cond == X86::COND_INVALID)
51854     return SDValue();
51855   // Check that return type is valid.
51856   if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
51857       OpInfo.ConstraintVT.getSizeInBits() < 8)
51858     report_fatal_error("Flag output operand is of invalid type");
51859 
51860   // Get EFLAGS register. Only update chain when copyfrom is glued.
51861   if (Flag.getNode()) {
51862     Flag = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, Flag);
51863     Chain = Flag.getValue(1);
51864   } else
51865     Flag = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32);
51866   // Extract CC code.
51867   SDValue CC = getSETCC(Cond, Flag, DL, DAG);
51868   // Zero-extend the CC result to the constraint's integer type.
51869   SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
51870 
51871   return Result;
51872 }
51873 
51874 /// Lower the specified operand into the Ops vector.
51875 /// If it is invalid, don't add anything to Ops.
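/// e.g. for constraint 'N' (an 8-bit unsigned immediate) a constant operand of
/// 200 is accepted and pushed as a target constant, while 300 is rejected and
/// nothing is added, so the generic code can diagnose it.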
51876 void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
51877                                                      std::string &Constraint,
51878                                                      std::vector<SDValue>&Ops,
51879                                                      SelectionDAG &DAG) const {
51880   SDValue Result;
51881 
51882   // Only support length 1 constraints for now.
51883   if (Constraint.length() > 1) return;
51884 
51885   char ConstraintLetter = Constraint[0];
51886   switch (ConstraintLetter) {
51887   default: break;
51888   case 'I':
51889     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
51890       if (C->getZExtValue() <= 31) {
51891         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
51892                                        Op.getValueType());
51893         break;
51894       }
51895     }
51896     return;
51897   case 'J':
51898     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
51899       if (C->getZExtValue() <= 63) {
51900         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
51901                                        Op.getValueType());
51902         break;
51903       }
51904     }
51905     return;
51906   case 'K':
51907     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
51908       if (isInt<8>(C->getSExtValue())) {
51909         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
51910                                        Op.getValueType());
51911         break;
51912       }
51913     }
51914     return;
51915   case 'L':
51916     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
51917       if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
51918           (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
51919         Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
51920                                        Op.getValueType());
51921         break;
51922       }
51923     }
51924     return;
51925   case 'M':
51926     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
51927       if (C->getZExtValue() <= 3) {
51928         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
51929                                        Op.getValueType());
51930         break;
51931       }
51932     }
51933     return;
51934   case 'N':
51935     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
51936       if (C->getZExtValue() <= 255) {
51937         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
51938                                        Op.getValueType());
51939         break;
51940       }
51941     }
51942     return;
51943   case 'O':
51944     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
51945       if (C->getZExtValue() <= 127) {
51946         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
51947                                        Op.getValueType());
51948         break;
51949       }
51950     }
51951     return;
51952   case 'e': {
51953     // 32-bit signed value
51954     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
51955       if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
51956                                            C->getSExtValue())) {
51957         // Widen to 64 bits here to get it sign extended.
51958         Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
51959         break;
51960       }
51961     // FIXME gcc accepts some relocatable values here too, but only in certain
51962     // memory models; it's complicated.
51963     }
51964     return;
51965   }
51966   case 'Z': {
51967     // 32-bit unsigned value
51968     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
51969       if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
51970                                            C->getZExtValue())) {
51971         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
51972                                        Op.getValueType());
51973         break;
51974       }
51975     }
51976     // FIXME gcc accepts some relocatable values here too, but only in certain
51977     // memory models; it's complicated.
51978     return;
51979   }
51980   case 'i': {
51981     // Literal immediates are always ok.
51982     if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
51983       bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;
51984       BooleanContent BCont = getBooleanContents(MVT::i64);
51985       ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)
51986                                     : ISD::SIGN_EXTEND;
51987       int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()
51988                                                   : CST->getSExtValue();
51989       Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64);
51990       break;
51991     }
51992 
51993     // In any sort of PIC mode addresses need to be computed at runtime by
51994     // adding in a register or some sort of table lookup.  These can't
51995     // be used as immediates.
51996     if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC())
51997       return;
51998 
51999     // If we are in non-pic codegen mode, we allow the address of a global (with
52000     // an optional displacement) to be used with 'i'.
52001     if (auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
52002       // If we require an extra load to get this address, as in PIC mode, we
52003       // can't accept it.
52004       if (isGlobalStubReference(
52005               Subtarget.classifyGlobalReference(GA->getGlobal())))
52006         return;
52007     break;
52008   }
52009   }
52010 
52011   if (Result.getNode()) {
52012     Ops.push_back(Result);
52013     return;
52014   }
52015   return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
52016 }
52017 
52018 /// Check if \p RC is a general purpose register class.
52019 /// I.e., GR* or one of their variant.
52020 static bool isGRClass(const TargetRegisterClass &RC) {
52021   return RC.hasSuperClassEq(&X86::GR8RegClass) ||
52022          RC.hasSuperClassEq(&X86::GR16RegClass) ||
52023          RC.hasSuperClassEq(&X86::GR32RegClass) ||
52024          RC.hasSuperClassEq(&X86::GR64RegClass) ||
52025          RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
52026 }
52027 
52028 /// Check if \p RC is a vector register class.
52029 /// I.e., FR* / VR* or one of their variant.
52030 static bool isFRClass(const TargetRegisterClass &RC) {
52031   return RC.hasSuperClassEq(&X86::FR32XRegClass) ||
52032          RC.hasSuperClassEq(&X86::FR64XRegClass) ||
52033          RC.hasSuperClassEq(&X86::VR128XRegClass) ||
52034          RC.hasSuperClassEq(&X86::VR256XRegClass) ||
52035          RC.hasSuperClassEq(&X86::VR512RegClass);
52036 }
52037 
52038 /// Check if \p RC is a mask register class.
52039 /// I.e., VK* or one of their variant.
52040 static bool isVKClass(const TargetRegisterClass &RC) {
52041   return RC.hasSuperClassEq(&X86::VK1RegClass) ||
52042          RC.hasSuperClassEq(&X86::VK2RegClass) ||
52043          RC.hasSuperClassEq(&X86::VK4RegClass) ||
52044          RC.hasSuperClassEq(&X86::VK8RegClass) ||
52045          RC.hasSuperClassEq(&X86::VK16RegClass) ||
52046          RC.hasSuperClassEq(&X86::VK32RegClass) ||
52047          RC.hasSuperClassEq(&X86::VK64RegClass);
52048 }
52049 
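// For example, constraint "x" with VT == MVT::v4f32 resolves to VR128RegClass,
// "k" with MVT::i16 (and AVX512) to VK16RegClass, and an explicit "{ax}"
// paired with MVT::i32 is canonicalized to EAX by the size-matching code at
// the end of this function.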
52050 std::pair<unsigned, const TargetRegisterClass *>
52051 X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
52052                                                 StringRef Constraint,
52053                                                 MVT VT) const {
52054   // First, see if this is a constraint that directly corresponds to an LLVM
52055   // register class.
52056   if (Constraint.size() == 1) {
52057     // GCC Constraint Letters
52058     switch (Constraint[0]) {
52059     default: break;
52060     // 'A' means [ER]AX + [ER]DX.
52061     case 'A':
52062       if (Subtarget.is64Bit())
52063         return std::make_pair(X86::RAX, &X86::GR64_ADRegClass);
52064       assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
52065              "Expecting 64, 32 or 16 bit subtarget");
52066       return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
52067 
52068       // TODO: Slight differences here in allocation order and leaving
52069       // RIP in the class. Do they matter any more here than they do
52070       // in the normal allocation?
52071     case 'k':
52072       if (Subtarget.hasAVX512()) {
52073         if (VT == MVT::i1)
52074           return std::make_pair(0U, &X86::VK1RegClass);
52075         if (VT == MVT::i8)
52076           return std::make_pair(0U, &X86::VK8RegClass);
52077         if (VT == MVT::i16)
52078           return std::make_pair(0U, &X86::VK16RegClass);
52079       }
52080       if (Subtarget.hasBWI()) {
52081         if (VT == MVT::i32)
52082           return std::make_pair(0U, &X86::VK32RegClass);
52083         if (VT == MVT::i64)
52084           return std::make_pair(0U, &X86::VK64RegClass);
52085       }
52086       break;
52087     case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
52088       if (Subtarget.is64Bit()) {
52089         if (VT == MVT::i8 || VT == MVT::i1)
52090           return std::make_pair(0U, &X86::GR8RegClass);
52091         if (VT == MVT::i16)
52092           return std::make_pair(0U, &X86::GR16RegClass);
52093         if (VT == MVT::i32 || VT == MVT::f32)
52094           return std::make_pair(0U, &X86::GR32RegClass);
52095         if (VT != MVT::f80 && !VT.isVector())
52096           return std::make_pair(0U, &X86::GR64RegClass);
52097         break;
52098       }
52099       LLVM_FALLTHROUGH;
52100       // 32-bit fallthrough
52101     case 'Q':   // Q_REGS
52102       if (VT == MVT::i8 || VT == MVT::i1)
52103         return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
52104       if (VT == MVT::i16)
52105         return std::make_pair(0U, &X86::GR16_ABCDRegClass);
52106       if (VT == MVT::i32 || VT == MVT::f32 ||
52107           (!VT.isVector() && !Subtarget.is64Bit()))
52108         return std::make_pair(0U, &X86::GR32_ABCDRegClass);
52109       if (VT != MVT::f80 && !VT.isVector())
52110         return std::make_pair(0U, &X86::GR64_ABCDRegClass);
52111       break;
52112     case 'r':   // GENERAL_REGS
52113     case 'l':   // INDEX_REGS
52114       if (VT == MVT::i8 || VT == MVT::i1)
52115         return std::make_pair(0U, &X86::GR8RegClass);
52116       if (VT == MVT::i16)
52117         return std::make_pair(0U, &X86::GR16RegClass);
52118       if (VT == MVT::i32 || VT == MVT::f32 ||
52119           (!VT.isVector() && !Subtarget.is64Bit()))
52120         return std::make_pair(0U, &X86::GR32RegClass);
52121       if (VT != MVT::f80 && !VT.isVector())
52122         return std::make_pair(0U, &X86::GR64RegClass);
52123       break;
52124     case 'R':   // LEGACY_REGS
52125       if (VT == MVT::i8 || VT == MVT::i1)
52126         return std::make_pair(0U, &X86::GR8_NOREXRegClass);
52127       if (VT == MVT::i16)
52128         return std::make_pair(0U, &X86::GR16_NOREXRegClass);
52129       if (VT == MVT::i32 || VT == MVT::f32 ||
52130           (!VT.isVector() && !Subtarget.is64Bit()))
52131         return std::make_pair(0U, &X86::GR32_NOREXRegClass);
52132       if (VT != MVT::f80 && !VT.isVector())
52133         return std::make_pair(0U, &X86::GR64_NOREXRegClass);
52134       break;
52135     case 'f':  // FP Stack registers.
52136       // If SSE is enabled for this VT, use f80 to ensure the isel moves the
52137       // value to the correct fpstack register class.
52138       if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
52139         return std::make_pair(0U, &X86::RFP32RegClass);
52140       if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
52141         return std::make_pair(0U, &X86::RFP64RegClass);
52142       if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80)
52143         return std::make_pair(0U, &X86::RFP80RegClass);
52144       break;
52145     case 'y':   // MMX_REGS if MMX allowed.
52146       if (!Subtarget.hasMMX()) break;
52147       return std::make_pair(0U, &X86::VR64RegClass);
52148     case 'v':
52149     case 'x':   // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
52150       if (!Subtarget.hasSSE1()) break;
52151       bool VConstraint = (Constraint[0] == 'v');
52152 
52153       switch (VT.SimpleTy) {
52154       default: break;
52155       // Scalar SSE types.
52156       case MVT::f32:
52157       case MVT::i32:
52158         if (VConstraint && Subtarget.hasVLX())
52159           return std::make_pair(0U, &X86::FR32XRegClass);
52160         return std::make_pair(0U, &X86::FR32RegClass);
52161       case MVT::f64:
52162       case MVT::i64:
52163         if (VConstraint && Subtarget.hasVLX())
52164           return std::make_pair(0U, &X86::FR64XRegClass);
52165         return std::make_pair(0U, &X86::FR64RegClass);
52166       case MVT::i128:
52167         if (Subtarget.is64Bit()) {
52168           if (VConstraint && Subtarget.hasVLX())
52169             return std::make_pair(0U, &X86::VR128XRegClass);
52170           return std::make_pair(0U, &X86::VR128RegClass);
52171         }
52172         break;
52173       // Vector types and fp128.
52174       case MVT::f128:
52175       case MVT::v16i8:
52176       case MVT::v8i16:
52177       case MVT::v4i32:
52178       case MVT::v2i64:
52179       case MVT::v4f32:
52180       case MVT::v2f64:
52181         if (VConstraint && Subtarget.hasVLX())
52182           return std::make_pair(0U, &X86::VR128XRegClass);
52183         return std::make_pair(0U, &X86::VR128RegClass);
52184       // AVX types.
52185       case MVT::v32i8:
52186       case MVT::v16i16:
52187       case MVT::v8i32:
52188       case MVT::v4i64:
52189       case MVT::v8f32:
52190       case MVT::v4f64:
52191         if (VConstraint && Subtarget.hasVLX())
52192           return std::make_pair(0U, &X86::VR256XRegClass);
52193         if (Subtarget.hasAVX())
52194           return std::make_pair(0U, &X86::VR256RegClass);
52195         break;
52196       case MVT::v64i8:
52197       case MVT::v32i16:
52198       case MVT::v8f64:
52199       case MVT::v16f32:
52200       case MVT::v16i32:
52201       case MVT::v8i64:
52202         if (!Subtarget.hasAVX512()) break;
52203         if (VConstraint)
52204           return std::make_pair(0U, &X86::VR512RegClass);
52205         return std::make_pair(0U, &X86::VR512_0_15RegClass);
52206       }
52207       break;
52208     }
52209   } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
52210     switch (Constraint[1]) {
52211     default:
52212       break;
52213     case 'i':
52214     case 't':
52215     case '2':
52216       return getRegForInlineAsmConstraint(TRI, "x", VT);
52217     case 'm':
52218       if (!Subtarget.hasMMX()) break;
52219       return std::make_pair(0U, &X86::VR64RegClass);
52220     case 'z':
52221       if (!Subtarget.hasSSE1()) break;
52222       switch (VT.SimpleTy) {
52223       default: break;
52224       // Scalar SSE types.
52225       case MVT::f32:
52226       case MVT::i32:
52227         return std::make_pair(X86::XMM0, &X86::FR32RegClass);
52228       case MVT::f64:
52229       case MVT::i64:
52230         return std::make_pair(X86::XMM0, &X86::FR64RegClass);
52231       case MVT::f128:
52232       case MVT::v16i8:
52233       case MVT::v8i16:
52234       case MVT::v4i32:
52235       case MVT::v2i64:
52236       case MVT::v4f32:
52237       case MVT::v2f64:
52238         return std::make_pair(X86::XMM0, &X86::VR128RegClass);
52239       // AVX types.
52240       case MVT::v32i8:
52241       case MVT::v16i16:
52242       case MVT::v8i32:
52243       case MVT::v4i64:
52244       case MVT::v8f32:
52245       case MVT::v4f64:
52246         if (Subtarget.hasAVX())
52247           return std::make_pair(X86::YMM0, &X86::VR256RegClass);
52248         break;
52249       case MVT::v64i8:
52250       case MVT::v32i16:
52251       case MVT::v8f64:
52252       case MVT::v16f32:
52253       case MVT::v16i32:
52254       case MVT::v8i64:
52255         if (Subtarget.hasAVX512())
52256           return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
52257         break;
52258       }
52259       break;
52260     case 'k':
52261       // This register class doesn't allocate k0 for masked vector operation.
52262       if (Subtarget.hasAVX512()) {
52263         if (VT == MVT::i1)
52264           return std::make_pair(0U, &X86::VK1WMRegClass);
52265         if (VT == MVT::i8)
52266           return std::make_pair(0U, &X86::VK8WMRegClass);
52267         if (VT == MVT::i16)
52268           return std::make_pair(0U, &X86::VK16WMRegClass);
52269       }
52270       if (Subtarget.hasBWI()) {
52271         if (VT == MVT::i32)
52272           return std::make_pair(0U, &X86::VK32WMRegClass);
52273         if (VT == MVT::i64)
52274           return std::make_pair(0U, &X86::VK64WMRegClass);
52275       }
52276       break;
52277     }
52278   }
52279 
52280   if (parseConstraintCode(Constraint) != X86::COND_INVALID)
52281     return std::make_pair(0U, &X86::GR32RegClass);
52282 
52283   // Use the default implementation in TargetLowering to convert the register
52284   // constraint into a member of a register class.
52285   std::pair<Register, const TargetRegisterClass*> Res;
52286   Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
52287 
52288   // Not found as a standard register?
52289   if (!Res.second) {
52290     // Only match x87 registers if the VT is one SelectionDAGBuilder can convert
52291     // to/from f80.
52292     if (VT == MVT::Other || VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80) {
52293       // Map st(0) -> st(7) -> ST0
52294       if (Constraint.size() == 7 && Constraint[0] == '{' &&
52295           tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' &&
52296           Constraint[3] == '(' &&
52297           (Constraint[4] >= '0' && Constraint[4] <= '7') &&
52298           Constraint[5] == ')' && Constraint[6] == '}') {
52299         // st(7) is not allocatable and thus not a member of RFP80. Return
52300         // singleton class in cases where we have a reference to it.
52301         if (Constraint[4] == '7')
52302           return std::make_pair(X86::FP7, &X86::RFP80_7RegClass);
52303         return std::make_pair(X86::FP0 + Constraint[4] - '0',
52304                               &X86::RFP80RegClass);
52305       }
52306 
52307       // GCC allows "st(0)" to be called just plain "st".
52308       if (StringRef("{st}").equals_insensitive(Constraint))
52309         return std::make_pair(X86::FP0, &X86::RFP80RegClass);
52310     }
52311 
52312     // flags -> EFLAGS
52313     if (StringRef("{flags}").equals_insensitive(Constraint))
52314       return std::make_pair(X86::EFLAGS, &X86::CCRRegClass);
52315 
52316     // dirflag -> DF
52317     // Only allow for clobber.
52318     if (StringRef("{dirflag}").equals_insensitive(Constraint) &&
52319         VT == MVT::Other)
52320       return std::make_pair(X86::DF, &X86::DFCCRRegClass);
52321 
52322     // fpsr -> FPSW
52323     if (StringRef("{fpsr}").equals_insensitive(Constraint))
52324       return std::make_pair(X86::FPSW, &X86::FPCCRRegClass);
52325 
52326     return Res;
52327   }
52328 
52329   // Make sure it isn't a register that requires 64-bit mode.
52330   if (!Subtarget.is64Bit() &&
52331       (isFRClass(*Res.second) || isGRClass(*Res.second)) &&
52332       TRI->getEncodingValue(Res.first) >= 8) {
52333     // Register requires REX prefix, but we're in 32-bit mode.
52334     return std::make_pair(0, nullptr);
52335   }
52336 
52337   // Make sure it isn't a register that requires AVX512.
52338   if (!Subtarget.hasAVX512() && isFRClass(*Res.second) &&
52339       TRI->getEncodingValue(Res.first) & 0x10) {
52340     // Register requires EVEX prefix.
52341     return std::make_pair(0, nullptr);
52342   }
52343 
52344   // Otherwise, check to see if this is a register class of the wrong value
52345   // type.  For example, we want to map "{ax},i32" -> {eax}, we don't want it to
52346   // turn into {ax},{dx}.
52347   // MVT::Other is used to specify clobber names.
52348   if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
52349     return Res;   // Correct type already, nothing to do.
52350 
52351   // Get a matching integer of the correct size. i.e. "ax" with MVT::i32 should
52352   // return "eax". This should even work for things like getting 64-bit integer
52353   // registers when given an f64 type.
52354   const TargetRegisterClass *Class = Res.second;
52355   // The generic code will match the first register class that contains the
52356   // given register. Thus, based on the ordering of the tablegened file,
52357   // the "plain" GR classes might not come first.
52358   // Therefore, use a helper method.
52359   if (isGRClass(*Class)) {
52360     unsigned Size = VT.getSizeInBits();
52361     if (Size == 1) Size = 8;
52362     Register DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
52363     if (DestReg > 0) {
52364       bool is64Bit = Subtarget.is64Bit();
52365       const TargetRegisterClass *RC =
52366           Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
52367         : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
52368         : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
52369         : Size == 64 ? (is64Bit ? &X86::GR64RegClass : nullptr)
52370         : nullptr;
52371       if (Size == 64 && !is64Bit) {
52372         // Model GCC's behavior here and select a fixed pair of 32-bit
52373         // registers.
52374         switch (DestReg) {
52375         case X86::RAX:
52376           return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
52377         case X86::RDX:
52378           return std::make_pair(X86::EDX, &X86::GR32_DCRegClass);
52379         case X86::RCX:
52380           return std::make_pair(X86::ECX, &X86::GR32_CBRegClass);
52381         case X86::RBX:
52382           return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass);
52383         case X86::RSI:
52384           return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass);
52385         case X86::RDI:
52386           return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass);
52387         case X86::RBP:
52388           return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass);
52389         default:
52390           return std::make_pair(0, nullptr);
52391         }
52392       }
52393       if (RC && RC->contains(DestReg))
52394         return std::make_pair(DestReg, RC);
52395       return Res;
52396     }
52397     // No register found/type mismatch.
52398     return std::make_pair(0, nullptr);
52399   } else if (isFRClass(*Class)) {
52400     // Handle references to XMM physical registers that got mapped into the
52401     // wrong class.  This can happen with constraints like {xmm0} where the
52402     // target independent register mapper will just pick the first match it can
52403     // find, ignoring the required type.
52404 
52405     // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
52406     if (VT == MVT::f32 || VT == MVT::i32)
52407       Res.second = &X86::FR32XRegClass;
52408     else if (VT == MVT::f64 || VT == MVT::i64)
52409       Res.second = &X86::FR64XRegClass;
52410     else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT))
52411       Res.second = &X86::VR128XRegClass;
52412     else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT))
52413       Res.second = &X86::VR256XRegClass;
52414     else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
52415       Res.second = &X86::VR512RegClass;
52416     else {
52417       // Type mismatch and not a clobber: return an error.
52418       Res.first = 0;
52419       Res.second = nullptr;
52420     }
52421   } else if (isVKClass(*Class)) {
52422     if (VT == MVT::i1)
52423       Res.second = &X86::VK1RegClass;
52424     else if (VT == MVT::i8)
52425       Res.second = &X86::VK8RegClass;
52426     else if (VT == MVT::i16)
52427       Res.second = &X86::VK16RegClass;
52428     else if (VT == MVT::i32)
52429       Res.second = &X86::VK32RegClass;
52430     else if (VT == MVT::i64)
52431       Res.second = &X86::VK64RegClass;
52432     else {
52433       // Type mismatch and not a clobber: return an error.
52434       Res.first = 0;
52435       Res.second = nullptr;
52436     }
52437   }
52438 
52439   return Res;
52440 }
52441 
52442 InstructionCost X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
52443                                                         const AddrMode &AM,
52444                                                         Type *Ty,
52445                                                         unsigned AS) const {
52446   // Scaling factors are not free at all.
52447   // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
52448   // will take 2 allocations in the out of order engine instead of 1
52449   // for plain addressing mode, i.e. inst (reg1).
52450   // E.g.,
52451   // vaddps (%rsi,%rdx), %ymm0, %ymm1
52452   // Requires two allocations (one for the load, one for the computation)
52453   // whereas:
52454   // vaddps (%rsi), %ymm0, %ymm1
52455   // Requires just 1 allocation, i.e., freeing allocations for other operations
52456   // and having less micro operations to execute.
52457   //
52458   // For some X86 architectures, this is even worse because for instance for
52459   // stores, the complex addressing mode forces the instruction to use the
52460   // "load" ports instead of the dedicated "store" port.
52461   // E.g., on Haswell:
52462   // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
52463   // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
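  // Concretely: with a legal addressing mode, a base-register-only access
  // costs 0 here and any mode that uses a scaled index register costs 1; an
  // illegal mode reports a negative cost.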
52464   if (isLegalAddressingMode(DL, AM, Ty, AS))
52465     // Scale represents reg2 * scale, thus account for 1
52466     // as soon as we use a second register.
52467     return AM.Scale != 0;
52468   return -1;
52469 }
52470 
52471 bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
52472   // Integer division on x86 is expensive. However, when aggressively optimizing
52473   // for code size, we prefer to use a div instruction, as it is usually smaller
52474   // than the alternative sequence.
52475   // The exception to this is vector division. Since x86 doesn't have vector
52476   // integer division, leaving the division as-is is a loss even in terms of
52477   // size, because it will have to be scalarized, while the alternative code
52478   // sequence can be performed in vector form.
52479   bool OptSize = Attr.hasFnAttribute(Attribute::MinSize);
52480   return OptSize && !VT.isVector();
52481 }
52482 
52483 void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
52484   if (!Subtarget.is64Bit())
52485     return;
52486 
52487   // Update IsSplitCSR in X86MachineFunctionInfo.
52488   X86MachineFunctionInfo *AFI =
52489       Entry->getParent()->getInfo<X86MachineFunctionInfo>();
52490   AFI->setIsSplitCSR(true);
52491 }
52492 
52493 void X86TargetLowering::insertCopiesSplitCSR(
52494     MachineBasicBlock *Entry,
52495     const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
52496   const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
52497   const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
52498   if (!IStart)
52499     return;
52500 
52501   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
52502   MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
52503   MachineBasicBlock::iterator MBBI = Entry->begin();
52504   for (const MCPhysReg *I = IStart; *I; ++I) {
52505     const TargetRegisterClass *RC = nullptr;
52506     if (X86::GR64RegClass.contains(*I))
52507       RC = &X86::GR64RegClass;
52508     else
52509       llvm_unreachable("Unexpected register class in CSRsViaCopy!");
52510 
52511     Register NewVR = MRI->createVirtualRegister(RC);
52512     // Create copy from CSR to a virtual register.
52513     // FIXME: this currently does not emit CFI pseudo-instructions, it works
52514     // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
52515     // nounwind. If we want to generalize this later, we may need to emit
52516     // CFI pseudo-instructions.
52517     assert(
52518         Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) &&
52519         "Function should be nounwind in insertCopiesSplitCSR!");
52520     Entry->addLiveIn(*I);
52521     BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
52522         .addReg(*I);
52523 
52524     // Insert the copy-back instructions right before the terminator.
52525     for (auto *Exit : Exits)
52526       BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
52527               TII->get(TargetOpcode::COPY), *I)
52528           .addReg(NewVR);
52529   }
52530 }
52531 
52532 bool X86TargetLowering::supportSwiftError() const {
52533   return Subtarget.is64Bit();
52534 }
52535 
52536 /// Returns true if stack probing through a function call is requested.
52537 bool X86TargetLowering::hasStackProbeSymbol(MachineFunction &MF) const {
52538   return !getStackProbeSymbolName(MF).empty();
52539 }
52540 
52541 /// Returns true if stack probing through inline assembly is requested.
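/// e.g. a function carrying the attribute "probe-stack"="inline-asm" returns
/// true here unless it targets Windows or opts out via "no-stack-arg-probe".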
52542 bool X86TargetLowering::hasInlineStackProbe(MachineFunction &MF) const {
52543 
52544   // No inline stack probe for Windows; it has its own mechanism.
52545   if (Subtarget.isOSWindows() ||
52546       MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
52547     return false;
52548 
52549   // If the function specifically requests inline stack probes, emit them.
52550   if (MF.getFunction().hasFnAttribute("probe-stack"))
52551     return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
52552            "inline-asm";
52553 
52554   return false;
52555 }
52556 
52557 /// Returns the name of the symbol used to emit stack probes or the empty
52558 /// string if not applicable.
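/// e.g. 64-bit MSVC-style targets get "__chkstk", 64-bit CygMing targets get
/// "___chkstk_ms", and non-Windows targets with no "probe-stack" attribute get
/// an empty string (no probe call).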
52559 StringRef
52560 X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {
52561   // Inline stack probes disable the stack probe call.
52562   if (hasInlineStackProbe(MF))
52563     return "";
52564 
52565   // If the function specifically requests stack probes, emit them.
52566   if (MF.getFunction().hasFnAttribute("probe-stack"))
52567     return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
52568 
52569   // Generally, if we aren't on Windows, the platform ABI does not include
52570   // support for stack probes, so don't emit them.
52571   if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO() ||
52572       MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
52573     return "";
52574 
52575   // We need a stack probe to conform to the Windows ABI. Choose the right
52576   // symbol.
52577   if (Subtarget.is64Bit())
52578     return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
52579   return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
52580 }
52581 
52582 unsigned
52583 X86TargetLowering::getStackProbeSize(MachineFunction &MF) const {
52584   // The default stack probe size is 4096 if the function has no
52585   // "stack-probe-size" attribute.
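  // Illustrative: a function with attribute "stack-probe-size"="8192" gets a
  // probe size of 8192 here; without the attribute the 4096 default is used.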
52586   unsigned StackProbeSize = 4096;
52587   const Function &Fn = MF.getFunction();
52588   if (Fn.hasFnAttribute("stack-probe-size"))
52589     Fn.getFnAttribute("stack-probe-size")
52590         .getValueAsString()
52591         .getAsInteger(0, StackProbeSize);
52592   return StackProbeSize;
52593 }
52594 
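// Illustrative: if ExperimentalPrefInnermostLoopAlignment is explicitly set to
// 5, innermost loop headers are aligned to 1 << 5 = 32 bytes; all other loops
// keep the generic TargetLowering preference.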
52595 Align X86TargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
52596   if (ML->isInnermost() &&
52597       ExperimentalPrefInnermostLoopAlignment.getNumOccurrences())
52598     return Align(1ULL << ExperimentalPrefInnermostLoopAlignment);
52599   return TargetLowering::getPrefLoopAlignment();
52600 }
52601