1 //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines the interfaces that X86 uses to lower LLVM code into a
10 // selection DAG.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "X86ISelLowering.h"
15 #include "MCTargetDesc/X86ShuffleDecode.h"
16 #include "X86.h"
17 #include "X86CallingConv.h"
18 #include "X86FrameLowering.h"
19 #include "X86InstrBuilder.h"
20 #include "X86IntrinsicsInfo.h"
21 #include "X86MachineFunctionInfo.h"
22 #include "X86TargetMachine.h"
23 #include "X86TargetObjectFile.h"
24 #include "llvm/ADT/SmallBitVector.h"
25 #include "llvm/ADT/SmallSet.h"
26 #include "llvm/ADT/Statistic.h"
27 #include "llvm/ADT/StringExtras.h"
28 #include "llvm/ADT/StringSwitch.h"
29 #include "llvm/Analysis/BlockFrequencyInfo.h"
30 #include "llvm/Analysis/EHPersonalities.h"
31 #include "llvm/Analysis/ObjCARCUtil.h"
32 #include "llvm/Analysis/ProfileSummaryInfo.h"
33 #include "llvm/Analysis/VectorUtils.h"
34 #include "llvm/CodeGen/IntrinsicLowering.h"
35 #include "llvm/CodeGen/MachineFrameInfo.h"
36 #include "llvm/CodeGen/MachineFunction.h"
37 #include "llvm/CodeGen/MachineInstrBuilder.h"
38 #include "llvm/CodeGen/MachineJumpTableInfo.h"
39 #include "llvm/CodeGen/MachineLoopInfo.h"
40 #include "llvm/CodeGen/MachineModuleInfo.h"
41 #include "llvm/CodeGen/MachineRegisterInfo.h"
42 #include "llvm/CodeGen/TargetLowering.h"
43 #include "llvm/CodeGen/WinEHFuncInfo.h"
44 #include "llvm/IR/CallingConv.h"
45 #include "llvm/IR/Constants.h"
46 #include "llvm/IR/DerivedTypes.h"
47 #include "llvm/IR/DiagnosticInfo.h"
48 #include "llvm/IR/Function.h"
49 #include "llvm/IR/GlobalAlias.h"
50 #include "llvm/IR/GlobalVariable.h"
51 #include "llvm/IR/Instructions.h"
52 #include "llvm/IR/Intrinsics.h"
53 #include "llvm/MC/MCAsmInfo.h"
54 #include "llvm/MC/MCContext.h"
55 #include "llvm/MC/MCExpr.h"
56 #include "llvm/MC/MCSymbol.h"
57 #include "llvm/Support/CommandLine.h"
58 #include "llvm/Support/Debug.h"
59 #include "llvm/Support/ErrorHandling.h"
60 #include "llvm/Support/KnownBits.h"
61 #include "llvm/Support/MathExtras.h"
62 #include "llvm/Target/TargetOptions.h"
63 #include <algorithm>
64 #include <bitset>
65 #include <cctype>
66 #include <numeric>
67 using namespace llvm;
68 
69 #define DEBUG_TYPE "x86-isel"
70 
71 STATISTIC(NumTailCalls, "Number of tail calls");
72 
73 static cl::opt<int> ExperimentalPrefLoopAlignment(
74     "x86-experimental-pref-loop-alignment", cl::init(4),
75     cl::desc(
76         "Sets the preferable loop alignment for experiments (as log2 bytes) "
77         "(the last x86-experimental-pref-loop-alignment bits"
78         " of the loop header PC will be 0)."),
79     cl::Hidden);
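// For example, the default value of 4 requests 2^4 = 16-byte alignment for
// loop headers, i.e. their low 4 address bits will be zero.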
80 
81 static cl::opt<int> ExperimentalPrefInnermostLoopAlignment(
82     "x86-experimental-pref-innermost-loop-alignment", cl::init(4),
83     cl::desc(
84         "Sets the preferable loop alignment for experiments (as log2 bytes) "
85         "for innermost loops only. If specified, this option overrides "
86         "alignment set by x86-experimental-pref-loop-alignment."),
87     cl::Hidden);
88 
89 static cl::opt<bool> MulConstantOptimization(
90     "mul-constant-optimization", cl::init(true),
91     cl::desc("Replace 'mul x, Const' with more effective instructions like "
92              "SHIFT, LEA, etc."),
93     cl::Hidden);
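// For example, 'x * 9' can usually be formed with a single LEA computing
// x + x*8, and 'x * 5' with x + x*4, instead of an actual IMUL.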
94 
95 static cl::opt<bool> ExperimentalUnorderedISEL(
96     "x86-experimental-unordered-atomic-isel", cl::init(false),
97     cl::desc("Use LoadSDNode and StoreSDNode instead of "
98              "AtomicSDNode for unordered atomic loads and "
99              "stores respectively."),
100     cl::Hidden);
101 
102 /// Call this when the user attempts to do something unsupported, like
103 /// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
104 /// report_fatal_error, so calling code should attempt to recover without
105 /// crashing.
106 static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
107                              const char *Msg) {
108   MachineFunction &MF = DAG.getMachineFunction();
109   DAG.getContext()->diagnose(
110       DiagnosticInfoUnsupported(MF.getFunction(), Msg, dl.getDebugLoc()));
111 }
112 
113 X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
114                                      const X86Subtarget &STI)
115     : TargetLowering(TM), Subtarget(STI) {
116   bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
117   X86ScalarSSEf64 = Subtarget.hasSSE2();
118   X86ScalarSSEf32 = Subtarget.hasSSE1();
119   MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
120 
121   // Set up the TargetLowering object.
122 
123   // X86 is weird. It always uses i8 for shift amounts and setcc results.
124   setBooleanContents(ZeroOrOneBooleanContent);
125   // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
126   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
127 
128   // For 64-bit, since we have so many registers, use the ILP scheduler.
129   // For 32-bit, use the register pressure specific scheduling.
130   // For Atom, always use ILP scheduling.
131   if (Subtarget.isAtom())
132     setSchedulingPreference(Sched::ILP);
133   else if (Subtarget.is64Bit())
134     setSchedulingPreference(Sched::ILP);
135   else
136     setSchedulingPreference(Sched::RegPressure);
137   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
138   setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
139 
140   // Bypass expensive divides and use cheaper ones.
141   if (TM.getOptLevel() >= CodeGenOpt::Default) {
142     if (Subtarget.hasSlowDivide32())
143       addBypassSlowDiv(32, 8);
144     if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
145       addBypassSlowDiv(64, 32);
146   }
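  // Rough illustration of the bypass (an assumption about the transform, not
  // code in this file): addBypassSlowDiv(32, 8) lets a 32-bit division be
  // guarded by a runtime check and serviced by the much cheaper 8-bit DIV when
  // both operands fit in 8 bits, approximately:
  //   if (((A | B) & ~0xFFu) == 0) Q = (uint8_t)A / (uint8_t)B;
  //   else                         Q = A / B;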
147 
148   // Setup Windows compiler runtime calls.
149   if (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()) {
150     static const struct {
151       const RTLIB::Libcall Op;
152       const char * const Name;
153       const CallingConv::ID CC;
154     } LibraryCalls[] = {
155       { RTLIB::SDIV_I64, "_alldiv", CallingConv::X86_StdCall },
156       { RTLIB::UDIV_I64, "_aulldiv", CallingConv::X86_StdCall },
157       { RTLIB::SREM_I64, "_allrem", CallingConv::X86_StdCall },
158       { RTLIB::UREM_I64, "_aullrem", CallingConv::X86_StdCall },
159       { RTLIB::MUL_I64, "_allmul", CallingConv::X86_StdCall },
160     };
161 
162     for (const auto &LC : LibraryCalls) {
163       setLibcallName(LC.Op, LC.Name);
164       setLibcallCallingConv(LC.Op, LC.CC);
165     }
166   }
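  // For example, a 64-bit 'sdiv' that must become a library call on these
  // targets is emitted as a stdcall invocation of _alldiv instead of the
  // default runtime helper name.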
167 
168   if (Subtarget.getTargetTriple().isOSMSVCRT()) {
169     // MSVCRT doesn't have powi; fall back to pow
170     setLibcallName(RTLIB::POWI_F32, nullptr);
171     setLibcallName(RTLIB::POWI_F64, nullptr);
172   }
173 
174   // If we don't have cmpxchg8b (meaning this is a 386/486), limit atomic size to
175   // 32 bits so the AtomicExpandPass will expand it and we don't need cmpxchg8b.
176   // FIXME: Should we be limiting the atomic size on other configs? Default is
177   // 1024.
178   if (!Subtarget.hasCmpxchg8b())
179     setMaxAtomicSizeInBitsSupported(32);
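  // With the 32-bit cap, wider atomics (e.g. a 64-bit atomic load on a 486)
  // are presumably turned into __atomic_* library calls by AtomicExpandPass
  // instead of relying on cmpxchg8b.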
180 
181   // Set up the register classes.
182   addRegisterClass(MVT::i8, &X86::GR8RegClass);
183   addRegisterClass(MVT::i16, &X86::GR16RegClass);
184   addRegisterClass(MVT::i32, &X86::GR32RegClass);
185   if (Subtarget.is64Bit())
186     addRegisterClass(MVT::i64, &X86::GR64RegClass);
187 
188   for (MVT VT : MVT::integer_valuetypes())
189     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
190 
191   // We don't accept any truncstore of integer registers.
192   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
193   setTruncStoreAction(MVT::i64, MVT::i16, Expand);
194   setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
195   setTruncStoreAction(MVT::i32, MVT::i16, Expand);
196   setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
197   setTruncStoreAction(MVT::i16, MVT::i8,  Expand);
198 
199   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
200 
201   // SETOEQ and SETUNE require checking two conditions.
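  // (For example, after UCOMISS/UCOMISD an ordered-equal test needs ZF set and
  // PF clear, since PF signals an unordered result, so one flag check is not
  // enough.)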
202   for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
203     setCondCodeAction(ISD::SETOEQ, VT, Expand);
204     setCondCodeAction(ISD::SETUNE, VT, Expand);
205   }
206 
207   // Integer absolute.
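  // With CMOV available, abs(x) is expected to lower to a NEG plus a CMOV that
  // keeps the non-negative value (a sketch of the intent, not a guarantee).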
208   if (Subtarget.hasCMov()) {
209     setOperationAction(ISD::ABS            , MVT::i16  , Custom);
210     setOperationAction(ISD::ABS            , MVT::i32  , Custom);
211     if (Subtarget.is64Bit())
212       setOperationAction(ISD::ABS          , MVT::i64  , Custom);
213   }
214 
215   // Funnel shifts.
216   for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
217     // For slow shld targets we only lower for code size.
218     LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;
219 
220     setOperationAction(ShiftOp             , MVT::i8   , Custom);
221     setOperationAction(ShiftOp             , MVT::i16  , Custom);
222     setOperationAction(ShiftOp             , MVT::i32  , ShiftDoubleAction);
223     if (Subtarget.is64Bit())
224       setOperationAction(ShiftOp           , MVT::i64  , ShiftDoubleAction);
225   }
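  // i32/i64 funnel shifts correspond to the double-shift instructions
  // SHLD/SHRD, which is why they stay Legal unless the subtarget reports SHLD
  // as slow.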
226 
227   if (!Subtarget.useSoftFloat()) {
228     // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
229     // operation.
230     setOperationAction(ISD::UINT_TO_FP,        MVT::i8, Promote);
231     setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i8, Promote);
232     setOperationAction(ISD::UINT_TO_FP,        MVT::i16, Promote);
233     setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i16, Promote);
234     // We have an algorithm for SSE2, and we turn this into a 64-bit
235     // FILD or VCVTUSI2SS/SD for other targets.
236     setOperationAction(ISD::UINT_TO_FP,        MVT::i32, Custom);
237     setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
238     // We have an algorithm for SSE2->double, and we turn this into a
239     // 64-bit FILD followed by conditional FADD for other targets.
240     setOperationAction(ISD::UINT_TO_FP,        MVT::i64, Custom);
241     setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);
242 
243     // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
244     // this operation.
245     setOperationAction(ISD::SINT_TO_FP,        MVT::i8, Promote);
246     setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i8, Promote);
247     // SSE has no i16 to fp conversion, only i32. We promote in the handler
248     // to allow f80 to use i16 and f64 to use i16 with SSE1 only.
249     setOperationAction(ISD::SINT_TO_FP,        MVT::i16, Custom);
250     setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i16, Custom);
251     // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
252     setOperationAction(ISD::SINT_TO_FP,        MVT::i32, Custom);
253     setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
254     // In 32-bit mode these are custom lowered. In 64-bit mode f32 and f64
255     // are Legal; f80 is custom lowered.
256     setOperationAction(ISD::SINT_TO_FP,        MVT::i64, Custom);
257     setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);
258 
259     // Promote i8 FP_TO_SINT to larger FP_TO_SINT's, as X86 doesn't have
260     // this operation.
261     setOperationAction(ISD::FP_TO_SINT,        MVT::i8,  Promote);
262     // FIXME: This doesn't generate invalid exception when it should. PR44019.
263     setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i8,  Promote);
264     setOperationAction(ISD::FP_TO_SINT,        MVT::i16, Custom);
265     setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i16, Custom);
266     setOperationAction(ISD::FP_TO_SINT,        MVT::i32, Custom);
267     setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
268     // In 32-bit mode these are custom lowered. In 64-bit mode f32 and f64
269     // are Legal; f80 is custom lowered.
270     setOperationAction(ISD::FP_TO_SINT,        MVT::i64, Custom);
271     setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);
272 
273     // Handle FP_TO_UINT by promoting the destination to a larger signed
274     // conversion.
275     setOperationAction(ISD::FP_TO_UINT,        MVT::i8,  Promote);
276     // FIXME: This doesn't generate invalid exception when it should. PR44019.
277     setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i8,  Promote);
278     setOperationAction(ISD::FP_TO_UINT,        MVT::i16, Promote);
279     // FIXME: This doesn't generate invalid exception when it should. PR44019.
280     setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i16, Promote);
281     setOperationAction(ISD::FP_TO_UINT,        MVT::i32, Custom);
282     setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
283     setOperationAction(ISD::FP_TO_UINT,        MVT::i64, Custom);
284     setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);
285 
286     setOperationAction(ISD::LRINT,             MVT::f32, Custom);
287     setOperationAction(ISD::LRINT,             MVT::f64, Custom);
288     setOperationAction(ISD::LLRINT,            MVT::f32, Custom);
289     setOperationAction(ISD::LLRINT,            MVT::f64, Custom);
290 
291     if (!Subtarget.is64Bit()) {
292       setOperationAction(ISD::LRINT,  MVT::i64, Custom);
293       setOperationAction(ISD::LLRINT, MVT::i64, Custom);
294     }
295   }
296 
297   if (Subtarget.hasSSE2()) {
298     // Custom lowering for saturating float to int conversions.
299     // We handle promotion to larger result types manually.
300     for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
301       setOperationAction(ISD::FP_TO_UINT_SAT, VT, Custom);
302       setOperationAction(ISD::FP_TO_SINT_SAT, VT, Custom);
303     }
304     if (Subtarget.is64Bit()) {
305       setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom);
306       setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom);
307     }
308   }
309 
310   // Handle address space casts between mixed sized pointers.
311   setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
312   setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
313 
314   // TODO: when we have SSE, these could be more efficient, by using movd/movq.
315   if (!X86ScalarSSEf64) {
316     setOperationAction(ISD::BITCAST        , MVT::f32  , Expand);
317     setOperationAction(ISD::BITCAST        , MVT::i32  , Expand);
318     if (Subtarget.is64Bit()) {
319       setOperationAction(ISD::BITCAST      , MVT::f64  , Expand);
320       // Without SSE, i64->f64 goes through memory.
321       setOperationAction(ISD::BITCAST      , MVT::i64  , Expand);
322     }
323   } else if (!Subtarget.is64Bit())
324     setOperationAction(ISD::BITCAST      , MVT::i64  , Custom);
325 
326   // Scalar integer divide and remainder are lowered to use operations that
327   // produce two results, to match the available instructions. This exposes
328   // the two-result form to trivial CSE, which is able to combine x/y and x%y
329   // into a single instruction.
330   //
331   // Scalar integer multiply-high is also lowered to use two-result
332   // operations, to match the available instructions. However, plain multiply
333   // (low) operations are left as Legal, as there are single-result
334   // instructions for this in x86. Using the two-result multiply instructions
335   // when both high and low results are needed must be arranged by dagcombine.
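  // For example, computing both A / B and A % B on the same operands can then
  // be served by a single DIV/IDIV, which produces the quotient and remainder
  // together.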
336   for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
337     setOperationAction(ISD::MULHS, VT, Expand);
338     setOperationAction(ISD::MULHU, VT, Expand);
339     setOperationAction(ISD::SDIV, VT, Expand);
340     setOperationAction(ISD::UDIV, VT, Expand);
341     setOperationAction(ISD::SREM, VT, Expand);
342     setOperationAction(ISD::UREM, VT, Expand);
343   }
344 
345   setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
346   setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
347   for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
348                    MVT::i8,  MVT::i16, MVT::i32, MVT::i64 }) {
349     setOperationAction(ISD::BR_CC,     VT, Expand);
350     setOperationAction(ISD::SELECT_CC, VT, Expand);
351   }
352   if (Subtarget.is64Bit())
353     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
354   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
355   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
356   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
357 
358   setOperationAction(ISD::FREM             , MVT::f32  , Expand);
359   setOperationAction(ISD::FREM             , MVT::f64  , Expand);
360   setOperationAction(ISD::FREM             , MVT::f80  , Expand);
361   setOperationAction(ISD::FREM             , MVT::f128 , Expand);
362 
363   if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) {
364     setOperationAction(ISD::FLT_ROUNDS_    , MVT::i32  , Custom);
365     setOperationAction(ISD::SET_ROUNDING   , MVT::Other, Custom);
366   }
367 
368   // Promote the i8 variants and force them on up to i32 which has a shorter
369   // encoding.
370   setOperationPromotedToType(ISD::CTTZ           , MVT::i8   , MVT::i32);
371   setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
372 
373   if (Subtarget.hasBMI()) {
374     // Promote the i16 zero undef variant and force it on up to i32 when tzcnt
375     // is enabled.
376     setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i16, MVT::i32);
377   } else {
378     setOperationAction(ISD::CTTZ, MVT::i16, Custom);
379     setOperationAction(ISD::CTTZ           , MVT::i32  , Custom);
380     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16  , Legal);
381     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32  , Legal);
382     if (Subtarget.is64Bit()) {
383       setOperationAction(ISD::CTTZ         , MVT::i64  , Custom);
384       setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
385     }
386   }
387 
388   if (Subtarget.hasLZCNT()) {
389     // When promoting the i8 variants, force them to i32 for a shorter
390     // encoding.
391     setOperationPromotedToType(ISD::CTLZ           , MVT::i8   , MVT::i32);
392     setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
393   } else {
394     for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
395       if (VT == MVT::i64 && !Subtarget.is64Bit())
396         continue;
397       setOperationAction(ISD::CTLZ           , VT, Custom);
398       setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Custom);
399     }
400   }
401 
402   for (auto Op : {ISD::FP16_TO_FP, ISD::STRICT_FP16_TO_FP, ISD::FP_TO_FP16,
403                   ISD::STRICT_FP_TO_FP16}) {
404     // Special handling for half-precision floating point conversions.
405     // If we don't have F16C support, then lower half float conversions
406     // into library calls.
407     setOperationAction(
408         Op, MVT::f32,
409         (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
410     // There's never any support for operations beyond MVT::f32.
411     setOperationAction(Op, MVT::f64, Expand);
412     setOperationAction(Op, MVT::f80, Expand);
413     setOperationAction(Op, MVT::f128, Expand);
414   }
415 
416   setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
417   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
418   setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
419   setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f16, Expand);
420   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
421   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
422   setTruncStoreAction(MVT::f80, MVT::f16, Expand);
423   setTruncStoreAction(MVT::f128, MVT::f16, Expand);
424 
425   setOperationAction(ISD::PARITY, MVT::i8, Custom);
426   if (Subtarget.hasPOPCNT()) {
427     setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
428   } else {
429     setOperationAction(ISD::CTPOP          , MVT::i8   , Expand);
430     setOperationAction(ISD::CTPOP          , MVT::i16  , Expand);
431     setOperationAction(ISD::CTPOP          , MVT::i32  , Expand);
432     if (Subtarget.is64Bit())
433       setOperationAction(ISD::CTPOP        , MVT::i64  , Expand);
434     else
435       setOperationAction(ISD::CTPOP        , MVT::i64  , Custom);
436 
437     setOperationAction(ISD::PARITY, MVT::i16, Custom);
438     setOperationAction(ISD::PARITY, MVT::i32, Custom);
439     if (Subtarget.is64Bit())
440       setOperationAction(ISD::PARITY, MVT::i64, Custom);
441   }
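  // A sketch of what the Custom PARITY lowering is expected to do without
  // POPCNT: XOR the value down to a single byte and read the CPU parity flag
  // (PF), e.g. via SETNP, rather than counting bits.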
442 
443   setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
444 
445   if (!Subtarget.hasMOVBE())
446     setOperationAction(ISD::BSWAP          , MVT::i16  , Expand);
447 
448   // X86 wants to expand cmov itself.
449   for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
450     setOperationAction(ISD::SELECT, VT, Custom);
451     setOperationAction(ISD::SETCC, VT, Custom);
452     setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
453     setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
454   }
455   for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
456     if (VT == MVT::i64 && !Subtarget.is64Bit())
457       continue;
458     setOperationAction(ISD::SELECT, VT, Custom);
459     setOperationAction(ISD::SETCC,  VT, Custom);
460   }
461 
462   // Custom action for SELECT MMX and expand action for SELECT_CC MMX
463   setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);
464   setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);
465 
466   setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
467   // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
468   // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
469   setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
470   setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
471   setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
472   if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
473     setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
474 
475   // Darwin ABI issue.
476   for (auto VT : { MVT::i32, MVT::i64 }) {
477     if (VT == MVT::i64 && !Subtarget.is64Bit())
478       continue;
479     setOperationAction(ISD::ConstantPool    , VT, Custom);
480     setOperationAction(ISD::JumpTable       , VT, Custom);
481     setOperationAction(ISD::GlobalAddress   , VT, Custom);
482     setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
483     setOperationAction(ISD::ExternalSymbol  , VT, Custom);
484     setOperationAction(ISD::BlockAddress    , VT, Custom);
485   }
486 
487   // 64-bit shl, sra, srl (iff 32-bit x86)
488   for (auto VT : { MVT::i32, MVT::i64 }) {
489     if (VT == MVT::i64 && !Subtarget.is64Bit())
490       continue;
491     setOperationAction(ISD::SHL_PARTS, VT, Custom);
492     setOperationAction(ISD::SRA_PARTS, VT, Custom);
493     setOperationAction(ISD::SRL_PARTS, VT, Custom);
494   }
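  // On 32-bit x86 these *_PARTS nodes realize 64-bit shifts on the two 32-bit
  // halves, typically as an SHLD/SHRD pair plus a fixup for shift amounts of
  // 32 or more (an expectation, not a literal description of the output).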
495 
496   if (Subtarget.hasSSEPrefetch() || Subtarget.has3DNow())
497     setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);
498 
499   setOperationAction(ISD::ATOMIC_FENCE  , MVT::Other, Custom);
500 
501   // Expand certain atomics
502   for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
503     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
504     setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
505     setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
506     setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
507     setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
508     setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
509     setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
510   }
511 
512   if (!Subtarget.is64Bit())
513     setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
514 
515   if (Subtarget.hasCmpxchg16b()) {
516     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
517   }
518 
519   // FIXME - use subtarget debug flags
520   if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
521       !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
522       TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
523     setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
524   }
525 
526   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
527   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
528 
529   setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
530   setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
531 
532   setOperationAction(ISD::TRAP, MVT::Other, Legal);
533   setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
534   setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);
535 
536   // VASTART needs to be custom lowered to use the VarArgsFrameIndex
537   setOperationAction(ISD::VASTART           , MVT::Other, Custom);
538   setOperationAction(ISD::VAEND             , MVT::Other, Expand);
539   bool Is64Bit = Subtarget.is64Bit();
540   setOperationAction(ISD::VAARG,  MVT::Other, Is64Bit ? Custom : Expand);
541   setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
542 
543   setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
544   setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
545 
546   setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
547 
548   // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
549   setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
550   setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
551 
552   if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
553     // f32 and f64 use SSE.
554     // Set up the FP register classes.
555     addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
556                                                      : &X86::FR32RegClass);
557     addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
558                                                      : &X86::FR64RegClass);
559 
560     // Disable f32->f64 extload as we can only generate this in one instruction
561     // under optsize. So it's easier to pattern match (fpext (load)) for that
562     // case instead of needing to emit 2 instructions for extload in the
563     // non-optsize case.
564     setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
565 
566     for (auto VT : { MVT::f32, MVT::f64 }) {
567       // Use ANDPD to simulate FABS.
568       setOperationAction(ISD::FABS, VT, Custom);
569 
570       // Use XORP to simulate FNEG.
571       setOperationAction(ISD::FNEG, VT, Custom);
572 
573       // Use ANDPD and ORPD to simulate FCOPYSIGN.
574       setOperationAction(ISD::FCOPYSIGN, VT, Custom);
575 
576       // These might be better off as horizontal vector ops.
577       setOperationAction(ISD::FADD, VT, Custom);
578       setOperationAction(ISD::FSUB, VT, Custom);
579 
580       // We don't support sin/cos/fmod
581       setOperationAction(ISD::FSIN   , VT, Expand);
582       setOperationAction(ISD::FCOS   , VT, Expand);
583       setOperationAction(ISD::FSINCOS, VT, Expand);
584     }
585 
586     // Lower this to MOVMSK plus an AND.
587     setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
588     setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
589 
590   } else if (!Subtarget.useSoftFloat() && X86ScalarSSEf32 &&
591              (UseX87 || Is64Bit)) {
592     // Use SSE for f32, x87 for f64.
593     // Set up the FP register classes.
594     addRegisterClass(MVT::f32, &X86::FR32RegClass);
595     if (UseX87)
596       addRegisterClass(MVT::f64, &X86::RFP64RegClass);
597 
598     // Use ANDPS to simulate FABS.
599     setOperationAction(ISD::FABS , MVT::f32, Custom);
600 
601     // Use XORP to simulate FNEG.
602     setOperationAction(ISD::FNEG , MVT::f32, Custom);
603 
604     if (UseX87)
605       setOperationAction(ISD::UNDEF, MVT::f64, Expand);
606 
607     // Use ANDPS and ORPS to simulate FCOPYSIGN.
608     if (UseX87)
609       setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
610     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
611 
612     // We don't support sin/cos/fmod
613     setOperationAction(ISD::FSIN   , MVT::f32, Expand);
614     setOperationAction(ISD::FCOS   , MVT::f32, Expand);
615     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
616 
617     if (UseX87) {
618       // Always expand sin/cos functions even though x87 has an instruction.
619       setOperationAction(ISD::FSIN, MVT::f64, Expand);
620       setOperationAction(ISD::FCOS, MVT::f64, Expand);
621       setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
622     }
623   } else if (UseX87) {
624     // f32 and f64 in x87.
625     // Set up the FP register classes.
626     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
627     addRegisterClass(MVT::f32, &X86::RFP32RegClass);
628 
629     for (auto VT : { MVT::f32, MVT::f64 }) {
630       setOperationAction(ISD::UNDEF,     VT, Expand);
631       setOperationAction(ISD::FCOPYSIGN, VT, Expand);
632 
633       // Always expand sin/cos functions even though x87 has an instruction.
634       setOperationAction(ISD::FSIN   , VT, Expand);
635       setOperationAction(ISD::FCOS   , VT, Expand);
636       setOperationAction(ISD::FSINCOS, VT, Expand);
637     }
638   }
639 
640   // Expand FP32 immediates into loads from the stack, save special cases.
641   if (isTypeLegal(MVT::f32)) {
642     if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
643       addLegalFPImmediate(APFloat(+0.0f)); // FLD0
644       addLegalFPImmediate(APFloat(+1.0f)); // FLD1
645       addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
646       addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
647     } else // SSE immediates.
648       addLegalFPImmediate(APFloat(+0.0f)); // xorps
649   }
650   // Expand FP64 immediates into loads from the stack, save special cases.
651   if (isTypeLegal(MVT::f64)) {
652     if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
653       addLegalFPImmediate(APFloat(+0.0)); // FLD0
654       addLegalFPImmediate(APFloat(+1.0)); // FLD1
655       addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
656       addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
657     } else // SSE immediates.
658       addLegalFPImmediate(APFloat(+0.0)); // xorpd
659   }
660   // Handle constrained floating-point operations on scalars.
661   setOperationAction(ISD::STRICT_FADD,      MVT::f32, Legal);
662   setOperationAction(ISD::STRICT_FADD,      MVT::f64, Legal);
663   setOperationAction(ISD::STRICT_FSUB,      MVT::f32, Legal);
664   setOperationAction(ISD::STRICT_FSUB,      MVT::f64, Legal);
665   setOperationAction(ISD::STRICT_FMUL,      MVT::f32, Legal);
666   setOperationAction(ISD::STRICT_FMUL,      MVT::f64, Legal);
667   setOperationAction(ISD::STRICT_FDIV,      MVT::f32, Legal);
668   setOperationAction(ISD::STRICT_FDIV,      MVT::f64, Legal);
669   setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal);
670   setOperationAction(ISD::STRICT_FP_ROUND,  MVT::f32, Legal);
671   setOperationAction(ISD::STRICT_FP_ROUND,  MVT::f64, Legal);
672   setOperationAction(ISD::STRICT_FSQRT,     MVT::f32, Legal);
673   setOperationAction(ISD::STRICT_FSQRT,     MVT::f64, Legal);
674 
675   // We don't support FMA.
676   setOperationAction(ISD::FMA, MVT::f64, Expand);
677   setOperationAction(ISD::FMA, MVT::f32, Expand);
678 
679   // f80 always uses X87.
680   if (UseX87) {
681     addRegisterClass(MVT::f80, &X86::RFP80RegClass);
682     setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
683     setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
684     {
685       APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
686       addLegalFPImmediate(TmpFlt);  // FLD0
687       TmpFlt.changeSign();
688       addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
689 
690       bool ignored;
691       APFloat TmpFlt2(+1.0);
692       TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
693                       &ignored);
694       addLegalFPImmediate(TmpFlt2);  // FLD1
695       TmpFlt2.changeSign();
696       addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
697     }
698 
699     // Always expand sin/cos functions even though x87 has an instruction.
700     setOperationAction(ISD::FSIN   , MVT::f80, Expand);
701     setOperationAction(ISD::FCOS   , MVT::f80, Expand);
702     setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
703 
704     setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
705     setOperationAction(ISD::FCEIL,  MVT::f80, Expand);
706     setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
707     setOperationAction(ISD::FRINT,  MVT::f80, Expand);
708     setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
709     setOperationAction(ISD::FMA, MVT::f80, Expand);
710     setOperationAction(ISD::LROUND, MVT::f80, Expand);
711     setOperationAction(ISD::LLROUND, MVT::f80, Expand);
712     setOperationAction(ISD::LRINT, MVT::f80, Custom);
713     setOperationAction(ISD::LLRINT, MVT::f80, Custom);
714 
715     // Handle constrained floating-point operations on scalars.
716     setOperationAction(ISD::STRICT_FADD     , MVT::f80, Legal);
717     setOperationAction(ISD::STRICT_FSUB     , MVT::f80, Legal);
718     setOperationAction(ISD::STRICT_FMUL     , MVT::f80, Legal);
719     setOperationAction(ISD::STRICT_FDIV     , MVT::f80, Legal);
720     setOperationAction(ISD::STRICT_FSQRT    , MVT::f80, Legal);
721     setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Legal);
722     // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
723     // as Custom.
724     setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Legal);
725   }
726 
727   // f128 uses xmm registers, but most operations require libcalls.
728   if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
729     addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
730                                                    : &X86::VR128RegClass);
731 
732     addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
733 
734     setOperationAction(ISD::FADD,        MVT::f128, LibCall);
735     setOperationAction(ISD::STRICT_FADD, MVT::f128, LibCall);
736     setOperationAction(ISD::FSUB,        MVT::f128, LibCall);
737     setOperationAction(ISD::STRICT_FSUB, MVT::f128, LibCall);
738     setOperationAction(ISD::FDIV,        MVT::f128, LibCall);
739     setOperationAction(ISD::STRICT_FDIV, MVT::f128, LibCall);
740     setOperationAction(ISD::FMUL,        MVT::f128, LibCall);
741     setOperationAction(ISD::STRICT_FMUL, MVT::f128, LibCall);
742     setOperationAction(ISD::FMA,         MVT::f128, LibCall);
743     setOperationAction(ISD::STRICT_FMA,  MVT::f128, LibCall);
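    // The LibCall entries above route f128 arithmetic to the soft-float
    // runtime, e.g. an f128 FADD becomes a call to the usual quad-precision
    // helper (__addtf3 in compiler-rt/libgcc terms; named here purely as an
    // illustration).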
744 
745     setOperationAction(ISD::FABS, MVT::f128, Custom);
746     setOperationAction(ISD::FNEG, MVT::f128, Custom);
747     setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
748 
749     setOperationAction(ISD::FSIN,         MVT::f128, LibCall);
750     setOperationAction(ISD::STRICT_FSIN,  MVT::f128, LibCall);
751     setOperationAction(ISD::FCOS,         MVT::f128, LibCall);
752     setOperationAction(ISD::STRICT_FCOS,  MVT::f128, LibCall);
753     setOperationAction(ISD::FSINCOS,      MVT::f128, LibCall);
754     // No STRICT_FSINCOS
755     setOperationAction(ISD::FSQRT,        MVT::f128, LibCall);
756     setOperationAction(ISD::STRICT_FSQRT, MVT::f128, LibCall);
757 
758     setOperationAction(ISD::FP_EXTEND,        MVT::f128, Custom);
759     setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f128, Custom);
760     // We need to custom handle any FP_ROUND with an f128 input, but
761     // LegalizeDAG uses the result type to know when to run a custom handler.
762     // So we have to list all legal floating point result types here.
763     if (isTypeLegal(MVT::f32)) {
764       setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
765       setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
766     }
767     if (isTypeLegal(MVT::f64)) {
768       setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
769       setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);
770     }
771     if (isTypeLegal(MVT::f80)) {
772       setOperationAction(ISD::FP_ROUND, MVT::f80, Custom);
773       setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Custom);
774     }
775 
776     setOperationAction(ISD::SETCC, MVT::f128, Custom);
777 
778     setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
779     setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
780     setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
781     setTruncStoreAction(MVT::f128, MVT::f32, Expand);
782     setTruncStoreAction(MVT::f128, MVT::f64, Expand);
783     setTruncStoreAction(MVT::f128, MVT::f80, Expand);
784   }
785 
786   // Always use a library call for pow.
787   setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
788   setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
789   setOperationAction(ISD::FPOW             , MVT::f80  , Expand);
790   setOperationAction(ISD::FPOW             , MVT::f128 , Expand);
791 
792   setOperationAction(ISD::FLOG, MVT::f80, Expand);
793   setOperationAction(ISD::FLOG2, MVT::f80, Expand);
794   setOperationAction(ISD::FLOG10, MVT::f80, Expand);
795   setOperationAction(ISD::FEXP, MVT::f80, Expand);
796   setOperationAction(ISD::FEXP2, MVT::f80, Expand);
797   setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
798   setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
799 
800   // Some FP actions are always expanded for vector types.
801   for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
802                    MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
803     setOperationAction(ISD::FSIN,      VT, Expand);
804     setOperationAction(ISD::FSINCOS,   VT, Expand);
805     setOperationAction(ISD::FCOS,      VT, Expand);
806     setOperationAction(ISD::FREM,      VT, Expand);
807     setOperationAction(ISD::FCOPYSIGN, VT, Expand);
808     setOperationAction(ISD::FPOW,      VT, Expand);
809     setOperationAction(ISD::FLOG,      VT, Expand);
810     setOperationAction(ISD::FLOG2,     VT, Expand);
811     setOperationAction(ISD::FLOG10,    VT, Expand);
812     setOperationAction(ISD::FEXP,      VT, Expand);
813     setOperationAction(ISD::FEXP2,     VT, Expand);
814   }
815 
816   // First set operation action for all vector types to either promote
817   // (for widening) or expand (for scalarization). Then we will selectively
818   // turn on ones that can be effectively codegen'd.
819   for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
820     setOperationAction(ISD::SDIV, VT, Expand);
821     setOperationAction(ISD::UDIV, VT, Expand);
822     setOperationAction(ISD::SREM, VT, Expand);
823     setOperationAction(ISD::UREM, VT, Expand);
824     setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
825     setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
826     setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
827     setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
828     setOperationAction(ISD::FMA,  VT, Expand);
829     setOperationAction(ISD::FFLOOR, VT, Expand);
830     setOperationAction(ISD::FCEIL, VT, Expand);
831     setOperationAction(ISD::FTRUNC, VT, Expand);
832     setOperationAction(ISD::FRINT, VT, Expand);
833     setOperationAction(ISD::FNEARBYINT, VT, Expand);
834     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
835     setOperationAction(ISD::MULHS, VT, Expand);
836     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
837     setOperationAction(ISD::MULHU, VT, Expand);
838     setOperationAction(ISD::SDIVREM, VT, Expand);
839     setOperationAction(ISD::UDIVREM, VT, Expand);
840     setOperationAction(ISD::CTPOP, VT, Expand);
841     setOperationAction(ISD::CTTZ, VT, Expand);
842     setOperationAction(ISD::CTLZ, VT, Expand);
843     setOperationAction(ISD::ROTL, VT, Expand);
844     setOperationAction(ISD::ROTR, VT, Expand);
845     setOperationAction(ISD::BSWAP, VT, Expand);
846     setOperationAction(ISD::SETCC, VT, Expand);
847     setOperationAction(ISD::FP_TO_UINT, VT, Expand);
848     setOperationAction(ISD::FP_TO_SINT, VT, Expand);
849     setOperationAction(ISD::UINT_TO_FP, VT, Expand);
850     setOperationAction(ISD::SINT_TO_FP, VT, Expand);
851     setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
852     setOperationAction(ISD::TRUNCATE, VT, Expand);
853     setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
854     setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
855     setOperationAction(ISD::ANY_EXTEND, VT, Expand);
856     setOperationAction(ISD::SELECT_CC, VT, Expand);
857     for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
858       setTruncStoreAction(InnerVT, VT, Expand);
859 
860       setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
861       setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
862 
863       // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
864       // types; we have to deal with them whether we ask for Expansion or not.
865       // Setting Expand causes its own optimisation problems though, so leave
866       // them legal.
867       if (VT.getVectorElementType() == MVT::i1)
868         setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
869 
870       // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
871       // split/scalarized right now.
872       if (VT.getVectorElementType() == MVT::f16)
873         setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
874     }
875   }
876 
877   // FIXME: In order to prevent SSE instructions being expanded to MMX ones
878   // with -msoft-float, disable use of MMX as well.
879   if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
880     addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
881     // No operations on x86mmx are supported; everything uses intrinsics.
882   }
883 
884   if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
885     addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
886                                                     : &X86::VR128RegClass);
887 
888     setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
889     setOperationAction(ISD::FABS,               MVT::v4f32, Custom);
890     setOperationAction(ISD::FCOPYSIGN,          MVT::v4f32, Custom);
891     setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
892     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
893     setOperationAction(ISD::VSELECT,            MVT::v4f32, Custom);
894     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
895     setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
896 
897     setOperationAction(ISD::LOAD,               MVT::v2f32, Custom);
898     setOperationAction(ISD::STORE,              MVT::v2f32, Custom);
899 
900     setOperationAction(ISD::STRICT_FADD,        MVT::v4f32, Legal);
901     setOperationAction(ISD::STRICT_FSUB,        MVT::v4f32, Legal);
902     setOperationAction(ISD::STRICT_FMUL,        MVT::v4f32, Legal);
903     setOperationAction(ISD::STRICT_FDIV,        MVT::v4f32, Legal);
904     setOperationAction(ISD::STRICT_FSQRT,       MVT::v4f32, Legal);
905   }
906 
907   if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
908     addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
909                                                     : &X86::VR128RegClass);
910 
911     // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
912     // registers cannot be used even for integer operations.
913     addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
914                                                     : &X86::VR128RegClass);
915     addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
916                                                     : &X86::VR128RegClass);
917     addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
918                                                     : &X86::VR128RegClass);
919     addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
920                                                     : &X86::VR128RegClass);
921 
922     for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
923                      MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
924       setOperationAction(ISD::SDIV, VT, Custom);
925       setOperationAction(ISD::SREM, VT, Custom);
926       setOperationAction(ISD::UDIV, VT, Custom);
927       setOperationAction(ISD::UREM, VT, Custom);
928     }
929 
930     setOperationAction(ISD::MUL,                MVT::v2i8,  Custom);
931     setOperationAction(ISD::MUL,                MVT::v4i8,  Custom);
932     setOperationAction(ISD::MUL,                MVT::v8i8,  Custom);
933 
934     setOperationAction(ISD::MUL,                MVT::v16i8, Custom);
935     setOperationAction(ISD::MUL,                MVT::v4i32, Custom);
936     setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
937     setOperationAction(ISD::MULHU,              MVT::v4i32, Custom);
938     setOperationAction(ISD::MULHS,              MVT::v4i32, Custom);
939     setOperationAction(ISD::MULHU,              MVT::v16i8, Custom);
940     setOperationAction(ISD::MULHS,              MVT::v16i8, Custom);
941     setOperationAction(ISD::MULHU,              MVT::v8i16, Legal);
942     setOperationAction(ISD::MULHS,              MVT::v8i16, Legal);
943     setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
944 
945     setOperationAction(ISD::SMULO,              MVT::v16i8, Custom);
946     setOperationAction(ISD::UMULO,              MVT::v16i8, Custom);
947 
948     setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
949     setOperationAction(ISD::FABS,               MVT::v2f64, Custom);
950     setOperationAction(ISD::FCOPYSIGN,          MVT::v2f64, Custom);
951 
952     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
953       setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
954       setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
955       setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
956       setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
957     }
958 
959     setOperationAction(ISD::UADDSAT,            MVT::v16i8, Legal);
960     setOperationAction(ISD::SADDSAT,            MVT::v16i8, Legal);
961     setOperationAction(ISD::USUBSAT,            MVT::v16i8, Legal);
962     setOperationAction(ISD::SSUBSAT,            MVT::v16i8, Legal);
963     setOperationAction(ISD::UADDSAT,            MVT::v8i16, Legal);
964     setOperationAction(ISD::SADDSAT,            MVT::v8i16, Legal);
965     setOperationAction(ISD::USUBSAT,            MVT::v8i16, Legal);
966     setOperationAction(ISD::SSUBSAT,            MVT::v8i16, Legal);
967     setOperationAction(ISD::USUBSAT,            MVT::v4i32, Custom);
968     setOperationAction(ISD::USUBSAT,            MVT::v2i64, Custom);
969 
970     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
971     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
972     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
973 
974     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
975       setOperationAction(ISD::SETCC,              VT, Custom);
976       setOperationAction(ISD::STRICT_FSETCC,      VT, Custom);
977       setOperationAction(ISD::STRICT_FSETCCS,     VT, Custom);
978       setOperationAction(ISD::CTPOP,              VT, Custom);
979       setOperationAction(ISD::ABS,                VT, Custom);
980 
981       // The condition codes aren't legal in SSE/AVX; under AVX512 we use
982       // setcc all the way to isel and prefer SETGT in some isel patterns.
983       setCondCodeAction(ISD::SETLT, VT, Custom);
984       setCondCodeAction(ISD::SETLE, VT, Custom);
985     }
986 
987     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
988       setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
989       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
990       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
991       setOperationAction(ISD::VSELECT,            VT, Custom);
992       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
993     }
994 
995     for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
996       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
997       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
998       setOperationAction(ISD::VSELECT,            VT, Custom);
999 
1000       if (VT == MVT::v2i64 && !Subtarget.is64Bit())
1001         continue;
1002 
1003       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
1004       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1005     }
1006 
1007     // Custom lower v2i64 and v2f64 selects.
1008     setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
1009     setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
1010     setOperationAction(ISD::SELECT,             MVT::v4i32, Custom);
1011     setOperationAction(ISD::SELECT,             MVT::v8i16, Custom);
1012     setOperationAction(ISD::SELECT,             MVT::v16i8, Custom);
1013 
1014     setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
1015     setOperationAction(ISD::FP_TO_SINT,         MVT::v2i32, Custom);
1016     setOperationAction(ISD::STRICT_FP_TO_SINT,  MVT::v4i32, Legal);
1017     setOperationAction(ISD::STRICT_FP_TO_SINT,  MVT::v2i32, Custom);
1018 
1019     // Custom legalize these to avoid over promotion or custom promotion.
1020     for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
1021       setOperationAction(ISD::FP_TO_SINT,        VT, Custom);
1022       setOperationAction(ISD::FP_TO_UINT,        VT, Custom);
1023       setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom);
1024       setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom);
1025     }
1026 
1027     setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
1028     setOperationAction(ISD::STRICT_SINT_TO_FP,  MVT::v4i32, Legal);
1029     setOperationAction(ISD::SINT_TO_FP,         MVT::v2i32, Custom);
1030     setOperationAction(ISD::STRICT_SINT_TO_FP,  MVT::v2i32, Custom);
1031 
1032     setOperationAction(ISD::UINT_TO_FP,         MVT::v2i32, Custom);
1033     setOperationAction(ISD::STRICT_UINT_TO_FP,  MVT::v2i32, Custom);
1034 
1035     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Custom);
1036     setOperationAction(ISD::STRICT_UINT_TO_FP,  MVT::v4i32, Custom);
1037 
1038     // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
1039     setOperationAction(ISD::SINT_TO_FP,         MVT::v2f32, Custom);
1040     setOperationAction(ISD::STRICT_SINT_TO_FP,  MVT::v2f32, Custom);
1041     setOperationAction(ISD::UINT_TO_FP,         MVT::v2f32, Custom);
1042     setOperationAction(ISD::STRICT_UINT_TO_FP,  MVT::v2f32, Custom);
1043 
1044     setOperationAction(ISD::FP_EXTEND,          MVT::v2f32, Custom);
1045     setOperationAction(ISD::STRICT_FP_EXTEND,   MVT::v2f32, Custom);
1046     setOperationAction(ISD::FP_ROUND,           MVT::v2f32, Custom);
1047     setOperationAction(ISD::STRICT_FP_ROUND,    MVT::v2f32, Custom);
1048 
1049     // We want to legalize this to an f64 load rather than an i64 load on
1050     // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
1051     // store.
1052     setOperationAction(ISD::LOAD,               MVT::v2i32, Custom);
1053     setOperationAction(ISD::LOAD,               MVT::v4i16, Custom);
1054     setOperationAction(ISD::LOAD,               MVT::v8i8,  Custom);
1055     setOperationAction(ISD::STORE,              MVT::v2i32, Custom);
1056     setOperationAction(ISD::STORE,              MVT::v4i16, Custom);
1057     setOperationAction(ISD::STORE,              MVT::v8i8,  Custom);
1058 
1059     setOperationAction(ISD::BITCAST,            MVT::v2i32, Custom);
1060     setOperationAction(ISD::BITCAST,            MVT::v4i16, Custom);
1061     setOperationAction(ISD::BITCAST,            MVT::v8i8,  Custom);
1062     if (!Subtarget.hasAVX512())
1063       setOperationAction(ISD::BITCAST, MVT::v16i1, Custom);
1064 
1065     setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
1066     setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
1067     setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
1068 
1069     setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
1070 
1071     setOperationAction(ISD::TRUNCATE,    MVT::v2i8,  Custom);
1072     setOperationAction(ISD::TRUNCATE,    MVT::v2i16, Custom);
1073     setOperationAction(ISD::TRUNCATE,    MVT::v2i32, Custom);
1074     setOperationAction(ISD::TRUNCATE,    MVT::v4i8,  Custom);
1075     setOperationAction(ISD::TRUNCATE,    MVT::v4i16, Custom);
1076     setOperationAction(ISD::TRUNCATE,    MVT::v8i8,  Custom);
1077 
1078     // In the customized shift lowering, the legal v4i32/v2i64 cases
1079     // in AVX2 will be recognized.
1080     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1081       setOperationAction(ISD::SRL,              VT, Custom);
1082       setOperationAction(ISD::SHL,              VT, Custom);
1083       setOperationAction(ISD::SRA,              VT, Custom);
1084     }
1085 
1086     setOperationAction(ISD::ROTL,               MVT::v4i32, Custom);
1087     setOperationAction(ISD::ROTL,               MVT::v8i16, Custom);
1088 
1089     // With 512-bit registers or AVX512VL+BW, expanding (and promoting the
1090     // shifts) is better.
1091     if (!Subtarget.useAVX512Regs() &&
1092         !(Subtarget.hasBWI() && Subtarget.hasVLX()))
1093       setOperationAction(ISD::ROTL,             MVT::v16i8, Custom);
1094 
1095     setOperationAction(ISD::STRICT_FSQRT,       MVT::v2f64, Legal);
1096     setOperationAction(ISD::STRICT_FADD,        MVT::v2f64, Legal);
1097     setOperationAction(ISD::STRICT_FSUB,        MVT::v2f64, Legal);
1098     setOperationAction(ISD::STRICT_FMUL,        MVT::v2f64, Legal);
1099     setOperationAction(ISD::STRICT_FDIV,        MVT::v2f64, Legal);
1100   }
1101 
1102   if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
1103     setOperationAction(ISD::ABS,                MVT::v16i8, Legal);
1104     setOperationAction(ISD::ABS,                MVT::v8i16, Legal);
1105     setOperationAction(ISD::ABS,                MVT::v4i32, Legal);
1106     setOperationAction(ISD::BITREVERSE,         MVT::v16i8, Custom);
1107     setOperationAction(ISD::CTLZ,               MVT::v16i8, Custom);
1108     setOperationAction(ISD::CTLZ,               MVT::v8i16, Custom);
1109     setOperationAction(ISD::CTLZ,               MVT::v4i32, Custom);
1110     setOperationAction(ISD::CTLZ,               MVT::v2i64, Custom);
1111 
1112     // These might be better off as horizontal vector ops.
1113     setOperationAction(ISD::ADD,                MVT::i16, Custom);
1114     setOperationAction(ISD::ADD,                MVT::i32, Custom);
1115     setOperationAction(ISD::SUB,                MVT::i16, Custom);
1116     setOperationAction(ISD::SUB,                MVT::i32, Custom);
1117   }
1118 
1119   if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
1120     for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
1121       setOperationAction(ISD::FFLOOR,            RoundedTy,  Legal);
1122       setOperationAction(ISD::STRICT_FFLOOR,     RoundedTy,  Legal);
1123       setOperationAction(ISD::FCEIL,             RoundedTy,  Legal);
1124       setOperationAction(ISD::STRICT_FCEIL,      RoundedTy,  Legal);
1125       setOperationAction(ISD::FTRUNC,            RoundedTy,  Legal);
1126       setOperationAction(ISD::STRICT_FTRUNC,     RoundedTy,  Legal);
1127       setOperationAction(ISD::FRINT,             RoundedTy,  Legal);
1128       setOperationAction(ISD::STRICT_FRINT,      RoundedTy,  Legal);
1129       setOperationAction(ISD::FNEARBYINT,        RoundedTy,  Legal);
1130       setOperationAction(ISD::STRICT_FNEARBYINT, RoundedTy,  Legal);
1131       setOperationAction(ISD::FROUNDEVEN,        RoundedTy,  Legal);
1132       setOperationAction(ISD::STRICT_FROUNDEVEN, RoundedTy,  Legal);
1133 
1134       setOperationAction(ISD::FROUND,            RoundedTy,  Custom);
1135     }
1136 
1137     setOperationAction(ISD::SMAX,               MVT::v16i8, Legal);
1138     setOperationAction(ISD::SMAX,               MVT::v4i32, Legal);
1139     setOperationAction(ISD::UMAX,               MVT::v8i16, Legal);
1140     setOperationAction(ISD::UMAX,               MVT::v4i32, Legal);
1141     setOperationAction(ISD::SMIN,               MVT::v16i8, Legal);
1142     setOperationAction(ISD::SMIN,               MVT::v4i32, Legal);
1143     setOperationAction(ISD::UMIN,               MVT::v8i16, Legal);
1144     setOperationAction(ISD::UMIN,               MVT::v4i32, Legal);
1145 
1146     setOperationAction(ISD::UADDSAT,            MVT::v4i32, Custom);
1147 
1148     // FIXME: Do we need to handle scalar-to-vector here?
1149     setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
1150 
1151     // We directly match byte blends in the backend as they match the VSELECT
1152     // condition form.
1153     setOperationAction(ISD::VSELECT,            MVT::v16i8, Legal);
1154 
1155     // SSE41 brings specific instructions for doing vector sign extend even in
1156     // cases where we don't have SRA.
1157     for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1158       setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
1159       setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
1160     }
1161 
1162     // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
1163     for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1164       setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8,  Legal);
1165       setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8,  Legal);
1166       setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8,  Legal);
1167       setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
1168       setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
1169       setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
1170     }
1171 
1172     // i8 vectors are custom because the source register and source
1173     // memory operand types are not the same width.
1174     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
1175 
1176     if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
1177       // We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can
1178       // do the pre and post work in the vector domain.
1179       setOperationAction(ISD::UINT_TO_FP,        MVT::v4i64, Custom);
1180       setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i64, Custom);
1181       // We need to mark SINT_TO_FP as Custom even though we want to expand it
1182       // so that DAG combine doesn't try to turn it into uint_to_fp.
1183       setOperationAction(ISD::SINT_TO_FP,        MVT::v4i64, Custom);
1184       setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i64, Custom);
1185     }
1186   }
1187 
1188   if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
1189     setOperationAction(ISD::UADDSAT,            MVT::v2i64, Custom);
1190   }
1191 
1192   if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
1193     for (auto VT : { MVT::v16i8, MVT::v8i16,  MVT::v4i32, MVT::v2i64,
1194                      MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1195       setOperationAction(ISD::ROTL, VT, Custom);
1196 
1197     // XOP can efficiently perform BITREVERSE with VPPERM.
1198     for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
1199       setOperationAction(ISD::BITREVERSE, VT, Custom);
1200 
1201     for (auto VT : { MVT::v16i8, MVT::v8i16,  MVT::v4i32, MVT::v2i64,
1202                      MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1203       setOperationAction(ISD::BITREVERSE, VT, Custom);
1204   }
1205 
1206   if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
1207     bool HasInt256 = Subtarget.hasInt256();
1208 
1209     addRegisterClass(MVT::v32i8,  Subtarget.hasVLX() ? &X86::VR256XRegClass
1210                                                      : &X86::VR256RegClass);
1211     addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1212                                                      : &X86::VR256RegClass);
1213     addRegisterClass(MVT::v8i32,  Subtarget.hasVLX() ? &X86::VR256XRegClass
1214                                                      : &X86::VR256RegClass);
1215     addRegisterClass(MVT::v8f32,  Subtarget.hasVLX() ? &X86::VR256XRegClass
1216                                                      : &X86::VR256RegClass);
1217     addRegisterClass(MVT::v4i64,  Subtarget.hasVLX() ? &X86::VR256XRegClass
1218                                                      : &X86::VR256RegClass);
1219     addRegisterClass(MVT::v4f64,  Subtarget.hasVLX() ? &X86::VR256XRegClass
1220                                                      : &X86::VR256RegClass);
1221 
1222     for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
1223       setOperationAction(ISD::FFLOOR,            VT, Legal);
1224       setOperationAction(ISD::STRICT_FFLOOR,     VT, Legal);
1225       setOperationAction(ISD::FCEIL,             VT, Legal);
1226       setOperationAction(ISD::STRICT_FCEIL,      VT, Legal);
1227       setOperationAction(ISD::FTRUNC,            VT, Legal);
1228       setOperationAction(ISD::STRICT_FTRUNC,     VT, Legal);
1229       setOperationAction(ISD::FRINT,             VT, Legal);
1230       setOperationAction(ISD::STRICT_FRINT,      VT, Legal);
1231       setOperationAction(ISD::FNEARBYINT,        VT, Legal);
1232       setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
1233       setOperationAction(ISD::FROUNDEVEN,        VT, Legal);
1234       setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);
1235 
1236       setOperationAction(ISD::FROUND,            VT, Custom);
1237 
1238       setOperationAction(ISD::FNEG,              VT, Custom);
1239       setOperationAction(ISD::FABS,              VT, Custom);
1240       setOperationAction(ISD::FCOPYSIGN,         VT, Custom);
1241     }
1242 
1243     // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1244     // even though v8i16 is a legal type.
1245     setOperationPromotedToType(ISD::FP_TO_SINT,        MVT::v8i16, MVT::v8i32);
1246     setOperationPromotedToType(ISD::FP_TO_UINT,        MVT::v8i16, MVT::v8i32);
1247     setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1248     setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1249     setOperationAction(ISD::FP_TO_SINT,                MVT::v8i32, Legal);
1250     setOperationAction(ISD::STRICT_FP_TO_SINT,         MVT::v8i32, Legal);
1251 
1252     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i32, Legal);
1253     setOperationAction(ISD::STRICT_SINT_TO_FP,  MVT::v8i32, Legal);
1254 
1255     setOperationAction(ISD::STRICT_FP_ROUND,    MVT::v4f32, Legal);
1256     setOperationAction(ISD::STRICT_FADD,        MVT::v8f32, Legal);
1257     setOperationAction(ISD::STRICT_FADD,        MVT::v4f64, Legal);
1258     setOperationAction(ISD::STRICT_FSUB,        MVT::v8f32, Legal);
1259     setOperationAction(ISD::STRICT_FSUB,        MVT::v4f64, Legal);
1260     setOperationAction(ISD::STRICT_FMUL,        MVT::v8f32, Legal);
1261     setOperationAction(ISD::STRICT_FMUL,        MVT::v4f64, Legal);
1262     setOperationAction(ISD::STRICT_FDIV,        MVT::v8f32, Legal);
1263     setOperationAction(ISD::STRICT_FDIV,        MVT::v4f64, Legal);
1264     setOperationAction(ISD::STRICT_FP_EXTEND,   MVT::v4f64, Legal);
1265     setOperationAction(ISD::STRICT_FSQRT,       MVT::v8f32, Legal);
1266     setOperationAction(ISD::STRICT_FSQRT,       MVT::v4f64, Legal);
1267 
1268     if (!Subtarget.hasAVX512())
1269       setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);
1270 
1271     // In the customized shift lowering, the legal v8i32/v4i64 cases
1272     // in AVX2 will be recognized.
1273     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1274       setOperationAction(ISD::SRL, VT, Custom);
1275       setOperationAction(ISD::SHL, VT, Custom);
1276       setOperationAction(ISD::SRA, VT, Custom);
1277     }
1278 
1279     // These types need custom splitting if their input is a 128-bit vector.
1280     setOperationAction(ISD::SIGN_EXTEND,       MVT::v8i64,  Custom);
1281     setOperationAction(ISD::SIGN_EXTEND,       MVT::v16i32, Custom);
1282     setOperationAction(ISD::ZERO_EXTEND,       MVT::v8i64,  Custom);
1283     setOperationAction(ISD::ZERO_EXTEND,       MVT::v16i32, Custom);
1284 
1285     setOperationAction(ISD::ROTL,              MVT::v8i32,  Custom);
1286     setOperationAction(ISD::ROTL,              MVT::v16i16, Custom);
1287 
1288     // With BWI, expanding (and promoting the shifts) is better.
1289     if (!Subtarget.useBWIRegs())
1290       setOperationAction(ISD::ROTL,            MVT::v32i8,  Custom);
1291 
1292     setOperationAction(ISD::SELECT,            MVT::v4f64, Custom);
1293     setOperationAction(ISD::SELECT,            MVT::v4i64, Custom);
1294     setOperationAction(ISD::SELECT,            MVT::v8i32, Custom);
1295     setOperationAction(ISD::SELECT,            MVT::v16i16, Custom);
1296     setOperationAction(ISD::SELECT,            MVT::v32i8, Custom);
1297     setOperationAction(ISD::SELECT,            MVT::v8f32, Custom);
1298 
1299     for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1300       setOperationAction(ISD::SIGN_EXTEND,     VT, Custom);
1301       setOperationAction(ISD::ZERO_EXTEND,     VT, Custom);
1302       setOperationAction(ISD::ANY_EXTEND,      VT, Custom);
1303     }
1304 
1305     setOperationAction(ISD::TRUNCATE,          MVT::v16i8, Custom);
1306     setOperationAction(ISD::TRUNCATE,          MVT::v8i16, Custom);
1307     setOperationAction(ISD::TRUNCATE,          MVT::v4i32, Custom);
1308     setOperationAction(ISD::BITREVERSE,        MVT::v32i8, Custom);
1309 
1310     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1311       setOperationAction(ISD::SETCC,           VT, Custom);
1312       setOperationAction(ISD::STRICT_FSETCC,   VT, Custom);
1313       setOperationAction(ISD::STRICT_FSETCCS,  VT, Custom);
1314       setOperationAction(ISD::CTPOP,           VT, Custom);
1315       setOperationAction(ISD::CTLZ,            VT, Custom);
1316 
1317       // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1318       // setcc all the way to isel and prefer SETGT in some isel patterns.
1319       setCondCodeAction(ISD::SETLT, VT, Custom);
1320       setCondCodeAction(ISD::SETLE, VT, Custom);
1321     }
1322 
1323     if (Subtarget.hasAnyFMA()) {
1324       for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1325                        MVT::v2f64, MVT::v4f64 }) {
1326         setOperationAction(ISD::FMA, VT, Legal);
1327         setOperationAction(ISD::STRICT_FMA, VT, Legal);
1328       }
1329     }
1330 
1331     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1332       setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1333       setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1334     }
1335 
1336     setOperationAction(ISD::MUL,       MVT::v4i64,  Custom);
1337     setOperationAction(ISD::MUL,       MVT::v8i32,  HasInt256 ? Legal : Custom);
1338     setOperationAction(ISD::MUL,       MVT::v16i16, HasInt256 ? Legal : Custom);
1339     setOperationAction(ISD::MUL,       MVT::v32i8,  Custom);
1340 
1341     setOperationAction(ISD::MULHU,     MVT::v8i32,  Custom);
1342     setOperationAction(ISD::MULHS,     MVT::v8i32,  Custom);
1343     setOperationAction(ISD::MULHU,     MVT::v16i16, HasInt256 ? Legal : Custom);
1344     setOperationAction(ISD::MULHS,     MVT::v16i16, HasInt256 ? Legal : Custom);
1345     setOperationAction(ISD::MULHU,     MVT::v32i8,  Custom);
1346     setOperationAction(ISD::MULHS,     MVT::v32i8,  Custom);
1347 
1348     setOperationAction(ISD::SMULO,     MVT::v32i8, Custom);
1349     setOperationAction(ISD::UMULO,     MVT::v32i8, Custom);
1350 
1351     setOperationAction(ISD::ABS,       MVT::v4i64,  Custom);
1352     setOperationAction(ISD::SMAX,      MVT::v4i64,  Custom);
1353     setOperationAction(ISD::UMAX,      MVT::v4i64,  Custom);
1354     setOperationAction(ISD::SMIN,      MVT::v4i64,  Custom);
1355     setOperationAction(ISD::UMIN,      MVT::v4i64,  Custom);
1356 
1357     setOperationAction(ISD::UADDSAT,   MVT::v32i8,  HasInt256 ? Legal : Custom);
1358     setOperationAction(ISD::SADDSAT,   MVT::v32i8,  HasInt256 ? Legal : Custom);
1359     setOperationAction(ISD::USUBSAT,   MVT::v32i8,  HasInt256 ? Legal : Custom);
1360     setOperationAction(ISD::SSUBSAT,   MVT::v32i8,  HasInt256 ? Legal : Custom);
1361     setOperationAction(ISD::UADDSAT,   MVT::v16i16, HasInt256 ? Legal : Custom);
1362     setOperationAction(ISD::SADDSAT,   MVT::v16i16, HasInt256 ? Legal : Custom);
1363     setOperationAction(ISD::USUBSAT,   MVT::v16i16, HasInt256 ? Legal : Custom);
1364     setOperationAction(ISD::SSUBSAT,   MVT::v16i16, HasInt256 ? Legal : Custom);
1365     setOperationAction(ISD::UADDSAT,   MVT::v8i32, Custom);
1366     setOperationAction(ISD::USUBSAT,   MVT::v8i32, Custom);
1367     setOperationAction(ISD::UADDSAT,   MVT::v4i64, Custom);
1368     setOperationAction(ISD::USUBSAT,   MVT::v4i64, Custom);
1369 
1370     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1371       setOperationAction(ISD::ABS,  VT, HasInt256 ? Legal : Custom);
1372       setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1373       setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1374       setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1375       setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1376     }
1377 
1378     for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
1379       setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
1380       setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
1381     }
1382 
1383     if (HasInt256) {
1384       // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1385       // when we have a 256-bit-wide blend with immediate.
1386       setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
1387       setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32, Custom);
1388 
1389       // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1390       for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1391         setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1392         setLoadExtAction(LoadExtOp, MVT::v8i32,  MVT::v8i8,  Legal);
1393         setLoadExtAction(LoadExtOp, MVT::v4i64,  MVT::v4i8,  Legal);
1394         setLoadExtAction(LoadExtOp, MVT::v8i32,  MVT::v8i16, Legal);
1395         setLoadExtAction(LoadExtOp, MVT::v4i64,  MVT::v4i16, Legal);
1396         setLoadExtAction(LoadExtOp, MVT::v4i64,  MVT::v4i32, Legal);
1397       }
1398     }
1399 
1400     for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1401                      MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1402       setOperationAction(ISD::MLOAD,  VT, Subtarget.hasVLX() ? Legal : Custom);
1403       setOperationAction(ISD::MSTORE, VT, Legal);
1404     }
1405 
1406     // Extract subvector is special because the value type
1407     // (result) is 128-bit but the source is 256-bit wide.
1408     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1409                      MVT::v4f32, MVT::v2f64 }) {
1410       setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1411     }
1412 
1413     // Custom lower several nodes for 256-bit types.
1414     for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1415                     MVT::v8f32, MVT::v4f64 }) {
1416       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
1417       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
1418       setOperationAction(ISD::VSELECT,            VT, Custom);
1419       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
1420       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1421       setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
1422       setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Legal);
1423       setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
1424       setOperationAction(ISD::STORE,              VT, Custom);
1425     }
1426 
1427     if (HasInt256) {
1428       setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
1429 
1430       // Custom legalize 2x32 to get a little better code.
1431       setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
1432       setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);
1433 
1434       for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1435                        MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1436         setOperationAction(ISD::MGATHER,  VT, Custom);
1437     }
1438   }
1439 
1440   // This block controls legalization of the mask vector sizes that are
1441   // available with AVX512. 512-bit vectors are in a separate block controlled
1442   // by useAVX512Regs.
1443   if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1444     addRegisterClass(MVT::v1i1,   &X86::VK1RegClass);
1445     addRegisterClass(MVT::v2i1,   &X86::VK2RegClass);
1446     addRegisterClass(MVT::v4i1,   &X86::VK4RegClass);
1447     addRegisterClass(MVT::v8i1,   &X86::VK8RegClass);
1448     addRegisterClass(MVT::v16i1,  &X86::VK16RegClass);
1449 
1450     setOperationAction(ISD::SELECT,             MVT::v1i1, Custom);
1451     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
1452     setOperationAction(ISD::BUILD_VECTOR,       MVT::v1i1, Custom);
1453 
1454     setOperationPromotedToType(ISD::FP_TO_SINT,        MVT::v8i1,  MVT::v8i32);
1455     setOperationPromotedToType(ISD::FP_TO_UINT,        MVT::v8i1,  MVT::v8i32);
1456     setOperationPromotedToType(ISD::FP_TO_SINT,        MVT::v4i1,  MVT::v4i32);
1457     setOperationPromotedToType(ISD::FP_TO_UINT,        MVT::v4i1,  MVT::v4i32);
1458     setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1,  MVT::v8i32);
1459     setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1,  MVT::v8i32);
1460     setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1,  MVT::v4i32);
1461     setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1,  MVT::v4i32);
1462     setOperationAction(ISD::FP_TO_SINT,                MVT::v2i1,  Custom);
1463     setOperationAction(ISD::FP_TO_UINT,                MVT::v2i1,  Custom);
1464     setOperationAction(ISD::STRICT_FP_TO_SINT,         MVT::v2i1,  Custom);
1465     setOperationAction(ISD::STRICT_FP_TO_UINT,         MVT::v2i1,  Custom);
1466 
1467     // There is no byte sized k-register load or store without AVX512DQ.
1468     if (!Subtarget.hasDQI()) {
1469       setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
1470       setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
1471       setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
1472       setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
1473 
1474       setOperationAction(ISD::STORE, MVT::v1i1, Custom);
1475       setOperationAction(ISD::STORE, MVT::v2i1, Custom);
1476       setOperationAction(ISD::STORE, MVT::v4i1, Custom);
1477       setOperationAction(ISD::STORE, MVT::v8i1, Custom);
1478     }
1479 
1480     // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
1481     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1482       setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
1483       setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1484       setOperationAction(ISD::ANY_EXTEND,  VT, Custom);
1485     }
1486 
1487     for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 })
1488       setOperationAction(ISD::VSELECT,          VT, Expand);
1489 
1490     for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1491       setOperationAction(ISD::SETCC,            VT, Custom);
1492       setOperationAction(ISD::STRICT_FSETCC,    VT, Custom);
1493       setOperationAction(ISD::STRICT_FSETCCS,   VT, Custom);
1494       setOperationAction(ISD::SELECT,           VT, Custom);
1495       setOperationAction(ISD::TRUNCATE,         VT, Custom);
1496 
1497       setOperationAction(ISD::BUILD_VECTOR,     VT, Custom);
1498       setOperationAction(ISD::CONCAT_VECTORS,   VT, Custom);
1499       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1500       setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1501       setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1502       setOperationAction(ISD::VECTOR_SHUFFLE,   VT,  Custom);
1503     }
1504 
1505     for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1506       setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1507   }
1508 
1509   // This block controls legalization for 512-bit operations with 32/64 bit
1510   // elements. 512-bits can be disabled based on prefer-vector-width and
1511   // required-vector-width function attributes.
1512   if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
1513     bool HasBWI = Subtarget.hasBWI();
1514 
1515     addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1516     addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1517     addRegisterClass(MVT::v8i64,  &X86::VR512RegClass);
1518     addRegisterClass(MVT::v8f64,  &X86::VR512RegClass);
1519     addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1520     addRegisterClass(MVT::v64i8,  &X86::VR512RegClass);
1521 
1522     for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1523       setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8,  Legal);
1524       setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1525       setLoadExtAction(ExtType, MVT::v8i64,  MVT::v8i8,   Legal);
1526       setLoadExtAction(ExtType, MVT::v8i64,  MVT::v8i16,  Legal);
1527       setLoadExtAction(ExtType, MVT::v8i64,  MVT::v8i32,  Legal);
1528       if (HasBWI)
1529         setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1530     }
1531 
1532     for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1533       setOperationAction(ISD::FNEG,  VT, Custom);
1534       setOperationAction(ISD::FABS,  VT, Custom);
1535       setOperationAction(ISD::FMA,   VT, Legal);
1536       setOperationAction(ISD::STRICT_FMA, VT, Legal);
1537       setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1538     }
1539 
1540     for (MVT VT : { MVT::v16i1, MVT::v16i8, MVT::v16i16 }) {
1541       setOperationPromotedToType(ISD::FP_TO_SINT       , VT, MVT::v16i32);
1542       setOperationPromotedToType(ISD::FP_TO_UINT       , VT, MVT::v16i32);
1543       setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, VT, MVT::v16i32);
1544       setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, VT, MVT::v16i32);
1545     }
1546     setOperationAction(ISD::FP_TO_SINT,        MVT::v16i32, Legal);
1547     setOperationAction(ISD::FP_TO_UINT,        MVT::v16i32, Legal);
1548     setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v16i32, Legal);
1549     setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v16i32, Legal);
1550     setOperationAction(ISD::SINT_TO_FP,        MVT::v16i32, Legal);
1551     setOperationAction(ISD::UINT_TO_FP,        MVT::v16i32, Legal);
1552     setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i32, Legal);
1553     setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i32, Legal);
1554 
1555     setOperationAction(ISD::STRICT_FADD,      MVT::v16f32, Legal);
1556     setOperationAction(ISD::STRICT_FADD,      MVT::v8f64,  Legal);
1557     setOperationAction(ISD::STRICT_FSUB,      MVT::v16f32, Legal);
1558     setOperationAction(ISD::STRICT_FSUB,      MVT::v8f64,  Legal);
1559     setOperationAction(ISD::STRICT_FMUL,      MVT::v16f32, Legal);
1560     setOperationAction(ISD::STRICT_FMUL,      MVT::v8f64,  Legal);
1561     setOperationAction(ISD::STRICT_FDIV,      MVT::v16f32, Legal);
1562     setOperationAction(ISD::STRICT_FDIV,      MVT::v8f64,  Legal);
1563     setOperationAction(ISD::STRICT_FSQRT,     MVT::v16f32, Legal);
1564     setOperationAction(ISD::STRICT_FSQRT,     MVT::v8f64,  Legal);
1565     setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64,  Legal);
1566     setOperationAction(ISD::STRICT_FP_ROUND,  MVT::v8f32,  Legal);
1567 
1568     setTruncStoreAction(MVT::v8i64,   MVT::v8i8,   Legal);
1569     setTruncStoreAction(MVT::v8i64,   MVT::v8i16,  Legal);
1570     setTruncStoreAction(MVT::v8i64,   MVT::v8i32,  Legal);
1571     setTruncStoreAction(MVT::v16i32,  MVT::v16i8,  Legal);
1572     setTruncStoreAction(MVT::v16i32,  MVT::v16i16, Legal);
1573     if (HasBWI)
1574       setTruncStoreAction(MVT::v32i16,  MVT::v32i8, Legal);
1575 
1576     // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1577     // to 512-bit rather than use the AVX2 instructions so that we can use
1578     // k-masks.
1579     if (!Subtarget.hasVLX()) {
1580       for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1581            MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1582         setOperationAction(ISD::MLOAD,  VT, Custom);
1583         setOperationAction(ISD::MSTORE, VT, Custom);
1584       }
1585     }
1586 
1587     setOperationAction(ISD::TRUNCATE,    MVT::v8i32,  Legal);
1588     setOperationAction(ISD::TRUNCATE,    MVT::v16i16, Legal);
1589     setOperationAction(ISD::TRUNCATE,    MVT::v32i8,  HasBWI ? Legal : Custom);
1590     setOperationAction(ISD::TRUNCATE,    MVT::v16i64, Custom);
1591     setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
1592     setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1593     setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64,  Custom);
1594     setOperationAction(ISD::ANY_EXTEND,  MVT::v32i16, Custom);
1595     setOperationAction(ISD::ANY_EXTEND,  MVT::v16i32, Custom);
1596     setOperationAction(ISD::ANY_EXTEND,  MVT::v8i64,  Custom);
1597     setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
1598     setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1599     setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64,  Custom);
1600 
1601     if (HasBWI) {
1602       // Extends from v64i1 masks to 512-bit vectors.
1603       setOperationAction(ISD::SIGN_EXTEND,        MVT::v64i8, Custom);
1604       setOperationAction(ISD::ZERO_EXTEND,        MVT::v64i8, Custom);
1605       setOperationAction(ISD::ANY_EXTEND,         MVT::v64i8, Custom);
1606     }
1607 
1608     for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1609       setOperationAction(ISD::FFLOOR,            VT, Legal);
1610       setOperationAction(ISD::STRICT_FFLOOR,     VT, Legal);
1611       setOperationAction(ISD::FCEIL,             VT, Legal);
1612       setOperationAction(ISD::STRICT_FCEIL,      VT, Legal);
1613       setOperationAction(ISD::FTRUNC,            VT, Legal);
1614       setOperationAction(ISD::STRICT_FTRUNC,     VT, Legal);
1615       setOperationAction(ISD::FRINT,             VT, Legal);
1616       setOperationAction(ISD::STRICT_FRINT,      VT, Legal);
1617       setOperationAction(ISD::FNEARBYINT,        VT, Legal);
1618       setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
1619       setOperationAction(ISD::FROUNDEVEN,        VT, Legal);
1620       setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);
1621 
1622       setOperationAction(ISD::FROUND,            VT, Custom);
1623     }
1624 
1625     for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
1626       setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
1627       setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
1628     }
1629 
1630     setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
1631     setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
1632     setOperationAction(ISD::ADD, MVT::v64i8,  HasBWI ? Legal : Custom);
1633     setOperationAction(ISD::SUB, MVT::v64i8,  HasBWI ? Legal : Custom);
1634 
1635     setOperationAction(ISD::MUL, MVT::v8i64,  Custom);
1636     setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1637     setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
1638     setOperationAction(ISD::MUL, MVT::v64i8,  Custom);
1639 
1640     setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
1641     setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
1642     setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
1643     setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
1644     setOperationAction(ISD::MULHS, MVT::v64i8,  Custom);
1645     setOperationAction(ISD::MULHU, MVT::v64i8,  Custom);
1646 
1647     setOperationAction(ISD::SMULO, MVT::v64i8, Custom);
1648     setOperationAction(ISD::UMULO, MVT::v64i8, Custom);
1649 
1650     setOperationAction(ISD::BITREVERSE, MVT::v64i8,  Custom);
1651 
1652     for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1653       setOperationAction(ISD::SRL,              VT, Custom);
1654       setOperationAction(ISD::SHL,              VT, Custom);
1655       setOperationAction(ISD::SRA,              VT, Custom);
1656       setOperationAction(ISD::SETCC,            VT, Custom);
1657 
1658       // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1659       // setcc all the way to isel and prefer SETGT in some isel patterns.
1660       setCondCodeAction(ISD::SETLT, VT, Custom);
1661       setCondCodeAction(ISD::SETLE, VT, Custom);
1662     }
1663     for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1664       setOperationAction(ISD::SMAX,             VT, Legal);
1665       setOperationAction(ISD::UMAX,             VT, Legal);
1666       setOperationAction(ISD::SMIN,             VT, Legal);
1667       setOperationAction(ISD::UMIN,             VT, Legal);
1668       setOperationAction(ISD::ABS,              VT, Legal);
1669       setOperationAction(ISD::CTPOP,            VT, Custom);
1670       setOperationAction(ISD::ROTL,             VT, Custom);
1671       setOperationAction(ISD::ROTR,             VT, Custom);
1672       setOperationAction(ISD::STRICT_FSETCC,    VT, Custom);
1673       setOperationAction(ISD::STRICT_FSETCCS,   VT, Custom);
1674     }
1675 
1676     for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1677       setOperationAction(ISD::ABS,     VT, HasBWI ? Legal : Custom);
1678       setOperationAction(ISD::CTPOP,   VT, Subtarget.hasBITALG() ? Legal : Custom);
1679       setOperationAction(ISD::CTLZ,    VT, Custom);
1680       setOperationAction(ISD::SMAX,    VT, HasBWI ? Legal : Custom);
1681       setOperationAction(ISD::UMAX,    VT, HasBWI ? Legal : Custom);
1682       setOperationAction(ISD::SMIN,    VT, HasBWI ? Legal : Custom);
1683       setOperationAction(ISD::UMIN,    VT, HasBWI ? Legal : Custom);
1684       setOperationAction(ISD::UADDSAT, VT, HasBWI ? Legal : Custom);
1685       setOperationAction(ISD::SADDSAT, VT, HasBWI ? Legal : Custom);
1686       setOperationAction(ISD::USUBSAT, VT, HasBWI ? Legal : Custom);
1687       setOperationAction(ISD::SSUBSAT, VT, HasBWI ? Legal : Custom);
1688     }
1689 
1690     if (Subtarget.hasDQI()) {
1691       setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);
1692       setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
1693       setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i64, Legal);
1694       setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i64, Legal);
1695       setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
1696       setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);
1697       setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i64, Legal);
1698       setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i64, Legal);
1699 
1700       setOperationAction(ISD::MUL,        MVT::v8i64, Legal);
1701     }
1702 
1703     if (Subtarget.hasCDI()) {
1704       // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
1705       for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
1706         setOperationAction(ISD::CTLZ,            VT, Legal);
1707       }
1708     } // Subtarget.hasCDI()
1709 
1710     if (Subtarget.hasVPOPCNTDQ()) {
1711       for (auto VT : { MVT::v16i32, MVT::v8i64 })
1712         setOperationAction(ISD::CTPOP, VT, Legal);
1713     }
1714 
1715     // Extract subvector is special because the value type
1716     // (result) is 256-bit but the source is 512-bit wide.
1717     // 128-bit was made Legal under AVX1.
1718     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1719                      MVT::v8f32, MVT::v4f64 })
1720       setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1721 
1722     for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
1723                      MVT::v16f32, MVT::v8f64 }) {
1724       setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
1725       setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Legal);
1726       setOperationAction(ISD::SELECT,             VT, Custom);
1727       setOperationAction(ISD::VSELECT,            VT, Custom);
1728       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
1729       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1730       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
1731       setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
1732       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
1733     }
1734 
1735     for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
1736       setOperationAction(ISD::MLOAD,               VT, Legal);
1737       setOperationAction(ISD::MSTORE,              VT, Legal);
1738       setOperationAction(ISD::MGATHER,             VT, Custom);
1739       setOperationAction(ISD::MSCATTER,            VT, Custom);
1740     }
1741     if (HasBWI) {
1742       for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1743         setOperationAction(ISD::MLOAD,        VT, Legal);
1744         setOperationAction(ISD::MSTORE,       VT, Legal);
1745       }
1746     } else {
1747       setOperationAction(ISD::STORE, MVT::v32i16, Custom);
1748       setOperationAction(ISD::STORE, MVT::v64i8,  Custom);
1749     }
1750 
1751     if (Subtarget.hasVBMI2()) {
1752       for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64,
1753                        MVT::v16i16, MVT::v8i32, MVT::v4i64,
1754                        MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1755         setOperationAction(ISD::FSHL, VT, Custom);
1756         setOperationAction(ISD::FSHR, VT, Custom);
1757       }
1758 
1759       setOperationAction(ISD::ROTL, MVT::v32i16, Custom);
1760       setOperationAction(ISD::ROTR, MVT::v8i16,  Custom);
1761       setOperationAction(ISD::ROTR, MVT::v16i16, Custom);
1762       setOperationAction(ISD::ROTR, MVT::v32i16, Custom);
1763     }
1764   } // useAVX512Regs
1765 
1766   // This block controls legalization for operations that don't have
1767   // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
1768   // narrower widths.
1769   if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1770     // These operations are handled on non-VLX by artificially widening in
1771     // isel patterns.
1772 
1773     setOperationAction(ISD::FP_TO_UINT, MVT::v8i32,
1774                        Subtarget.hasVLX() ? Legal : Custom);
1775     setOperationAction(ISD::FP_TO_UINT, MVT::v4i32,
1776                        Subtarget.hasVLX() ? Legal : Custom);
1777     setOperationAction(ISD::FP_TO_UINT,         MVT::v2i32, Custom);
1778     setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32,
1779                        Subtarget.hasVLX() ? Legal : Custom);
1780     setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32,
1781                        Subtarget.hasVLX() ? Legal : Custom);
1782     setOperationAction(ISD::STRICT_FP_TO_UINT,  MVT::v2i32, Custom);
1783     setOperationAction(ISD::UINT_TO_FP, MVT::v8i32,
1784                        Subtarget.hasVLX() ? Legal : Custom);
1785     setOperationAction(ISD::UINT_TO_FP, MVT::v4i32,
1786                        Subtarget.hasVLX() ? Legal : Custom);
1787     setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32,
1788                        Subtarget.hasVLX() ? Legal : Custom);
1789     setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32,
1790                        Subtarget.hasVLX() ? Legal : Custom);
1791 
1792     if (Subtarget.hasDQI()) {
1793       // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
1794       // v2f32 UINT_TO_FP is already custom under SSE2.
1795       assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) &&
1796              isOperationCustom(ISD::STRICT_UINT_TO_FP, MVT::v2f32) &&
1797              "Unexpected operation action!");
1798       // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
1799       setOperationAction(ISD::FP_TO_SINT,        MVT::v2f32, Custom);
1800       setOperationAction(ISD::FP_TO_UINT,        MVT::v2f32, Custom);
1801       setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f32, Custom);
1802       setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f32, Custom);
1803     }
1804 
1805     for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1806       setOperationAction(ISD::SMAX, VT, Legal);
1807       setOperationAction(ISD::UMAX, VT, Legal);
1808       setOperationAction(ISD::SMIN, VT, Legal);
1809       setOperationAction(ISD::UMIN, VT, Legal);
1810       setOperationAction(ISD::ABS,  VT, Legal);
1811     }
1812 
1813     for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
1814       setOperationAction(ISD::ROTL,     VT, Custom);
1815       setOperationAction(ISD::ROTR,     VT, Custom);
1816     }
1817 
1818     // Custom legalize 2x32 to get a little better code.
1819     setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom);
1820     setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom);
1821 
1822     for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1823                      MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1824       setOperationAction(ISD::MSCATTER, VT, Custom);
1825 
1826     if (Subtarget.hasDQI()) {
1827       for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1828         setOperationAction(ISD::SINT_TO_FP, VT,
1829                            Subtarget.hasVLX() ? Legal : Custom);
1830         setOperationAction(ISD::UINT_TO_FP, VT,
1831                            Subtarget.hasVLX() ? Legal : Custom);
1832         setOperationAction(ISD::STRICT_SINT_TO_FP, VT,
1833                            Subtarget.hasVLX() ? Legal : Custom);
1834         setOperationAction(ISD::STRICT_UINT_TO_FP, VT,
1835                            Subtarget.hasVLX() ? Legal : Custom);
1836         setOperationAction(ISD::FP_TO_SINT, VT,
1837                            Subtarget.hasVLX() ? Legal : Custom);
1838         setOperationAction(ISD::FP_TO_UINT, VT,
1839                            Subtarget.hasVLX() ? Legal : Custom);
1840         setOperationAction(ISD::STRICT_FP_TO_SINT, VT,
1841                            Subtarget.hasVLX() ? Legal : Custom);
1842         setOperationAction(ISD::STRICT_FP_TO_UINT, VT,
1843                            Subtarget.hasVLX() ? Legal : Custom);
1844         setOperationAction(ISD::MUL,               VT, Legal);
1845       }
1846     }
1847 
1848     if (Subtarget.hasCDI()) {
1849       for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
1850         setOperationAction(ISD::CTLZ,            VT, Legal);
1851       }
1852     } // Subtarget.hasCDI()
1853 
1854     if (Subtarget.hasVPOPCNTDQ()) {
1855       for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
1856         setOperationAction(ISD::CTPOP, VT, Legal);
1857     }
1858   }
1859 
1860   // This block controls legalization of v32i1/v64i1, which are available with
1861   // AVX512BW. 512-bit v32i16 and v64i8 vector legalization is controlled with
1862   // useBWIRegs.
1863   if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
1864     addRegisterClass(MVT::v32i1,  &X86::VK32RegClass);
1865     addRegisterClass(MVT::v64i1,  &X86::VK64RegClass);
1866 
1867     for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
1868       setOperationAction(ISD::VSELECT,            VT, Expand);
1869       setOperationAction(ISD::TRUNCATE,           VT, Custom);
1870       setOperationAction(ISD::SETCC,              VT, Custom);
1871       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1872       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
1873       setOperationAction(ISD::SELECT,             VT, Custom);
1874       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
1875       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
1876       setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
1877       setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Custom);
1878     }
1879 
1880     for (auto VT : { MVT::v16i1, MVT::v32i1 })
1881       setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1882 
1883     // Extends from v32i1 masks to 256-bit vectors.
1884     setOperationAction(ISD::SIGN_EXTEND,        MVT::v32i8, Custom);
1885     setOperationAction(ISD::ZERO_EXTEND,        MVT::v32i8, Custom);
1886     setOperationAction(ISD::ANY_EXTEND,         MVT::v32i8, Custom);
1887 
1888     for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
1889       setOperationAction(ISD::MLOAD,  VT, Subtarget.hasVLX() ? Legal : Custom);
1890       setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
1891     }
1892 
1893     // These operations are handled on non-VLX by artificially widening in
1894     // isel patterns.
1895     // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
1896 
1897     if (Subtarget.hasBITALG()) {
1898       for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
1899         setOperationAction(ISD::CTPOP, VT, Legal);
1900     }
1901   }
1902 
1903   if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
1904     setTruncStoreAction(MVT::v4i64, MVT::v4i8,  Legal);
1905     setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
1906     setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
1907     setTruncStoreAction(MVT::v8i32, MVT::v8i8,  Legal);
1908     setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
1909 
1910     setTruncStoreAction(MVT::v2i64, MVT::v2i8,  Legal);
1911     setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
1912     setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
1913     setTruncStoreAction(MVT::v4i32, MVT::v4i8,  Legal);
1914     setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
1915 
1916     if (Subtarget.hasBWI()) {
1917       setTruncStoreAction(MVT::v16i16,  MVT::v16i8, Legal);
1918       setTruncStoreAction(MVT::v8i16,   MVT::v8i8,  Legal);
1919     }
1920 
1921     setOperationAction(ISD::TRUNCATE, MVT::v16i32, Custom);
1922     setOperationAction(ISD::TRUNCATE, MVT::v8i64, Custom);
1923     setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
1924   }
1925 
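  // AMX tile data is modeled with the opaque x86amx type, backed by the
  // tmm0-tmm7 tile register class.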
1926   if (Subtarget.hasAMXTILE()) {
1927     addRegisterClass(MVT::x86amx, &X86::TILERegClass);
1928   }
1929 
1930   // We want to custom lower some of our intrinsics.
1931   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1932   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
1933   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
1934   if (!Subtarget.is64Bit()) {
1935     setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
1936   }
1937 
1938   // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
1939   // handle type legalization for these operations here.
1940   //
1941   // FIXME: We really should do custom legalization for addition and
1942   // subtraction on x86-32 once PR3203 is fixed.  We really can't do much better
1943   // than generic legalization for 64-bit multiplication-with-overflow, though.
1944   for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
1945     if (VT == MVT::i64 && !Subtarget.is64Bit())
1946       continue;
1947     // Add/Sub/Mul with overflow operations are custom lowered.
1948     setOperationAction(ISD::SADDO, VT, Custom);
1949     setOperationAction(ISD::UADDO, VT, Custom);
1950     setOperationAction(ISD::SSUBO, VT, Custom);
1951     setOperationAction(ISD::USUBO, VT, Custom);
1952     setOperationAction(ISD::SMULO, VT, Custom);
1953     setOperationAction(ISD::UMULO, VT, Custom);
1954 
1955     // Support carry in as value rather than glue.
1956     setOperationAction(ISD::ADDCARRY, VT, Custom);
1957     setOperationAction(ISD::SUBCARRY, VT, Custom);
1958     setOperationAction(ISD::SETCCCARRY, VT, Custom);
1959     setOperationAction(ISD::SADDO_CARRY, VT, Custom);
1960     setOperationAction(ISD::SSUBO_CARRY, VT, Custom);
1961   }
1962 
1963   if (!Subtarget.is64Bit()) {
1964     // These libcalls are not available in 32-bit.
1965     setLibcallName(RTLIB::SHL_I128, nullptr);
1966     setLibcallName(RTLIB::SRL_I128, nullptr);
1967     setLibcallName(RTLIB::SRA_I128, nullptr);
1968     setLibcallName(RTLIB::MUL_I128, nullptr);
1969   }
1970 
1971   // Combine sin / cos into _sincos_stret if it is available.
1972   if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
1973       getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
1974     setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1975     setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1976   }
1977 
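  // The Win64 ABI cannot pass i128 directly, so 128-bit divide/remainder is
  // custom lowered to libcalls that take their arguments indirectly.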
1978   if (Subtarget.isTargetWin64()) {
1979     setOperationAction(ISD::SDIV, MVT::i128, Custom);
1980     setOperationAction(ISD::UDIV, MVT::i128, Custom);
1981     setOperationAction(ISD::SREM, MVT::i128, Custom);
1982     setOperationAction(ISD::UREM, MVT::i128, Custom);
1983   }
1984 
1985   // On 32-bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
1986   // is. We should promote the value to 64 bits to solve this.
1987   // This is what the CRT headers do - `fmodf` is an inline header
1988   // function casting to f64 and calling `fmod`.
1989   if (Subtarget.is32Bit() &&
1990       (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
1991     for (ISD::NodeType Op :
1992          {ISD::FCEIL,  ISD::STRICT_FCEIL,
1993           ISD::FCOS,   ISD::STRICT_FCOS,
1994           ISD::FEXP,   ISD::STRICT_FEXP,
1995           ISD::FFLOOR, ISD::STRICT_FFLOOR,
1996           ISD::FREM,   ISD::STRICT_FREM,
1997           ISD::FLOG,   ISD::STRICT_FLOG,
1998           ISD::FLOG10, ISD::STRICT_FLOG10,
1999           ISD::FPOW,   ISD::STRICT_FPOW,
2000           ISD::FSIN,   ISD::STRICT_FSIN})
2001       if (isOperationExpand(Op, MVT::f32))
2002         setOperationAction(Op, MVT::f32, Promote);
2003 
2004   // We have target-specific dag combine patterns for the following nodes:
2005   setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
2006   setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
2007   setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
2008   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
2009   setTargetDAGCombine(ISD::CONCAT_VECTORS);
2010   setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
2011   setTargetDAGCombine(ISD::EXTRACT_SUBVECTOR);
2012   setTargetDAGCombine(ISD::BITCAST);
2013   setTargetDAGCombine(ISD::VSELECT);
2014   setTargetDAGCombine(ISD::SELECT);
2015   setTargetDAGCombine(ISD::SHL);
2016   setTargetDAGCombine(ISD::SRA);
2017   setTargetDAGCombine(ISD::SRL);
2018   setTargetDAGCombine(ISD::OR);
2019   setTargetDAGCombine(ISD::AND);
2020   setTargetDAGCombine(ISD::ADD);
2021   setTargetDAGCombine(ISD::FADD);
2022   setTargetDAGCombine(ISD::FSUB);
2023   setTargetDAGCombine(ISD::FNEG);
2024   setTargetDAGCombine(ISD::FMA);
2025   setTargetDAGCombine(ISD::STRICT_FMA);
2026   setTargetDAGCombine(ISD::FMINNUM);
2027   setTargetDAGCombine(ISD::FMAXNUM);
2028   setTargetDAGCombine(ISD::SUB);
2029   setTargetDAGCombine(ISD::LOAD);
2030   setTargetDAGCombine(ISD::MLOAD);
2031   setTargetDAGCombine(ISD::STORE);
2032   setTargetDAGCombine(ISD::MSTORE);
2033   setTargetDAGCombine(ISD::TRUNCATE);
2034   setTargetDAGCombine(ISD::ZERO_EXTEND);
2035   setTargetDAGCombine(ISD::ANY_EXTEND);
2036   setTargetDAGCombine(ISD::SIGN_EXTEND);
2037   setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
2038   setTargetDAGCombine(ISD::ANY_EXTEND_VECTOR_INREG);
2039   setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG);
2040   setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG);
2041   setTargetDAGCombine(ISD::SINT_TO_FP);
2042   setTargetDAGCombine(ISD::UINT_TO_FP);
2043   setTargetDAGCombine(ISD::STRICT_SINT_TO_FP);
2044   setTargetDAGCombine(ISD::STRICT_UINT_TO_FP);
2045   setTargetDAGCombine(ISD::SETCC);
2046   setTargetDAGCombine(ISD::MUL);
2047   setTargetDAGCombine(ISD::XOR);
2048   setTargetDAGCombine(ISD::MSCATTER);
2049   setTargetDAGCombine(ISD::MGATHER);
2050   setTargetDAGCombine(ISD::FP16_TO_FP);
2051   setTargetDAGCombine(ISD::FP_EXTEND);
2052   setTargetDAGCombine(ISD::STRICT_FP_EXTEND);
2053   setTargetDAGCombine(ISD::FP_ROUND);
2054 
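  // Derive register class properties (representative classes, costs, etc.) now
  // that all register classes and operation actions have been registered.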
2055   computeRegisterProperties(Subtarget.getRegisterInfo());
2056 
2057   MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
2058   MaxStoresPerMemsetOptSize = 8;
2059   MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
2060   MaxStoresPerMemcpyOptSize = 4;
2061   MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
2062   MaxStoresPerMemmoveOptSize = 4;
2063 
2064   // TODO: These control memcmp expansion in CGP and could be raised higher, but
2065   // that needs to be benchmarked and balanced with the potential use of vector
2066   // load/store types (PR33329, PR33914).
2067   MaxLoadsPerMemcmp = 2;
2068   MaxLoadsPerMemcmpOptSize = 2;
2069 
2070   // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).
2071   setPrefLoopAlignment(Align(1ULL << ExperimentalPrefLoopAlignment));
2072 
2073   // An out-of-order CPU can speculatively execute past a predictable branch,
2074   // but a conditional move could be stalled by an expensive earlier operation.
2075   PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
2076   EnableExtLdPromotion = true;
2077   setPrefFunctionAlignment(Align(16));
2078 
2079   verifyIntrinsicTables();
2080 
2081   // Default to having -disable-strictnode-mutation on
2082   IsStrictFPEnabled = true;
2083 }
2084 
2085 // This has so far only been implemented for 64-bit MachO.
2086 bool X86TargetLowering::useLoadStackGuardNode() const {
2087   return Subtarget.isTargetMachO() && Subtarget.is64Bit();
2088 }
2089 
2090 bool X86TargetLowering::useStackGuardXorFP() const {
2091   // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
2092   return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
2093 }
2094 
2095 SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
2096                                                const SDLoc &DL) const {
2097   EVT PtrTy = getPointerTy(DAG.getDataLayout());
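  // XOR32_FP/XOR64_FP are pseudo instructions that XOR the frame pointer into
  // Val; they are expanded once the frame layout is known.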
2098   unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
2099   MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
2100   return SDValue(Node, 0);
2101 }
2102 
2103 TargetLoweringBase::LegalizeTypeAction
2104 X86TargetLowering::getPreferredVectorAction(MVT VT) const {
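  // Without BWI the widest legal mask type is v16i1, so wider i1 vectors are
  // split rather than widened.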
2105   if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
2106       !Subtarget.hasBWI())
2107     return TypeSplitVector;
2108 
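  // For all other fixed vectors with more than one element and a non-i1
  // element type, prefer widening to the next legal type over splitting.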
2109   if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2110       VT.getVectorElementType() != MVT::i1)
2111     return TypeWidenVector;
2112 
2113   return TargetLoweringBase::getPreferredVectorAction(VT);
2114 }
2115 
2116 static std::pair<MVT, unsigned>
2117 handleMaskRegisterForCallingConv(unsigned NumElts, CallingConv::ID CC,
2118                                  const X86Subtarget &Subtarget) {
2119   // v2i1/v4i1/v8i1/v16i1 all pass in xmm registers unless the calling
2120   // convention is one that uses k registers.
2121   if (NumElts == 2)
2122     return {MVT::v2i64, 1};
2123   if (NumElts == 4)
2124     return {MVT::v4i32, 1};
2125   if (NumElts == 8 && CC != CallingConv::X86_RegCall &&
2126       CC != CallingConv::Intel_OCL_BI)
2127     return {MVT::v8i16, 1};
2128   if (NumElts == 16 && CC != CallingConv::X86_RegCall &&
2129       CC != CallingConv::Intel_OCL_BI)
2130     return {MVT::v16i8, 1};
2131   // v32i1 passes in ymm unless we have BWI and the calling convention is
2132   // regcall.
2133   if (NumElts == 32 && (!Subtarget.hasBWI() || CC != CallingConv::X86_RegCall))
2134     return {MVT::v32i8, 1};
2135   // Split v64i1 vectors if we don't have v64i8 available.
2136   if (NumElts == 64 && Subtarget.hasBWI() && CC != CallingConv::X86_RegCall) {
2137     if (Subtarget.useAVX512Regs())
2138       return {MVT::v64i8, 1};
2139     return {MVT::v32i8, 2};
2140   }
2141 
2142   // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
2143   if (!isPowerOf2_32(NumElts) || (NumElts == 64 && !Subtarget.hasBWI()) ||
2144       NumElts > 64)
2145     return {MVT::i8, NumElts};
2146 
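  // Signal the caller to fall back to the default lowering.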
2147   return {MVT::INVALID_SIMPLE_VALUE_TYPE, 0};
2148 }
2149 
2150 MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
2151                                                      CallingConv::ID CC,
2152                                                      EVT VT) const {
2153   if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
2154       Subtarget.hasAVX512()) {
2155     unsigned NumElts = VT.getVectorNumElements();
2156 
2157     MVT RegisterVT;
2158     unsigned NumRegisters;
2159     std::tie(RegisterVT, NumRegisters) =
2160         handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
2161     if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
2162       return RegisterVT;
2163   }
2164 
2165   return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
2166 }
2167 
2168 unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
2169                                                           CallingConv::ID CC,
2170                                                           EVT VT) const {
2171   if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
2172       Subtarget.hasAVX512()) {
2173     unsigned NumElts = VT.getVectorNumElements();
2174 
2175     MVT RegisterVT;
2176     unsigned NumRegisters;
2177     std::tie(RegisterVT, NumRegisters) =
2178         handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
2179     if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
2180       return NumRegisters;
2181   }
2182 
2183   return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
2184 }
2185 
2186 unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv(
2187     LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
2188     unsigned &NumIntermediates, MVT &RegisterVT) const {
2189   // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
2190   if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
2191       Subtarget.hasAVX512() &&
2192       (!isPowerOf2_32(VT.getVectorNumElements()) ||
2193        (VT.getVectorNumElements() == 64 && !Subtarget.hasBWI()) ||
2194        VT.getVectorNumElements() > 64)) {
2195     RegisterVT = MVT::i8;
2196     IntermediateVT = MVT::i1;
2197     NumIntermediates = VT.getVectorNumElements();
2198     return NumIntermediates;
2199   }
2200 
2201   // Split v64i1 vectors if we don't have v64i8 available.
2202   if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
2203       CC != CallingConv::X86_RegCall) {
2204     RegisterVT = MVT::v32i8;
2205     IntermediateVT = MVT::v32i1;
2206     NumIntermediates = 2;
2207     return 2;
2208   }
2209 
2210   return TargetLowering::getVectorTypeBreakdownForCallingConv(Context, CC, VT, IntermediateVT,
2211                                               NumIntermediates, RegisterVT);
2212 }
2213 
2214 EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
2215                                           LLVMContext& Context,
2216                                           EVT VT) const {
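  // Scalar comparisons produce an i8 result, matching the SETcc instructions.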
2217   if (!VT.isVector())
2218     return MVT::i8;
2219 
2220   if (Subtarget.hasAVX512()) {
2221     // Figure out what this type will be legalized to.
2222     EVT LegalVT = VT;
2223     while (getTypeAction(Context, LegalVT) != TypeLegal)
2224       LegalVT = getTypeToTransformTo(Context, LegalVT);
2225 
2226     // If we got a 512-bit vector then we'll definitely have a vXi1 compare.
2227     if (LegalVT.getSimpleVT().is512BitVector())
2228       return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());
2229 
2230     if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) {
2231       // If we legalized to less than a 512-bit vector, then we will use a vXi1
2232       // compare for vXi32/vXi64 for sure. If we have BWI we will also support
2233       // vXi16/vXi8.
2234       MVT EltVT = LegalVT.getSimpleVT().getVectorElementType();
2235       if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32)
2236         return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());
2237     }
2238   }
2239 
2240   return VT.changeVectorElementTypeToInteger();
2241 }
2242 
2243 /// Helper for getByValTypeAlignment to determine
2244 /// the desired ByVal argument alignment.
2245 static void getMaxByValAlign(Type *Ty, Align &MaxAlign) {
2246   if (MaxAlign == 16)
2247     return;
2248   if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
2249     if (VTy->getPrimitiveSizeInBits().getFixedSize() == 128)
2250       MaxAlign = Align(16);
2251   } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
2252     Align EltAlign;
2253     getMaxByValAlign(ATy->getElementType(), EltAlign);
2254     if (EltAlign > MaxAlign)
2255       MaxAlign = EltAlign;
2256   } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
2257     for (auto *EltTy : STy->elements()) {
2258       Align EltAlign;
2259       getMaxByValAlign(EltTy, EltAlign);
2260       if (EltAlign > MaxAlign)
2261         MaxAlign = EltAlign;
2262       if (MaxAlign == 16)
2263         break;
2264     }
2265   }
2266 }
2267 
2268 /// Return the desired alignment for ByVal aggregate
2269 /// function arguments in the caller parameter area. For X86, aggregates
2270 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
2271 /// are at 4-byte boundaries.
2272 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
2273                                                   const DataLayout &DL) const {
2274   if (Subtarget.is64Bit()) {
2275     // Max of 8 and alignment of type.
2276     Align TyAlign = DL.getABITypeAlign(Ty);
2277     if (TyAlign > 8)
2278       return TyAlign.value();
2279     return 8;
2280   }
2281 
2282   Align Alignment(4);
2283   if (Subtarget.hasSSE1())
2284     getMaxByValAlign(Ty, Alignment);
2285   return Alignment.value();
2286 }
2287 
2288 /// Returns the optimal type to use for a memory operation, or EVT::Other if
2289 /// the type should be determined using generic target-independent logic.
2290 /// For vector ops we check that the overall size isn't larger than our
2291 /// preferred vector width.
2292 EVT X86TargetLowering::getOptimalMemOpType(
2293     const MemOp &Op, const AttributeList &FuncAttributes) const {
2294   if (!FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) {
2295     if (Op.size() >= 16 &&
2296         (!Subtarget.isUnalignedMem16Slow() || Op.isAligned(Align(16)))) {
2297       // FIXME: Check if unaligned 64-byte accesses are slow.
2298       if (Op.size() >= 64 && Subtarget.hasAVX512() &&
2299           (Subtarget.getPreferVectorWidth() >= 512)) {
2300         return Subtarget.hasBWI() ? MVT::v64i8 : MVT::v16i32;
2301       }
2302       // FIXME: Check if unaligned 32-byte accesses are slow.
2303       if (Op.size() >= 32 && Subtarget.hasAVX() &&
2304           (Subtarget.getPreferVectorWidth() >= 256)) {
2305         // Although this isn't a well-supported type for AVX1, we'll let
2306         // legalization and shuffle lowering produce the optimal codegen. If we
2307         // choose an optimal type with a vector element larger than a byte,
2308         // getMemsetStores() may create an intermediate splat (using an integer
2309         // multiply) before we splat as a vector.
2310         return MVT::v32i8;
2311       }
2312       if (Subtarget.hasSSE2() && (Subtarget.getPreferVectorWidth() >= 128))
2313         return MVT::v16i8;
2314       // TODO: Can SSE1 handle a byte vector?
2315       // If we have SSE1 registers we should be able to use them.
2316       if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) &&
2317           (Subtarget.getPreferVectorWidth() >= 128))
2318         return MVT::v4f32;
2319     } else if (((Op.isMemcpy() && !Op.isMemcpyStrSrc()) || Op.isZeroMemset()) &&
2320                Op.size() >= 8 && !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
2321       // Do not use f64 to lower memcpy if source is string constant. It's
2322       // better to use i32 to avoid the loads.
2323       // Also, do not use f64 to lower memset unless this is a memset of zeros.
2324       // The gymnastics of splatting a byte value into an XMM register and then
2325       // only using 8-byte stores (because this is a CPU with slow unaligned
2326       // 16-byte accesses) makes that a loser.
2327       return MVT::f64;
2328     }
2329   }
2330   // This is a compromise. If we reach here, unaligned accesses may be slow on
2331   // this target. However, creating smaller, aligned accesses could be even
2332   // slower and would certainly be a lot more code.
2333   if (Subtarget.is64Bit() && Op.size() >= 8)
2334     return MVT::i64;
2335   return MVT::i32;
2336 }
2337 
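/// Returns true if VT can safely be used to lower a memory operation;
/// scalar f32/f64 are only safe when they are handled in SSE registers
/// rather than on the x87 stack.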
2338 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
2339   if (VT == MVT::f32)
2340     return X86ScalarSSEf32;
2341   if (VT == MVT::f64)
2342     return X86ScalarSSEf64;
2343   return true;
2344 }
2345 
2346 bool X86TargetLowering::allowsMisalignedMemoryAccesses(
2347     EVT VT, unsigned, Align Alignment, MachineMemOperand::Flags Flags,
2348     bool *Fast) const {
2349   if (Fast) {
2350     switch (VT.getSizeInBits()) {
2351     default:
2352       // 8-byte and under are always assumed to be fast.
2353       *Fast = true;
2354       break;
2355     case 128:
2356       *Fast = !Subtarget.isUnalignedMem16Slow();
2357       break;
2358     case 256:
2359       *Fast = !Subtarget.isUnalignedMem32Slow();
2360       break;
2361     // TODO: What about AVX-512 (512-bit) accesses?
2362     }
2363   }
2364   // NonTemporal vector memory ops must be aligned.
2365   if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
2366     // NT loads can only be vector aligned, so if it's less aligned than the
2367     // minimum vector size (which we can split the vector down to), we might as
2368     // well use a regular unaligned vector load.
2369     // We don't have any NT loads pre-SSE41.
2370     if (!!(Flags & MachineMemOperand::MOLoad))
2371       return (Alignment < 16 || !Subtarget.hasSSE41());
2372     return false;
2373   }
2374   // Misaligned accesses of any size are always allowed.
2375   return true;
2376 }
2377 
2378 /// Return the entry encoding for a jump table in the
2379 /// current function.  The returned value is a member of the
2380 /// MachineJumpTableInfo::JTEntryKind enum.
2381 unsigned X86TargetLowering::getJumpTableEncoding() const {
2382   // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
2383   // symbol.
2384   if (isPositionIndependent() && Subtarget.isPICStyleGOT())
2385     return MachineJumpTableInfo::EK_Custom32;
2386 
2387   // Otherwise, use the normal jump table encoding heuristics.
2388   return TargetLowering::getJumpTableEncoding();
2389 }
2390 
2391 bool X86TargetLowering::useSoftFloat() const {
2392   return Subtarget.useSoftFloat();
2393 }
2394 
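/// For 32-bit C / StdCall libcalls, mark the leading integer (or pointer)
/// arguments as 'inreg' when the module requests register parameters
/// (NumRegisterParameters), so libcalls follow the same regparm ABI as
/// ordinary calls.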
2395 void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
2396                                               ArgListTy &Args) const {
2397 
2398   // Only relabel X86-32 for C / Stdcall CCs.
2399   if (Subtarget.is64Bit())
2400     return;
2401   if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
2402     return;
2403   unsigned ParamRegs = 0;
2404   if (auto *M = MF->getFunction().getParent())
2405     ParamRegs = M->getNumberRegisterParameters();
2406 
2407   // Mark the first N integer arguments as being passed in registers.
2408   for (auto &Arg : Args) {
2409     Type *T = Arg.Ty;
2410     if (T->isIntOrPtrTy())
2411       if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
2412         unsigned numRegs = 1;
2413         if (MF->getDataLayout().getTypeAllocSize(T) > 4)
2414           numRegs = 2;
2415         if (ParamRegs < numRegs)
2416           return;
2417         ParamRegs -= numRegs;
2418         Arg.IsInReg = true;
2419       }
2420   }
2421 }
2422 
2423 const MCExpr *
2424 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
2425                                              const MachineBasicBlock *MBB,
2426                                              unsigned uid, MCContext &Ctx) const {
2427   assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
2428   // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
2429   // entries.
2430   return MCSymbolRefExpr::create(MBB->getSymbol(),
2431                                  MCSymbolRefExpr::VK_GOTOFF, Ctx);
2432 }
2433 
2434 /// Returns relocation base for the given PIC jumptable.
2435 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
2436                                                     SelectionDAG &DAG) const {
2437   if (!Subtarget.is64Bit())
2438     // This doesn't have SDLoc associated with it, but is not really the
2439     // same as a Register.
2440     return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
2441                        getPointerTy(DAG.getDataLayout()));
2442   return Table;
2443 }
2444 
2445 /// This returns the relocation base for the given PIC jumptable,
2446 /// the same as getPICJumpTableRelocBase, but as an MCExpr.
2447 const MCExpr *X86TargetLowering::
2448 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
2449                              MCContext &Ctx) const {
2450   // X86-64 uses RIP relative addressing based on the jump table label.
2451   if (Subtarget.isPICStyleRIPRel())
2452     return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
2453 
2454   // Otherwise, the reference is relative to the PIC base.
2455   return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
2456 }
2457 
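/// Picks a representative register class (and spill cost) for each legal value
/// type, used by register-pressure heuristics: integer types map to GR32/GR64,
/// MMX to VR64, and all other vector/FP types to VR128X.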
2458 std::pair<const TargetRegisterClass *, uint8_t>
2459 X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
2460                                            MVT VT) const {
2461   const TargetRegisterClass *RRC = nullptr;
2462   uint8_t Cost = 1;
2463   switch (VT.SimpleTy) {
2464   default:
2465     return TargetLowering::findRepresentativeClass(TRI, VT);
2466   case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
2467     RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
2468     break;
2469   case MVT::x86mmx:
2470     RRC = &X86::VR64RegClass;
2471     break;
2472   case MVT::f32: case MVT::f64:
2473   case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
2474   case MVT::v4f32: case MVT::v2f64:
2475   case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
2476   case MVT::v8f32: case MVT::v4f64:
2477   case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
2478   case MVT::v16f32: case MVT::v8f64:
2479     RRC = &X86::VR128XRegClass;
2480     break;
2481   }
2482   return std::make_pair(RRC, Cost);
2483 }
2484 
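/// Returns the segment address space used for the stack protector and safe
/// stack slots: GS (256) on 32-bit targets and for the 64-bit Kernel code
/// model, FS (257) otherwise.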
2485 unsigned X86TargetLowering::getAddressSpace() const {
2486   if (Subtarget.is64Bit())
2487     return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
2488   return 256;
2489 }
2490 
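/// Returns true if the target's TLS ABI reserves a fixed slot for the stack
/// guard (glibc, Fuchsia, and Android API level 17 or newer).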
2491 static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
2492   return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
2493          (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
2494 }
2495 
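/// Builds a constant pointer into the given segment address space at the given
/// offset, used to address %fs:/%gs:-relative slots such as the stack guard
/// and SafeStack pointers.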
2496 static Constant* SegmentOffset(IRBuilder<> &IRB,
2497                                int Offset, unsigned AddressSpace) {
2498   return ConstantExpr::getIntToPtr(
2499       ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
2500       Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
2501 }
2502 
2503 Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
2504   // glibc, bionic, and Fuchsia have a special slot for the stack guard in
2505   // tcbhead_t; use it instead of the usual global variable (see
2506   // sysdeps/{i386,x86_64}/nptl/tls.h)
2507   if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
2508     if (Subtarget.isTargetFuchsia()) {
2509       // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
2510       return SegmentOffset(IRB, 0x10, getAddressSpace());
2511     } else {
2512       unsigned AddressSpace = getAddressSpace();
2513       Module *M = IRB.GetInsertBlock()->getParent()->getParent();
2514       // Some users may customize the base register and offset.
2515       int Offset = M->getStackProtectorGuardOffset();
2516       // If -stack-protector-guard-offset was not specified, default to
2517       // %fs:0x28, unless we're using a Kernel code model, in which case
2518       // it's %gs:0x28.  On i386 it's %gs:0x14.
2519       if (Offset == INT_MAX)
2520         Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
2521 
2522       StringRef GuardReg = M->getStackProtectorGuardReg();
2523       if (GuardReg == "fs")
2524         AddressSpace = X86AS::FS;
2525       else if (GuardReg == "gs")
2526         AddressSpace = X86AS::GS;
2527       return SegmentOffset(IRB, Offset, AddressSpace);
2528     }
2529   }
2530   return TargetLowering::getIRStackGuard(IRB);
2531 }
2532 
2533 void X86TargetLowering::insertSSPDeclarations(Module &M) const {
2534   // MSVC CRT provides functionality for stack protection.
2535   if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
2536       Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
2537     // MSVC CRT has a global variable holding security cookie.
2538     M.getOrInsertGlobal("__security_cookie",
2539                         Type::getInt8PtrTy(M.getContext()));
2540 
2541     // MSVC CRT has a function to validate security cookie.
2542     FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
2543         "__security_check_cookie", Type::getVoidTy(M.getContext()),
2544         Type::getInt8PtrTy(M.getContext()));
2545     if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
2546       F->setCallingConv(CallingConv::X86_FastCall);
2547       F->addAttribute(1, Attribute::AttrKind::InReg);
2548     }
2549     return;
2550   }
2551 
2552   StringRef GuardMode = M.getStackProtectorGuard();
2553 
2554   // glibc, bionic, and Fuchsia have a special slot for the stack guard.
2555   if ((GuardMode == "tls" || GuardMode.empty()) &&
2556       hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
2557     return;
2558   TargetLowering::insertSSPDeclarations(M);
2559 }
2560 
2561 Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
2562   // MSVC CRT has a global variable holding security cookie.
2563   if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
2564       Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
2565     return M.getGlobalVariable("__security_cookie");
2566   }
2567   return TargetLowering::getSDagStackGuard(M);
2568 }
2569 
2570 Function *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
2571   // MSVC CRT has a function to validate security cookie.
2572   if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
2573       Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
2574     return M.getFunction("__security_check_cookie");
2575   }
2576   return TargetLowering::getSSPStackGuardCheck(M);
2577 }
2578 
2579 Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
2580   if (Subtarget.getTargetTriple().isOSContiki())
2581     return getDefaultSafeStackPointerLocation(IRB, false);
2582 
2583   // Android provides a fixed TLS slot for the SafeStack pointer. See the
2584   // definition of TLS_SLOT_SAFESTACK in
2585   // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
2586   if (Subtarget.isTargetAndroid()) {
2587     // %fs:0x48, unless we're using a Kernel code model, in which case it's
2588     // %gs:0x48.  On i386 it's %gs:0x24.
2589     int Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
2590     return SegmentOffset(IRB, Offset, getAddressSpace());
2591   }
2592 
2593   // Fuchsia is similar.
2594   if (Subtarget.isTargetFuchsia()) {
2595     // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
2596     return SegmentOffset(IRB, 0x18, getAddressSpace());
2597   }
2598 
2599   return TargetLowering::getSafeStackPointerLocation(IRB);
2600 }
2601 
2602 //===----------------------------------------------------------------------===//
2603 //               Return Value Calling Convention Implementation
2604 //===----------------------------------------------------------------------===//
2605 
2606 bool X86TargetLowering::CanLowerReturn(
2607     CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
2608     const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
2609   SmallVector<CCValAssign, 16> RVLocs;
2610   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2611   return CCInfo.CheckReturn(Outs, RetCC_X86);
2612 }
2613 
2614 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
2615   static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
2616   return ScratchRegs;
2617 }
2618 
2619 /// Lowers mask values (v*i1) to the corresponding local register values.
2620 /// \returns the DAG node after lowering to the register type.
2621 static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
2622                                const SDLoc &Dl, SelectionDAG &DAG) {
2623   EVT ValVT = ValArg.getValueType();
2624 
2625   if (ValVT == MVT::v1i1)
2626     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Dl, ValLoc, ValArg,
2627                        DAG.getIntPtrConstant(0, Dl));
2628 
2629   if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
2630       (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
2631     // Two stage lowering might be required
2632     // bitcast:   v8i1 -> i8 / v16i1 -> i16
2633     // anyextend: i8   -> i32 / i16   -> i32
2634     EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
2635     SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
2636     if (ValLoc == MVT::i32)
2637       ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
2638     return ValToCopy;
2639   }
2640 
2641   if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
2642       (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
2643     // One stage lowering is required
2644     // bitcast:   v32i1 -> i32 / v64i1 -> i64
2645     return DAG.getBitcast(ValLoc, ValArg);
2646   }
2647 
2648   return DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValArg);
2649 }
2650 
2651 /// Breaks v64i1 value into two registers and adds the new node to the DAG
2652 static void Passv64i1ArgInRegs(
2653     const SDLoc &Dl, SelectionDAG &DAG, SDValue &Arg,
2654     SmallVectorImpl<std::pair<Register, SDValue>> &RegsToPass, CCValAssign &VA,
2655     CCValAssign &NextVA, const X86Subtarget &Subtarget) {
2656   assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
2657   assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2658   assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
2659   assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2660          "The value should reside in two registers");
2661 
2662   // Before splitting the value we cast it to i64
2663   Arg = DAG.getBitcast(MVT::i64, Arg);
2664 
2665   // Splitting the value into two i32 types
2666   SDValue Lo, Hi;
2667   Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2668                    DAG.getConstant(0, Dl, MVT::i32));
2669   Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2670                    DAG.getConstant(1, Dl, MVT::i32));
2671 
2672   // Attach the two i32 halves to their corresponding registers.
2673   RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
2674   RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
2675 }
2676 
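/// Lowers the outgoing return values into the appropriate physical registers
/// and emits the final X86ISD::RET_FLAG (or IRET for interrupt handlers) node.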
2677 SDValue
2678 X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2679                                bool isVarArg,
2680                                const SmallVectorImpl<ISD::OutputArg> &Outs,
2681                                const SmallVectorImpl<SDValue> &OutVals,
2682                                const SDLoc &dl, SelectionDAG &DAG) const {
2683   MachineFunction &MF = DAG.getMachineFunction();
2684   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2685 
2686   // In some cases we need to disable registers from the default CSR list.
2687   // For example, when they are used for argument passing.
2688   bool ShouldDisableCalleeSavedRegister =
2689       CallConv == CallingConv::X86_RegCall ||
2690       MF.getFunction().hasFnAttribute("no_caller_saved_registers");
2691 
2692   if (CallConv == CallingConv::X86_INTR && !Outs.empty())
2693     report_fatal_error("X86 interrupts may not return any value");
2694 
2695   SmallVector<CCValAssign, 16> RVLocs;
2696   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
2697   CCInfo.AnalyzeReturn(Outs, RetCC_X86);
2698 
2699   SmallVector<std::pair<Register, SDValue>, 4> RetVals;
2700   for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
2701        ++I, ++OutsIndex) {
2702     CCValAssign &VA = RVLocs[I];
2703     assert(VA.isRegLoc() && "Can only return in registers!");
2704 
2705     // Add the register to the CalleeSaveDisableRegs list.
2706     if (ShouldDisableCalleeSavedRegister)
2707       MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());
2708 
2709     SDValue ValToCopy = OutVals[OutsIndex];
2710     EVT ValVT = ValToCopy.getValueType();
2711 
2712     // Promote values to the appropriate types.
2713     if (VA.getLocInfo() == CCValAssign::SExt)
2714       ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
2715     else if (VA.getLocInfo() == CCValAssign::ZExt)
2716       ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
2717     else if (VA.getLocInfo() == CCValAssign::AExt) {
2718       if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
2719         ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
2720       else
2721         ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
2722     }
2723     else if (VA.getLocInfo() == CCValAssign::BCvt)
2724       ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
2725 
2726     assert(VA.getLocInfo() != CCValAssign::FPExt &&
2727            "Unexpected FP-extend for return value.");
2728 
2729     // Report an error if we have attempted to return a value via an XMM
2730     // register and SSE was disabled.
2731     if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
2732       errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
2733       VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2734     } else if (!Subtarget.hasSSE2() &&
2735                X86::FR64XRegClass.contains(VA.getLocReg()) &&
2736                ValVT == MVT::f64) {
2737       // When returning a double via an XMM register, report an error if SSE2 is
2738       // not enabled.
2739       errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
2740       VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2741     }
2742 
2743     // Returns in ST0/ST1 are handled specially: these are pushed as operands to
2744     // the RET instruction and handled by the FP Stackifier.
2745     if (VA.getLocReg() == X86::FP0 ||
2746         VA.getLocReg() == X86::FP1) {
2747       // If this is a copy from an xmm register to ST(0), use an FPExtend to
2748       // change the value to the FP stack register class.
2749       if (isScalarFPTypeInSSEReg(VA.getValVT()))
2750         ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
2751       RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
2752       // Don't emit a copytoreg.
2753       continue;
2754     }
2755 
2756     // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
2757     // which is returned in RAX / RDX.
2758     if (Subtarget.is64Bit()) {
2759       if (ValVT == MVT::x86mmx) {
2760         if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
2761           ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
2762           ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
2763                                   ValToCopy);
2764           // If we don't have SSE2 available, convert to v4f32 so the generated
2765           // register is legal.
2766           if (!Subtarget.hasSSE2())
2767             ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
2768         }
2769       }
2770     }
2771 
2772     if (VA.needsCustom()) {
2773       assert(VA.getValVT() == MVT::v64i1 &&
2774              "Currently the only custom case is when we split v64i1 to 2 regs");
2775 
2776       Passv64i1ArgInRegs(dl, DAG, ValToCopy, RetVals, VA, RVLocs[++I],
2777                          Subtarget);
2778 
2779       // Add the second register to the CalleeSaveDisableRegs list.
2780       if (ShouldDisableCalleeSavedRegister)
2781         MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
2782     } else {
2783       RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
2784     }
2785   }
2786 
2787   SDValue Flag;
2788   SmallVector<SDValue, 6> RetOps;
2789   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2790   // Operand #1 = Bytes To Pop
2791   RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
2792                    MVT::i32));
2793 
2794   // Copy the result values into the output registers.
2795   for (auto &RetVal : RetVals) {
2796     if (RetVal.first == X86::FP0 || RetVal.first == X86::FP1) {
2797       RetOps.push_back(RetVal.second);
2798       continue; // Don't emit a copytoreg.
2799     }
2800 
2801     Chain = DAG.getCopyToReg(Chain, dl, RetVal.first, RetVal.second, Flag);
2802     Flag = Chain.getValue(1);
2803     RetOps.push_back(
2804         DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
2805   }
2806 
2807   // Swift calling convention does not require we copy the sret argument
2808   // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
2809 
2810   // All x86 ABIs require that for returning structs by value we copy
2811   // the sret argument into %rax/%eax (depending on ABI) for the return.
2812   // We saved the argument into a virtual register in the entry block,
2813   // so now we copy the value out and into %rax/%eax.
2814   //
2815   // Checking Function.hasStructRetAttr() here is insufficient because the IR
2816   // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
2817   // false, then an sret argument may be implicitly inserted in the SelDAG. In
2818   // either case FuncInfo->setSRetReturnReg() will have been called.
2819   if (Register SRetReg = FuncInfo->getSRetReturnReg()) {
2820     // When we have both sret and another return value, we should use the
2821     // original Chain stored in RetOps[0], instead of the current Chain updated
2822     // in the above loop. If we only have sret, RetOps[0] equals Chain.
2823 
2824     // For the case of sret and another return value, we have
2825     //   Chain_0 at the function entry
2826     //   Chain_1 = getCopyToReg(Chain_0) in the above loop
2827     // If we use Chain_1 in getCopyFromReg, we will have
2828     //   Val = getCopyFromReg(Chain_1)
2829     //   Chain_2 = getCopyToReg(Chain_1, Val) from below
2830 
2831     // getCopyToReg(Chain_0) will be glued together with
2832     // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
2833     // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
2834     //   Data dependency from Unit B to Unit A due to usage of Val in
2835     //     getCopyToReg(Chain_1, Val)
2836     //   Chain dependency from Unit A to Unit B
2837 
2838     // So here, we use RetOps[0] (i.e. Chain_0) for getCopyFromReg.
2839     SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
2840                                      getPointerTy(MF.getDataLayout()));
2841 
2842     Register RetValReg
2843         = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
2844           X86::RAX : X86::EAX;
2845     Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
2846     Flag = Chain.getValue(1);
2847 
2848     // RAX/EAX now acts like a return value.
2849     RetOps.push_back(
2850         DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
2851 
2852     // Add the returned register to the CalleeSaveDisableRegs list.
2853     if (ShouldDisableCalleeSavedRegister)
2854       MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
2855   }
2856 
2857   const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
2858   const MCPhysReg *I =
2859       TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
2860   if (I) {
2861     for (; *I; ++I) {
2862       if (X86::GR64RegClass.contains(*I))
2863         RetOps.push_back(DAG.getRegister(*I, MVT::i64));
2864       else
2865         llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2866     }
2867   }
2868 
2869   RetOps[0] = Chain;  // Update chain.
2870 
2871   // Add the flag if we have it.
2872   if (Flag.getNode())
2873     RetOps.push_back(Flag);
2874 
2875   X86ISD::NodeType opcode = X86ISD::RET_FLAG;
2876   if (CallConv == CallingConv::X86_INTR)
2877     opcode = X86ISD::IRET;
2878   return DAG.getNode(opcode, dl, MVT::Other, RetOps);
2879 }
2880 
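/// Returns true if the value produced by N is only consumed by a return
/// (possibly through a glue-free CopyToReg or an FP_EXTEND), so the producing
/// call may be turned into a tail call; Chain is updated to the chain feeding
/// that copy.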
2881 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
2882   if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
2883     return false;
2884 
2885   SDValue TCChain = Chain;
2886   SDNode *Copy = *N->use_begin();
2887   if (Copy->getOpcode() == ISD::CopyToReg) {
2888     // If the copy has a glue operand, we conservatively assume it isn't safe to
2889     // perform a tail call.
2890     if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2891       return false;
2892     TCChain = Copy->getOperand(0);
2893   } else if (Copy->getOpcode() != ISD::FP_EXTEND)
2894     return false;
2895 
2896   bool HasRet = false;
2897   for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
2898        UI != UE; ++UI) {
2899     if (UI->getOpcode() != X86ISD::RET_FLAG)
2900       return false;
2901     // If we are returning more than one value, we can definitely
2902     // not make a tail call; see PR19530.
2903     if (UI->getNumOperands() > 4)
2904       return false;
2905     if (UI->getNumOperands() == 4 &&
2906         UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
2907       return false;
2908     HasRet = true;
2909   }
2910 
2911   if (!HasRet)
2912     return false;
2913 
2914   Chain = TCChain;
2915   return true;
2916 }
2917 
2918 EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
2919                                            ISD::NodeType ExtendKind) const {
2920   MVT ReturnMVT = MVT::i32;
2921 
2922   bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
2923   if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
2924     // The ABI does not require i1, i8 or i16 to be extended.
2925     //
2926     // On Darwin, there is code in the wild relying on Clang's old behaviour of
2927     // always extending i8/i16 return values, so keep doing that for now.
2928     // (PR26665).
2929     ReturnMVT = MVT::i8;
2930   }
2931 
2932   EVT MinVT = getRegisterType(Context, ReturnMVT);
2933   return VT.bitsLT(MinVT) ? MinVT : VT;
2934 }
2935 
2936 /// Reads two 32 bit registers and creates a 64 bit mask value.
2937 /// \param VA The current 32 bit value that needs to be assigned.
2938 /// \param NextVA The next 32 bit value that needs to be assigned.
2939 /// \param Root The parent DAG node.
2940 /// \param [in,out] InFlag Represents the SDValue in the parent DAG node used
2941 ///                        for glue purposes. If the DAG is already using a
2942 ///                        physical register instead of a virtual one, we
2943 ///                        glue our new SDValue to the InFlag SDValue.
2944 /// \return a new 64 bit SDValue.
2945 static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
2946                                 SDValue &Root, SelectionDAG &DAG,
2947                                 const SDLoc &Dl, const X86Subtarget &Subtarget,
2948                                 SDValue *InFlag = nullptr) {
2949   assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
2950   assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2951   assert(VA.getValVT() == MVT::v64i1 &&
2952          "Expecting first location of 64 bit width type");
2953   assert(NextVA.getValVT() == VA.getValVT() &&
2954          "The locations should have the same type");
2955   assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2956          "The values should reside in two registers");
2957 
2958   SDValue Lo, Hi;
2959   SDValue ArgValueLo, ArgValueHi;
2960 
2961   MachineFunction &MF = DAG.getMachineFunction();
2962   const TargetRegisterClass *RC = &X86::GR32RegClass;
2963 
2964   // Read a 32 bit value from the registers.
2965   if (nullptr == InFlag) {
2966     // When no physical register is present,
2967     // create an intermediate virtual register.
2968     Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
2969     ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2970     Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
2971     ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2972   } else {
2973     // When a physical register is available read the value from it and glue
2974     // the reads together.
2975     ArgValueLo =
2976       DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
2977     *InFlag = ArgValueLo.getValue(2);
2978     ArgValueHi =
2979       DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
2980     *InFlag = ArgValueHi.getValue(2);
2981   }
2982 
2983   // Convert the i32 type into v32i1 type.
2984   Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
2985 
2986   // Convert the i32 type into v32i1 type.
2987   Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
2988 
2989   // Concatenate the two values together.
2990   return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
2991 }
2992 
2993 /// Lowers a register of various sizes (8/16/32/64)
2994 /// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1).
2995 /// \returns a DAG node containing the operand after lowering to the mask type.
2996 static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
2997                                const EVT &ValLoc, const SDLoc &Dl,
2998                                SelectionDAG &DAG) {
2999   SDValue ValReturned = ValArg;
3000 
3001   if (ValVT == MVT::v1i1)
3002     return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned);
3003 
3004   if (ValVT == MVT::v64i1) {
3005     // On 32 bit targets this case is handled by getv64i1Argument.
3006     assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
3007     // On 64 bit targets there is no need to truncate the value, only to bitcast it.
3008   } else {
3009     MVT maskLen;
3010     switch (ValVT.getSimpleVT().SimpleTy) {
3011     case MVT::v8i1:
3012       maskLen = MVT::i8;
3013       break;
3014     case MVT::v16i1:
3015       maskLen = MVT::i16;
3016       break;
3017     case MVT::v32i1:
3018       maskLen = MVT::i32;
3019       break;
3020     default:
3021       llvm_unreachable("Expecting a vector of i1 types");
3022     }
3023 
3024     ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
3025   }
3026   return DAG.getBitcast(ValVT, ValReturned);
3027 }
3028 
3029 /// Lower the result values of a call into the
3030 /// appropriate copies out of appropriate physical registers.
3031 ///
3032 SDValue X86TargetLowering::LowerCallResult(
3033     SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
3034     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3035     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
3036     uint32_t *RegMask) const {
3037 
3038   const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
3039   // Assign locations to each value returned by this call.
3040   SmallVector<CCValAssign, 16> RVLocs;
3041   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3042                  *DAG.getContext());
3043   CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
3044 
3045   // Copy all of the result registers out of their specified physreg.
3046   for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
3047        ++I, ++InsIndex) {
3048     CCValAssign &VA = RVLocs[I];
3049     EVT CopyVT = VA.getLocVT();
3050 
3051     // In some calling conventions we need to remove the used registers
3052     // from the register mask.
3053     if (RegMask) {
3054       for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
3055            SubRegs.isValid(); ++SubRegs)
3056         RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
3057     }
3058 
3059     // Report an error if there was an attempt to return FP values via XMM
3060     // registers.
3061     if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
3062       errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
3063       if (VA.getLocReg() == X86::XMM1)
3064         VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
3065       else
3066         VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3067     } else if (!Subtarget.hasSSE2() &&
3068                X86::FR64XRegClass.contains(VA.getLocReg()) &&
3069                CopyVT == MVT::f64) {
3070       errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
3071       if (VA.getLocReg() == X86::XMM1)
3072         VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
3073       else
3074         VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3075     }
3076 
3077     // If we prefer to use the value in xmm registers, copy it out as f80 and
3078     // use a truncate to move it from fp stack reg to xmm reg.
3079     bool RoundAfterCopy = false;
3080     if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
3081         isScalarFPTypeInSSEReg(VA.getValVT())) {
3082       if (!Subtarget.hasX87())
3083         report_fatal_error("X87 register return with X87 disabled");
3084       CopyVT = MVT::f80;
3085       RoundAfterCopy = (CopyVT != VA.getLocVT());
3086     }
3087 
3088     SDValue Val;
3089     if (VA.needsCustom()) {
3090       assert(VA.getValVT() == MVT::v64i1 &&
3091              "Currently the only custom case is when we split v64i1 to 2 regs");
3092       Val =
3093           getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
3094     } else {
3095       Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
3096                   .getValue(1);
3097       Val = Chain.getValue(0);
3098       InFlag = Chain.getValue(2);
3099     }
3100 
3101     if (RoundAfterCopy)
3102       Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
3103                         // This truncation won't change the value.
3104                         DAG.getIntPtrConstant(1, dl));
3105 
3106     if (VA.isExtInLoc()) {
3107       if (VA.getValVT().isVector() &&
3108           VA.getValVT().getScalarType() == MVT::i1 &&
3109           ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
3110            (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
3111         // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
3112         Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
3113       } else
3114         Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
3115     }
3116 
3117     if (VA.getLocInfo() == CCValAssign::BCvt)
3118       Val = DAG.getBitcast(VA.getValVT(), Val);
3119 
3120     InVals.push_back(Val);
3121   }
3122 
3123   return Chain;
3124 }
3125 
3126 //===----------------------------------------------------------------------===//
3127 //                C & StdCall & Fast Calling Convention implementation
3128 //===----------------------------------------------------------------------===//
3129 //  The StdCall calling convention is the standard for many Windows API
3130 //  routines. It differs from the C calling convention only slightly: the
3131 //  callee cleans up the stack instead of the caller, and symbols are
3132 //  decorated differently. It doesn't support any vector arguments.
3133 //  For info on fast calling convention see Fast Calling Convention (tail call)
3134 //  implementation LowerX86_32FastCCCallTo.
3135 
3136 /// CallIsStructReturn - Determines whether a call uses struct return
3137 /// semantics.
3138 enum StructReturnType {
3139   NotStructReturn,
3140   RegStructReturn,
3141   StackStructReturn
3142 };
3143 static StructReturnType
3144 callIsStructReturn(ArrayRef<ISD::OutputArg> Outs, bool IsMCU) {
3145   if (Outs.empty())
3146     return NotStructReturn;
3147 
3148   const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
3149   if (!Flags.isSRet())
3150     return NotStructReturn;
3151   if (Flags.isInReg() || IsMCU)
3152     return RegStructReturn;
3153   return StackStructReturn;
3154 }
3155 
3156 /// Determines whether a function uses struct return semantics.
3157 static StructReturnType
3158 argsAreStructReturn(ArrayRef<ISD::InputArg> Ins, bool IsMCU) {
3159   if (Ins.empty())
3160     return NotStructReturn;
3161 
3162   const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
3163   if (!Flags.isSRet())
3164     return NotStructReturn;
3165   if (Flags.isInReg() || IsMCU)
3166     return RegStructReturn;
3167   return StackStructReturn;
3168 }
3169 
3170 /// Make a copy of an aggregate at address specified by "Src" to address
3171 /// "Dst" with size and alignment information specified by the specific
3172 /// parameter attribute. The copy will be passed as a byval function parameter.
3173 static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
3174                                          SDValue Chain, ISD::ArgFlagsTy Flags,
3175                                          SelectionDAG &DAG, const SDLoc &dl) {
3176   SDValue SizeNode = DAG.getIntPtrConstant(Flags.getByValSize(), dl);
3177 
3178   return DAG.getMemcpy(
3179       Chain, dl, Dst, Src, SizeNode, Flags.getNonZeroByValAlign(),
3180       /*isVolatile*/ false, /*AlwaysInline=*/true,
3181       /*isTailCall*/ false, MachinePointerInfo(), MachinePointerInfo());
3182 }
3183 
3184 /// Return true if the calling convention is one that we can guarantee TCO for.
3185 static bool canGuaranteeTCO(CallingConv::ID CC) {
3186   return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
3187           CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
3188           CC == CallingConv::HHVM || CC == CallingConv::Tail ||
3189           CC == CallingConv::SwiftTail);
3190 }
3191 
3192 /// Return true if we might ever do TCO for calls with this calling convention.
3193 static bool mayTailCallThisCC(CallingConv::ID CC) {
3194   switch (CC) {
3195   // C calling conventions:
3196   case CallingConv::C:
3197   case CallingConv::Win64:
3198   case CallingConv::X86_64_SysV:
3199   // Callee pop conventions:
3200   case CallingConv::X86_ThisCall:
3201   case CallingConv::X86_StdCall:
3202   case CallingConv::X86_VectorCall:
3203   case CallingConv::X86_FastCall:
3204   // Swift:
3205   case CallingConv::Swift:
3206     return true;
3207   default:
3208     return canGuaranteeTCO(CC);
3209   }
3210 }
3211 
3212 /// Return true if the function is being made into a tailcall target by
3213 /// changing its ABI.
3214 static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
3215   return (GuaranteedTailCallOpt && canGuaranteeTCO(CC)) ||
3216          CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
3217 }
3218 
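/// Returns true if this call could plausibly be emitted as a tail call, based
/// only on the 'tail' marker and the calling convention; the full legality
/// checks are performed later during call lowering.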
3219 bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
3220   if (!CI->isTailCall())
3221     return false;
3222 
3223   CallingConv::ID CalleeCC = CI->getCallingConv();
3224   if (!mayTailCallThisCC(CalleeCC))
3225     return false;
3226 
3227   return true;
3228 }
3229 
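/// Lowers an incoming argument that was passed in memory: creates (or reuses)
/// a fixed stack object covering it and loads the value, with special handling
/// for byval arguments and copy-elision candidates.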
3230 SDValue
3231 X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
3232                                     const SmallVectorImpl<ISD::InputArg> &Ins,
3233                                     const SDLoc &dl, SelectionDAG &DAG,
3234                                     const CCValAssign &VA,
3235                                     MachineFrameInfo &MFI, unsigned i) const {
3236   // Create the nodes corresponding to a load from this parameter slot.
3237   ISD::ArgFlagsTy Flags = Ins[i].Flags;
3238   bool AlwaysUseMutable = shouldGuaranteeTCO(
3239       CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
3240   bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
3241   EVT ValVT;
3242   MVT PtrVT = getPointerTy(DAG.getDataLayout());
3243 
3244   // If the value is passed by pointer, we have its address instead of the value
3245   // itself. No need to extend if the mask value and location share the same
3246   // absolute size.
3247   bool ExtendedInMem =
3248       VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
3249       VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
3250 
3251   if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
3252     ValVT = VA.getLocVT();
3253   else
3254     ValVT = VA.getValVT();
3255 
3256   // FIXME: For now, all byval parameter objects are marked mutable. This can be
3257   // changed with more analysis.
3258   // In case of tail call optimization mark all arguments mutable, since they
3259   // could be overwritten by the lowering of arguments in case of a tail call.
3260   if (Flags.isByVal()) {
3261     unsigned Bytes = Flags.getByValSize();
3262     if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
3263 
3264     // FIXME: For now, all byval parameter objects are marked as aliasing. This
3265     // can be improved with deeper analysis.
3266     int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable,
3267                                    /*isAliased=*/true);
3268     return DAG.getFrameIndex(FI, PtrVT);
3269   }
3270 
3271   EVT ArgVT = Ins[i].ArgVT;
3272 
3273   // If this is a vector that has been split into multiple parts, and the
3274   // scalar size of the parts doesn't match the vector element size, then we can't
3275   // elide the copy. The parts will have padding between them instead of being
3276   // packed like a vector.
3277   bool ScalarizedAndExtendedVector =
3278       ArgVT.isVector() && !VA.getLocVT().isVector() &&
3279       VA.getLocVT().getSizeInBits() != ArgVT.getScalarSizeInBits();
3280 
3281   // This is an argument in memory. We might be able to perform copy elision.
3282   // If the argument is passed directly in memory without any extension, then we
3283   // can perform copy elision. Large vector types, for example, may be passed
3284   // indirectly by pointer.
3285   if (Flags.isCopyElisionCandidate() &&
3286       VA.getLocInfo() != CCValAssign::Indirect && !ExtendedInMem &&
3287       !ScalarizedAndExtendedVector) {
3288     SDValue PartAddr;
3289     if (Ins[i].PartOffset == 0) {
3290       // If this is a one-part value or the first part of a multi-part value,
3291       // create a stack object for the entire argument value type and return a
3292       // load from our portion of it. This assumes that if the first part of an
3293       // argument is in memory, the rest will also be in memory.
3294       int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
3295                                      /*IsImmutable=*/false);
3296       PartAddr = DAG.getFrameIndex(FI, PtrVT);
3297       return DAG.getLoad(
3298           ValVT, dl, Chain, PartAddr,
3299           MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3300     } else {
3301       // This is not the first piece of an argument in memory. See if there is
3302       // already a fixed stack object including this offset. If so, assume it
3303       // was created by the PartOffset == 0 branch above and create a load from
3304       // the appropriate offset into it.
3305       int64_t PartBegin = VA.getLocMemOffset();
3306       int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
3307       int FI = MFI.getObjectIndexBegin();
3308       for (; MFI.isFixedObjectIndex(FI); ++FI) {
3309         int64_t ObjBegin = MFI.getObjectOffset(FI);
3310         int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
3311         if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
3312           break;
3313       }
3314       if (MFI.isFixedObjectIndex(FI)) {
3315         SDValue Addr =
3316             DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
3317                         DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
3318         return DAG.getLoad(
3319             ValVT, dl, Chain, Addr,
3320             MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
3321                                               Ins[i].PartOffset));
3322       }
3323     }
3324   }
3325 
3326   int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
3327                                  VA.getLocMemOffset(), isImmutable);
3328 
3329   // Set SExt or ZExt flag.
3330   if (VA.getLocInfo() == CCValAssign::ZExt) {
3331     MFI.setObjectZExt(FI, true);
3332   } else if (VA.getLocInfo() == CCValAssign::SExt) {
3333     MFI.setObjectSExt(FI, true);
3334   }
3335 
3336   SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
3337   SDValue Val = DAG.getLoad(
3338       ValVT, dl, Chain, FIN,
3339       MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3340   return ExtendedInMem
3341              ? (VA.getValVT().isVector()
3342                     ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
3343                     : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
3344              : Val;
3345 }
3346 
3347 // FIXME: Get this from tablegen.
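// Returns the integer argument registers for the given 64-bit calling
// convention: RCX/RDX/R8/R9 for Win64, otherwise the SysV AMD64 set
// RDI/RSI/RDX/RCX/R8/R9.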
3348 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
3349                                                 const X86Subtarget &Subtarget) {
3350   assert(Subtarget.is64Bit());
3351 
3352   if (Subtarget.isCallingConvWin64(CallConv)) {
3353     static const MCPhysReg GPR64ArgRegsWin64[] = {
3354       X86::RCX, X86::RDX, X86::R8,  X86::R9
3355     };
3356     return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
3357   }
3358 
3359   static const MCPhysReg GPR64ArgRegs64Bit[] = {
3360     X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
3361   };
3362   return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
3363 }
3364 
3365 // FIXME: Get this from tablegen.
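// Returns the XMM registers that may carry vararg FP arguments for the given
// 64-bit calling convention; empty for Win64 (the XMMs are shadowed by GPRs)
// and when SSE is unavailable or implicit FP use is disallowed.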
3366 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
3367                                                 CallingConv::ID CallConv,
3368                                                 const X86Subtarget &Subtarget) {
3369   assert(Subtarget.is64Bit());
3370   if (Subtarget.isCallingConvWin64(CallConv)) {
3371     // The XMM registers which might contain var arg parameters are shadowed
3372     // in their paired GPR.  So we only need to save the GPRs to their home
3373     // slots.
3374     // TODO: __vectorcall will change this.
3375     return None;
3376   }
3377 
3378   const Function &F = MF.getFunction();
3379   bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
3380   bool isSoftFloat = Subtarget.useSoftFloat();
3381   assert(!(isSoftFloat && NoImplicitFloatOps) &&
3382          "SSE register cannot be used when SSE is disabled!");
3383   if (isSoftFloat || NoImplicitFloatOps || !Subtarget.hasSSE1())
3384     // Kernel mode asks for SSE to be disabled, so there are no XMM argument
3385     // registers.
3386     return None;
3387 
3388   static const MCPhysReg XMMArgRegs64Bit[] = {
3389     X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3390     X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3391   };
3392   return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
3393 }
3394 
3395 #ifndef NDEBUG
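// Debug-only helper used in asserts: checks that the argument locations are
// sorted by their original argument (value) number.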
3396 static bool isSortedByValueNo(ArrayRef<CCValAssign> ArgLocs) {
3397   return llvm::is_sorted(
3398       ArgLocs, [](const CCValAssign &A, const CCValAssign &B) -> bool {
3399         return A.getValNo() < B.getValNo();
3400       });
3401 }
3402 #endif
3403 
3404 namespace {
3405 /// This is a helper class for lowering variable arguments parameters.
3406 class VarArgsLoweringHelper {
3407 public:
3408   VarArgsLoweringHelper(X86MachineFunctionInfo *FuncInfo, const SDLoc &Loc,
3409                         SelectionDAG &DAG, const X86Subtarget &Subtarget,
3410                         CallingConv::ID CallConv, CCState &CCInfo)
3411       : FuncInfo(FuncInfo), DL(Loc), DAG(DAG), Subtarget(Subtarget),
3412         TheMachineFunction(DAG.getMachineFunction()),
3413         TheFunction(TheMachineFunction.getFunction()),
3414         FrameInfo(TheMachineFunction.getFrameInfo()),
3415         FrameLowering(*Subtarget.getFrameLowering()),
3416         TargLowering(DAG.getTargetLoweringInfo()), CallConv(CallConv),
3417         CCInfo(CCInfo) {}
3418 
3419   // Lower variable argument parameters.
3420   void lowerVarArgsParameters(SDValue &Chain, unsigned StackSize);
3421 
3422 private:
3423   void createVarArgAreaAndStoreRegisters(SDValue &Chain, unsigned StackSize);
3424 
3425   void forwardMustTailParameters(SDValue &Chain);
3426 
3427   bool is64Bit() const { return Subtarget.is64Bit(); }
3428   bool isWin64() const { return Subtarget.isCallingConvWin64(CallConv); }
3429 
3430   X86MachineFunctionInfo *FuncInfo;
3431   const SDLoc &DL;
3432   SelectionDAG &DAG;
3433   const X86Subtarget &Subtarget;
3434   MachineFunction &TheMachineFunction;
3435   const Function &TheFunction;
3436   MachineFrameInfo &FrameInfo;
3437   const TargetFrameLowering &FrameLowering;
3438   const TargetLowering &TargLowering;
3439   CallingConv::ID CallConv;
3440   CCState &CCInfo;
3441 };
3442 } // namespace
3443 
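// Rough sketch of the SysV x86-64 register save area built by
// createVarArgAreaAndStoreRegisters() below (offsets relative to
// RegSaveFrameIndex):
//   [  0 ..  47]  RDI, RSI, RDX, RCX, R8, R9   (6 GPRs * 8 bytes)
//   [ 48 .. 175]  XMM0 .. XMM7                 (8 XMM registers * 16 bytes)
// va_arg later indexes into this area using the gp_offset/fp_offset fields of
// the va_list.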
3444 void VarArgsLoweringHelper::createVarArgAreaAndStoreRegisters(
3445     SDValue &Chain, unsigned StackSize) {
3446   // If the function takes a variable number of arguments, make a frame index for
3447   // the start of the first vararg value... for expansion of llvm.va_start. We
3448   // can skip this if there are no va_start calls.
3449   if (is64Bit() || (CallConv != CallingConv::X86_FastCall &&
3450                     CallConv != CallingConv::X86_ThisCall)) {
3451     FuncInfo->setVarArgsFrameIndex(
3452         FrameInfo.CreateFixedObject(1, StackSize, true));
3453   }
3454 
3455   // Figure out if XMM registers are in use.
3456   assert(!(Subtarget.useSoftFloat() &&
3457            TheFunction.hasFnAttribute(Attribute::NoImplicitFloat)) &&
3458          "SSE register cannot be used when SSE is disabled!");
3459 
3460   // 64-bit calling conventions support varargs and register parameters, so we
3461   // have to do extra work to spill them in the prologue.
3462   if (is64Bit()) {
3463     // Find the first unallocated argument registers.
3464     ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
3465     ArrayRef<MCPhysReg> ArgXMMs =
3466         get64BitArgumentXMMs(TheMachineFunction, CallConv, Subtarget);
3467     unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
3468     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
3469 
3470     assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
3471            "SSE register cannot be used when SSE is disabled!");
3472 
3473     if (isWin64()) {
3474       // Get to the caller-allocated home save location.  Add 8 to account
3475       // for the return address.
3476       int HomeOffset = FrameLowering.getOffsetOfLocalArea() + 8;
3477       FuncInfo->setRegSaveFrameIndex(
3478           FrameInfo.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
3479       // Fixup to set vararg frame on shadow area (4 x i64).
3480       if (NumIntRegs < 4)
3481         FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
3482     } else {
3483       // For X86-64, if there are vararg parameters that are passed via
3484       // registers, then we must store them to their spots on the stack so
3485       // they may be loaded by dereferencing the result of va_next.
3486       FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
3487       FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
3488       FuncInfo->setRegSaveFrameIndex(FrameInfo.CreateStackObject(
3489           ArgGPRs.size() * 8 + ArgXMMs.size() * 16, Align(16), false));
3490     }
3491 
3492     SmallVector<SDValue, 6>
3493         LiveGPRs; // SDValues for GPR argument registers holding live input values
3494     SmallVector<SDValue, 8> LiveXMMRegs; // SDValues for XMM argument registers
3495                                          // holding live input values
3496     SDValue ALVal; // if applicable, the SDValue holding the %al register
3497 
3498     // Gather all the live in physical registers.
3499     for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
3500       Register GPR = TheMachineFunction.addLiveIn(Reg, &X86::GR64RegClass);
3501       LiveGPRs.push_back(DAG.getCopyFromReg(Chain, DL, GPR, MVT::i64));
3502     }
3503     const auto &AvailableXmms = ArgXMMs.slice(NumXMMRegs);
3504     if (!AvailableXmms.empty()) {
3505       Register AL = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
3506       ALVal = DAG.getCopyFromReg(Chain, DL, AL, MVT::i8);
3507       for (MCPhysReg Reg : AvailableXmms) {
3508         // The fast register allocator spills virtual registers at basic
3509         // block boundaries, which leads to uses of XMM registers outside
3510         // the check for %al. Pass physical registers to
3511         // VASTART_SAVE_XMM_REGS to avoid unnecessary spilling.
3512         TheMachineFunction.getRegInfo().addLiveIn(Reg);
3513         LiveXMMRegs.push_back(DAG.getRegister(Reg, MVT::v4f32));
3514       }
3515     }
3516 
3517     // Store the integer parameter registers.
3518     SmallVector<SDValue, 8> MemOps;
3519     SDValue RSFIN =
3520         DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
3521                           TargLowering.getPointerTy(DAG.getDataLayout()));
3522     unsigned Offset = FuncInfo->getVarArgsGPOffset();
3523     for (SDValue Val : LiveGPRs) {
3524       SDValue FIN = DAG.getNode(ISD::ADD, DL,
3525                                 TargLowering.getPointerTy(DAG.getDataLayout()),
3526                                 RSFIN, DAG.getIntPtrConstant(Offset, DL));
3527       SDValue Store =
3528           DAG.getStore(Val.getValue(1), DL, Val, FIN,
3529                        MachinePointerInfo::getFixedStack(
3530                            DAG.getMachineFunction(),
3531                            FuncInfo->getRegSaveFrameIndex(), Offset));
3532       MemOps.push_back(Store);
3533       Offset += 8;
3534     }
3535 
3536     // Now store the XMM (fp + vector) parameter registers.
3537     if (!LiveXMMRegs.empty()) {
3538       SmallVector<SDValue, 12> SaveXMMOps;
3539       SaveXMMOps.push_back(Chain);
3540       SaveXMMOps.push_back(ALVal);
3541       SaveXMMOps.push_back(
3542           DAG.getTargetConstant(FuncInfo->getRegSaveFrameIndex(), DL, MVT::i32));
3543       SaveXMMOps.push_back(
3544           DAG.getTargetConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32));
3545       llvm::append_range(SaveXMMOps, LiveXMMRegs);
3546       MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, DL,
3547                                    MVT::Other, SaveXMMOps));
3548     }
3549 
3550     if (!MemOps.empty())
3551       Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
3552   }
3553 }
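// Worked example (illustrative): for `int sum(int n, ...)` on SysV x86-64 the
// fixed argument consumes one integer register, so NumIntRegs == 1 and
// NumXMMRegs == 0. The code above then records VarArgsGPOffset = 8 and
// VarArgsFPOffset = 48, and spills RSI, RDX, RCX, R8, R9 plus, when %al is
// non-zero, XMM0..XMM7 into the 176-byte register save area.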
3554 
3555 void VarArgsLoweringHelper::forwardMustTailParameters(SDValue &Chain) {
3556   // Find the largest legal vector type.
3557   MVT VecVT = MVT::Other;
3558   // FIXME: Only some x86_32 calling conventions support AVX512.
3559   if (Subtarget.useAVX512Regs() &&
3560       (is64Bit() || (CallConv == CallingConv::X86_VectorCall ||
3561                      CallConv == CallingConv::Intel_OCL_BI)))
3562     VecVT = MVT::v16f32;
3563   else if (Subtarget.hasAVX())
3564     VecVT = MVT::v8f32;
3565   else if (Subtarget.hasSSE2())
3566     VecVT = MVT::v4f32;
3567 
3568   // We forward some GPRs and some vector types.
3569   SmallVector<MVT, 2> RegParmTypes;
3570   MVT IntVT = is64Bit() ? MVT::i64 : MVT::i32;
3571   RegParmTypes.push_back(IntVT);
3572   if (VecVT != MVT::Other)
3573     RegParmTypes.push_back(VecVT);
3574 
3575   // Compute the set of forwarded registers. The rest are scratch.
3576   SmallVectorImpl<ForwardedRegister> &Forwards =
3577       FuncInfo->getForwardedMustTailRegParms();
3578   CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
3579 
3580   // Forward AL for SysV x86_64 targets, since it is used for varargs.
3581   if (is64Bit() && !isWin64() && !CCInfo.isAllocated(X86::AL)) {
3582     Register ALVReg = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
3583     Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
3584   }
3585 
3586   // Copy all forwards from physical to virtual registers.
3587   for (ForwardedRegister &FR : Forwards) {
3588     // FIXME: Can we use a less constrained schedule?
3589     SDValue RegVal = DAG.getCopyFromReg(Chain, DL, FR.VReg, FR.VT);
3590     FR.VReg = TheMachineFunction.getRegInfo().createVirtualRegister(
3591         TargLowering.getRegClassFor(FR.VT));
3592     Chain = DAG.getCopyToReg(Chain, DL, FR.VReg, RegVal);
3593   }
3594 }
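// Illustrative example of the musttail case handled above: a variadic
// forwarding thunk such as
//   declare void @target(i8*, ...)
//   define void @thunk(i8* %p, ...) {
//     musttail call void (i8*, ...) @target(i8* %p, ...)
//     ret void
//   }
// must keep every register that could carry a variadic argument intact, so
// the still-unallocated GPRs and vector registers (plus %al on SysV) are
// copied into virtual registers here and re-emitted at the call site in
// LowerCall.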
3595 
3596 void VarArgsLoweringHelper::lowerVarArgsParameters(SDValue &Chain,
3597                                                    unsigned StackSize) {
3598   // Set FrameIndex to the 0xAAAAAAA sentinel value to mark the unset state.
3599   // If necessary, it will be set to the correct value later.
3600   FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
3601   FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
3602 
3603   if (FrameInfo.hasVAStart())
3604     createVarArgAreaAndStoreRegisters(Chain, StackSize);
3605 
3606   if (FrameInfo.hasMustTailInVarArgFunc())
3607     forwardMustTailParameters(Chain);
3608 }
3609 
3610 SDValue X86TargetLowering::LowerFormalArguments(
3611     SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
3612     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3613     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3614   MachineFunction &MF = DAG.getMachineFunction();
3615   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
3616 
3617   const Function &F = MF.getFunction();
3618   if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() &&
3619       F.getName() == "main")
3620     FuncInfo->setForceFramePointer(true);
3621 
3622   MachineFrameInfo &MFI = MF.getFrameInfo();
3623   bool Is64Bit = Subtarget.is64Bit();
3624   bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
3625 
3626   assert(
3627       !(IsVarArg && canGuaranteeTCO(CallConv)) &&
3628       "Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
3629 
3630   // Assign locations to all of the incoming arguments.
3631   SmallVector<CCValAssign, 16> ArgLocs;
3632   CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
3633 
3634   // Allocate shadow area for Win64.
3635   if (IsWin64)
3636     CCInfo.AllocateStack(32, Align(8));
3637 
3638   CCInfo.AnalyzeArguments(Ins, CC_X86);
3639 
3640   // In vectorcall calling convention a second pass is required for the HVA
3641   // types.
3642   if (CallingConv::X86_VectorCall == CallConv) {
3643     CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
3644   }
3645 
3646   // The next loop assumes that the locations are in the same order as the
3647   // input arguments.
3648   assert(isSortedByValueNo(ArgLocs) &&
3649          "Argument Location list must be sorted before lowering");
3650 
3651   SDValue ArgValue;
3652   for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
3653        ++I, ++InsIndex) {
3654     assert(InsIndex < Ins.size() && "Invalid Ins index");
3655     CCValAssign &VA = ArgLocs[I];
3656 
3657     if (VA.isRegLoc()) {
3658       EVT RegVT = VA.getLocVT();
3659       if (VA.needsCustom()) {
3660         assert(
3661             VA.getValVT() == MVT::v64i1 &&
3662             "Currently the only custom case is when we split v64i1 to 2 regs");
3663 
3664         // v64i1 values, in regcall calling convention, that are
3665         // compiled to 32 bit arch, are split up into two registers.
3666         ArgValue =
3667             getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
3668       } else {
3669         const TargetRegisterClass *RC;
3670         if (RegVT == MVT::i8)
3671           RC = &X86::GR8RegClass;
3672         else if (RegVT == MVT::i16)
3673           RC = &X86::GR16RegClass;
3674         else if (RegVT == MVT::i32)
3675           RC = &X86::GR32RegClass;
3676         else if (Is64Bit && RegVT == MVT::i64)
3677           RC = &X86::GR64RegClass;
3678         else if (RegVT == MVT::f32)
3679           RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
3680         else if (RegVT == MVT::f64)
3681           RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
3682         else if (RegVT == MVT::f80)
3683           RC = &X86::RFP80RegClass;
3684         else if (RegVT == MVT::f128)
3685           RC = &X86::VR128RegClass;
3686         else if (RegVT.is512BitVector())
3687           RC = &X86::VR512RegClass;
3688         else if (RegVT.is256BitVector())
3689           RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
3690         else if (RegVT.is128BitVector())
3691           RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
3692         else if (RegVT == MVT::x86mmx)
3693           RC = &X86::VR64RegClass;
3694         else if (RegVT == MVT::v1i1)
3695           RC = &X86::VK1RegClass;
3696         else if (RegVT == MVT::v8i1)
3697           RC = &X86::VK8RegClass;
3698         else if (RegVT == MVT::v16i1)
3699           RC = &X86::VK16RegClass;
3700         else if (RegVT == MVT::v32i1)
3701           RC = &X86::VK32RegClass;
3702         else if (RegVT == MVT::v64i1)
3703           RC = &X86::VK64RegClass;
3704         else
3705           llvm_unreachable("Unknown argument type!");
3706 
3707         Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
3708         ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
3709       }
3710 
3711       // If this is an 8 or 16-bit value, it is really passed promoted to 32
3712       // bits.  Insert an assert[sz]ext to capture this, then truncate to the
3713       // right size.
3714       if (VA.getLocInfo() == CCValAssign::SExt)
3715         ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
3716                                DAG.getValueType(VA.getValVT()));
3717       else if (VA.getLocInfo() == CCValAssign::ZExt)
3718         ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
3719                                DAG.getValueType(VA.getValVT()));
3720       else if (VA.getLocInfo() == CCValAssign::BCvt)
3721         ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
3722 
3723       if (VA.isExtInLoc()) {
3724         // Handle MMX values passed in XMM regs.
3725         if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
3726           ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
3727         else if (VA.getValVT().isVector() &&
3728                  VA.getValVT().getScalarType() == MVT::i1 &&
3729                  ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
3730                   (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
3731           // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
3732           ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
3733         } else
3734           ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
3735       }
3736     } else {
3737       assert(VA.isMemLoc());
3738       ArgValue =
3739           LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
3740     }
3741 
3742     // If value is passed via pointer - do a load.
3743     if (VA.getLocInfo() == CCValAssign::Indirect && !Ins[I].Flags.isByVal())
3744       ArgValue =
3745           DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
3746 
3747     InVals.push_back(ArgValue);
3748   }
3749 
3750   for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
3751     if (Ins[I].Flags.isSwiftAsync()) {
3752       auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
3753       if (Subtarget.is64Bit())
3754         X86FI->setHasSwiftAsyncContext(true);
3755       else {
3756         int FI = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
3757         X86FI->setSwiftAsyncContextFrameIdx(FI);
3758         SDValue St = DAG.getStore(DAG.getEntryNode(), dl, InVals[I],
3759                                   DAG.getFrameIndex(FI, MVT::i32),
3760                                   MachinePointerInfo::getFixedStack(MF, FI));
3761         Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, St, Chain);
3762       }
3763     }
3764 
3765     // Swift calling convention does not require we copy the sret argument
3766     // into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
3767     if (CallConv == CallingConv::Swift || CallConv == CallingConv::SwiftTail)
3768       continue;
3769 
3770     // All x86 ABIs require that for returning structs by value we copy the
3771     // sret argument into %rax/%eax (depending on ABI) for the return. Save
3772     // the argument into a virtual register so that we can access it from the
3773     // return points.
3774     if (Ins[I].Flags.isSRet()) {
3775       Register Reg = FuncInfo->getSRetReturnReg();
3776       if (!Reg) {
3777         MVT PtrTy = getPointerTy(DAG.getDataLayout());
3778         Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
3779         FuncInfo->setSRetReturnReg(Reg);
3780       }
3781       SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
3782       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
3783       break;
3784     }
3785   }
3786 
3787   unsigned StackSize = CCInfo.getNextStackOffset();
3788   // Align stack specially for tail calls.
3789   if (shouldGuaranteeTCO(CallConv,
3790                          MF.getTarget().Options.GuaranteedTailCallOpt))
3791     StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
3792 
3793   if (IsVarArg)
3794     VarArgsLoweringHelper(FuncInfo, dl, DAG, Subtarget, CallConv, CCInfo)
3795         .lowerVarArgsParameters(Chain, StackSize);
3796 
3797   // Some CCs need callee pop.
3798   if (X86::isCalleePop(CallConv, Is64Bit, IsVarArg,
3799                        MF.getTarget().Options.GuaranteedTailCallOpt)) {
3800     FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
3801   } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
3802     // X86 interrupts must pop the error code (and the alignment padding) if
3803     // present.
3804     FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
3805   } else {
3806     FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
3807     // If this is an sret function, the return should pop the hidden pointer.
3808     if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3809         !Subtarget.getTargetTriple().isOSMSVCRT() &&
3810         argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
3811       FuncInfo->setBytesToPopOnReturn(4);
3812   }
3813 
3814   if (!Is64Bit) {
3815     // RegSaveFrameIndex is X86-64 only.
3816     FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
3817   }
3818 
3819   FuncInfo->setArgumentStackSize(StackSize);
3820 
3821   if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
3822     EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn());
3823     if (Personality == EHPersonality::CoreCLR) {
3824       assert(Is64Bit);
3825       // TODO: Add a mechanism to frame lowering that will allow us to indicate
3826       // that we'd prefer this slot be allocated towards the bottom of the frame
3827       // (i.e. near the stack pointer after allocating the frame).  Every
3828       // funclet needs a copy of this slot in its (mostly empty) frame, and the
3829       // offset from the bottom of this and each funclet's frame must be the
3830       // same, so the size of funclets' (mostly empty) frames is dictated by
3831       // how far this slot is from the bottom (since they allocate just enough
3832       // space to accommodate holding this slot at the correct offset).
3833       int PSPSymFI = MFI.CreateStackObject(8, Align(8), /*isSpillSlot=*/false);
3834       EHInfo->PSPSymFrameIdx = PSPSymFI;
3835     }
3836   }
3837 
3838   if (CallConv == CallingConv::X86_RegCall ||
3839       F.hasFnAttribute("no_caller_saved_registers")) {
3840     MachineRegisterInfo &MRI = MF.getRegInfo();
3841     for (std::pair<Register, Register> Pair : MRI.liveins())
3842       MRI.disableCalleeSavedRegister(Pair.first);
3843   }
3844 
3845   return Chain;
3846 }
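// Worked example (illustrative) of the callee-pop logic above: a 32-bit
// stdcall function `void __stdcall f(int a, int b)` pops its 8 argument bytes
// on return (BytesToPopOnReturn == StackSize == 8), while a cdecl function
// pops nothing unless it returns a struct through a hidden on-stack sret
// pointer on a non-MSVCRT target, in which case it pops just those 4 bytes.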
3847 
3848 SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
3849                                             SDValue Arg, const SDLoc &dl,
3850                                             SelectionDAG &DAG,
3851                                             const CCValAssign &VA,
3852                                             ISD::ArgFlagsTy Flags,
3853                                             bool isByVal) const {
3854   unsigned LocMemOffset = VA.getLocMemOffset();
3855   SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
3856   PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3857                        StackPtr, PtrOff);
3858   if (isByVal)
3859     return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
3860 
3861   return DAG.getStore(
3862       Chain, dl, Arg, PtrOff,
3863       MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
3864 }
3865 
3866 /// Emit a load of the return address if tail call
3867 /// optimization is performed and it is required.
3868 SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
3869     SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
3870     bool Is64Bit, int FPDiff, const SDLoc &dl) const {
3871   // Adjust the Return address stack slot.
3872   EVT VT = getPointerTy(DAG.getDataLayout());
3873   OutRetAddr = getReturnAddressFrameIndex(DAG);
3874 
3875   // Load the "old" Return address.
3876   OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
3877   return SDValue(OutRetAddr.getNode(), 1);
3878 }
3879 
3880 /// Emit a store of the return address if tail call
3881 /// optimization is performed and it is required (FPDiff!=0).
3882 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
3883                                         SDValue Chain, SDValue RetAddrFrIdx,
3884                                         EVT PtrVT, unsigned SlotSize,
3885                                         int FPDiff, const SDLoc &dl) {
3886   // Store the return address to the appropriate stack slot.
3887   if (!FPDiff) return Chain;
3888   // Calculate the new stack slot for the return address.
3889   int NewReturnAddrFI =
3890     MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
3891                                          false);
3892   SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
3893   Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
3894                        MachinePointerInfo::getFixedStack(
3895                            DAG.getMachineFunction(), NewReturnAddrFI));
3896   return Chain;
3897 }
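// Illustrative numbers: with 8-byte slots, a caller that pops 0 argument bytes
// on its own return (NumBytesCallerPushed == 0) and tail-calls a function
// needing 16 bytes of stack arguments gets FPDiff = 0 - 16 = -16, so the
// return address is re-stored 16 bytes lower (at offset FPDiff - SlotSize),
// making room for the larger outgoing argument area described in the stack
// layout comment further down in this file.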
3898 
3899 /// Returns a vector_shuffle mask for a movs{s|d} or movd
3900 /// operation of the specified width.
3901 static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
3902                        SDValue V2) {
3903   unsigned NumElems = VT.getVectorNumElements();
3904   SmallVector<int, 8> Mask;
3905   Mask.push_back(NumElems);
3906   for (unsigned i = 1; i != NumElems; ++i)
3907     Mask.push_back(i);
3908   return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
3909 }
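// Example (illustrative): for VT == MVT::v4f32 the function above builds the
// shuffle mask <4, 1, 2, 3>, i.e. element 0 is taken from V2 and elements
// 1..3 from V1, matching MOVSS semantics (and MOVSD/MOVQ for the two-element
// types, where the mask is <2, 1>).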
3910 
3911 SDValue
3912 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
3913                              SmallVectorImpl<SDValue> &InVals) const {
3914   SelectionDAG &DAG                     = CLI.DAG;
3915   SDLoc &dl                             = CLI.DL;
3916   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
3917   SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
3918   SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
3919   SDValue Chain                         = CLI.Chain;
3920   SDValue Callee                        = CLI.Callee;
3921   CallingConv::ID CallConv              = CLI.CallConv;
3922   bool &isTailCall                      = CLI.IsTailCall;
3923   bool isVarArg                         = CLI.IsVarArg;
3924   const auto *CB                        = CLI.CB;
3925 
3926   MachineFunction &MF = DAG.getMachineFunction();
3927   bool Is64Bit        = Subtarget.is64Bit();
3928   bool IsWin64        = Subtarget.isCallingConvWin64(CallConv);
3929   StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
3930   bool IsSibcall      = false;
3931   bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt ||
3932       CallConv == CallingConv::Tail || CallConv == CallingConv::SwiftTail;
3933   X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
3934   bool HasNCSR = (CB && isa<CallInst>(CB) &&
3935                   CB->hasFnAttr("no_caller_saved_registers"));
3936   bool HasNoCfCheck = (CB && CB->doesNoCfCheck());
3937   bool IsIndirectCall = (CB && isa<CallInst>(CB) && CB->isIndirectCall());
3938   const Module *M = MF.getMMI().getModule();
3939   Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
3940 
3941   MachineFunction::CallSiteInfo CSInfo;
3942   if (CallConv == CallingConv::X86_INTR)
3943     report_fatal_error("X86 interrupts may not be called directly");
3944 
3945   if (Subtarget.isPICStyleGOT() && !IsGuaranteeTCO) {
3946     // If we are using a GOT, disable tail calls to external symbols with
3947     // default visibility. Tail calling such a symbol requires using a GOT
3948     // relocation, which forces early binding of the symbol. This breaks code
3949     // that requires lazy function symbol resolution. Using musttail or
3950     // GuaranteedTailCallOpt will override this.
3951     GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3952     if (!G || (!G->getGlobal()->hasLocalLinkage() &&
3953                G->getGlobal()->hasDefaultVisibility()))
3954       isTailCall = false;
3955   }
3956 
3957   bool IsMustTail = CLI.CB && CLI.CB->isMustTailCall();
3958   if (IsMustTail) {
3959     // Force this to be a tail call.  The verifier rules are enough to ensure
3960     // that we can lower this successfully without moving the return address
3961     // around.
3962     isTailCall = true;
3963   } else if (isTailCall) {
3964     // Check if it's really possible to do a tail call.
3965     isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
3966                     isVarArg, SR != NotStructReturn,
3967                     MF.getFunction().hasStructRetAttr(), CLI.RetTy,
3968                     Outs, OutVals, Ins, DAG);
3969 
3970     // Sibcalls are automatically detected tailcalls which do not require
3971     // ABI changes.
3972     if (!IsGuaranteeTCO && isTailCall)
3973       IsSibcall = true;
3974 
3975     if (isTailCall)
3976       ++NumTailCalls;
3977   }
3978 
3979   assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
3980          "Var args not supported with calling convention fastcc, ghc or hipe");
3981 
3982   // Analyze operands of the call, assigning locations to each operand.
3983   SmallVector<CCValAssign, 16> ArgLocs;
3984   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
3985 
3986   // Allocate shadow area for Win64.
3987   if (IsWin64)
3988     CCInfo.AllocateStack(32, Align(8));
3989 
3990   CCInfo.AnalyzeArguments(Outs, CC_X86);
3991 
3992   // In vectorcall calling convention a second pass is required for the HVA
3993   // types.
3994   if (CallingConv::X86_VectorCall == CallConv) {
3995     CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
3996   }
3997 
3998   // Get a count of how many bytes are to be pushed on the stack.
3999   unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
4000   if (IsSibcall)
4001     // This is a sibcall. The memory operands are already available in the
4002     // caller's incoming argument space, which its own caller allocated.
4003     NumBytes = 0;
4004   else if (IsGuaranteeTCO && canGuaranteeTCO(CallConv))
4005     NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
4006 
4007   int FPDiff = 0;
4008   if (isTailCall && !IsSibcall && !IsMustTail) {
4009     // Lower arguments at fp - stackoffset + fpdiff.
4010     unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
4011 
4012     FPDiff = NumBytesCallerPushed - NumBytes;
4013 
4014     // Set the delta of movement of the returnaddr stackslot.
4015     // But only set if delta is greater than previous delta.
4016     if (FPDiff < X86Info->getTCReturnAddrDelta())
4017       X86Info->setTCReturnAddrDelta(FPDiff);
4018   }
4019 
4020   unsigned NumBytesToPush = NumBytes;
4021   unsigned NumBytesToPop = NumBytes;
4022 
4023   // If we have an inalloca argument, all stack space has already been allocated
4024   // for us and will be right at the top of the stack.  We don't support multiple
4025   // arguments passed in memory when using inalloca.
4026   if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
4027     NumBytesToPush = 0;
4028     if (!ArgLocs.back().isMemLoc())
4029       report_fatal_error("cannot use inalloca attribute on a register "
4030                          "parameter");
4031     if (ArgLocs.back().getLocMemOffset() != 0)
4032       report_fatal_error("any parameter with the inalloca attribute must be "
4033                          "the only memory argument");
4034   } else if (CLI.IsPreallocated) {
4035     assert(ArgLocs.back().isMemLoc() &&
4036            "cannot use preallocated attribute on a register "
4037            "parameter");
4038     SmallVector<size_t, 4> PreallocatedOffsets;
4039     for (size_t i = 0; i < CLI.OutVals.size(); ++i) {
4040       if (CLI.CB->paramHasAttr(i, Attribute::Preallocated)) {
4041         PreallocatedOffsets.push_back(ArgLocs[i].getLocMemOffset());
4042       }
4043     }
4044     auto *MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
4045     size_t PreallocatedId = MFI->getPreallocatedIdForCallSite(CLI.CB);
4046     MFI->setPreallocatedStackSize(PreallocatedId, NumBytes);
4047     MFI->setPreallocatedArgOffsets(PreallocatedId, PreallocatedOffsets);
4048     NumBytesToPush = 0;
4049   }
4050 
4051   if (!IsSibcall && !IsMustTail)
4052     Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
4053                                  NumBytes - NumBytesToPush, dl);
4054 
4055   SDValue RetAddrFrIdx;
4056   // Load return address for tail calls.
4057   if (isTailCall && FPDiff)
4058     Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
4059                                     Is64Bit, FPDiff, dl);
4060 
4061   SmallVector<std::pair<Register, SDValue>, 8> RegsToPass;
4062   SmallVector<SDValue, 8> MemOpChains;
4063   SDValue StackPtr;
4064 
4065   // The next loop assumes that the locations are in the same order as the
4066   // input arguments.
4067   assert(isSortedByValueNo(ArgLocs) &&
4068          "Argument Location list must be sorted before lowering");
4069 
4070   // Walk the register/memloc assignments, inserting copies/loads.  In the case
4071   // of tail call optimization, arguments are handled later.
4072   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4073   for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
4074        ++I, ++OutIndex) {
4075     assert(OutIndex < Outs.size() && "Invalid Out index");
4076     // Skip inalloca/preallocated arguments, they have already been written.
4077     ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
4078     if (Flags.isInAlloca() || Flags.isPreallocated())
4079       continue;
4080 
4081     CCValAssign &VA = ArgLocs[I];
4082     EVT RegVT = VA.getLocVT();
4083     SDValue Arg = OutVals[OutIndex];
4084     bool isByVal = Flags.isByVal();
4085 
4086     // Promote the value if needed.
4087     switch (VA.getLocInfo()) {
4088     default: llvm_unreachable("Unknown loc info!");
4089     case CCValAssign::Full: break;
4090     case CCValAssign::SExt:
4091       Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
4092       break;
4093     case CCValAssign::ZExt:
4094       Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
4095       break;
4096     case CCValAssign::AExt:
4097       if (Arg.getValueType().isVector() &&
4098           Arg.getValueType().getVectorElementType() == MVT::i1)
4099         Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
4100       else if (RegVT.is128BitVector()) {
4101         // Special case: passing MMX values in XMM registers.
4102         Arg = DAG.getBitcast(MVT::i64, Arg);
4103         Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
4104         Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
4105       } else
4106         Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
4107       break;
4108     case CCValAssign::BCvt:
4109       Arg = DAG.getBitcast(RegVT, Arg);
4110       break;
4111     case CCValAssign::Indirect: {
4112       if (isByVal) {
4113         // Memcpy the argument to a temporary stack slot to prevent
4114         // the caller from seeing any modifications the callee may make
4115         // as guaranteed by the `byval` attribute.
4116         int FrameIdx = MF.getFrameInfo().CreateStackObject(
4117             Flags.getByValSize(),
4118             std::max(Align(16), Flags.getNonZeroByValAlign()), false);
4119         SDValue StackSlot =
4120             DAG.getFrameIndex(FrameIdx, getPointerTy(DAG.getDataLayout()));
4121         Chain =
4122             CreateCopyOfByValArgument(Arg, StackSlot, Chain, Flags, DAG, dl);
4123         // From now on treat this as a regular pointer
4124         Arg = StackSlot;
4125         isByVal = false;
4126       } else {
4127         // Store the argument.
4128         SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
4129         int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
4130         Chain = DAG.getStore(
4131             Chain, dl, Arg, SpillSlot,
4132             MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
4133         Arg = SpillSlot;
4134       }
4135       break;
4136     }
4137     }
4138 
4139     if (VA.needsCustom()) {
4140       assert(VA.getValVT() == MVT::v64i1 &&
4141              "Currently the only custom case is when we split v64i1 to 2 regs");
4142       // Split v64i1 value into two registers
4143       Passv64i1ArgInRegs(dl, DAG, Arg, RegsToPass, VA, ArgLocs[++I], Subtarget);
4144     } else if (VA.isRegLoc()) {
4145       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
4146       const TargetOptions &Options = DAG.getTarget().Options;
4147       if (Options.EmitCallSiteInfo)
4148         CSInfo.emplace_back(VA.getLocReg(), I);
4149       if (isVarArg && IsWin64) {
4150         // Win64 ABI requires argument XMM reg to be copied to the corresponding
4151         // shadow reg if callee is a varargs function.
4152         Register ShadowReg;
4153         switch (VA.getLocReg()) {
4154         case X86::XMM0: ShadowReg = X86::RCX; break;
4155         case X86::XMM1: ShadowReg = X86::RDX; break;
4156         case X86::XMM2: ShadowReg = X86::R8; break;
4157         case X86::XMM3: ShadowReg = X86::R9; break;
4158         }
4159         if (ShadowReg)
4160           RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
4161       }
4162     } else if (!IsSibcall && (!isTailCall || isByVal)) {
4163       assert(VA.isMemLoc());
4164       if (!StackPtr.getNode())
4165         StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
4166                                       getPointerTy(DAG.getDataLayout()));
4167       MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
4168                                              dl, DAG, VA, Flags, isByVal));
4169     }
4170   }
4171 
4172   if (!MemOpChains.empty())
4173     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
4174 
4175   if (Subtarget.isPICStyleGOT()) {
4176     // ELF / PIC requires the GOT pointer to be in the EBX register before
4177     // function calls made via the PLT (except regcall).
4178     if (!isTailCall) {
4179       // An indirect call with the RegCall calling convention may use up all
4180       // the general registers, so it is not suitable to bind EBX for the
4181       // GOT address; just let the register allocator handle it.
4182       if (CallConv != CallingConv::X86_RegCall)
4183         RegsToPass.push_back(std::make_pair(
4184           Register(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
4185                                           getPointerTy(DAG.getDataLayout()))));
4186     } else {
4187       // If we are tail calling and generating PIC/GOT style code load the
4188       // address of the callee into ECX. The value in ecx is used as target of
4189       // the tail jump. This is done to circumvent the ebx/callee-saved problem
4190       // for tail calls on PIC/GOT architectures. Normally we would just put the
4191       // address of GOT into ebx and then call target@PLT. But for tail calls
4192       // ebx would be restored (since ebx is callee saved) before jumping to the
4193       // target@PLT.
4194 
4195       // Note: The actual moving to ECX is done further down.
4196       GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
4197       if (G && !G->getGlobal()->hasLocalLinkage() &&
4198           G->getGlobal()->hasDefaultVisibility())
4199         Callee = LowerGlobalAddress(Callee, DAG);
4200       else if (isa<ExternalSymbolSDNode>(Callee))
4201         Callee = LowerExternalSymbol(Callee, DAG);
4202     }
4203   }
4204 
4205   if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
4206     // From AMD64 ABI document:
4207     // For calls that may call functions that use varargs or stdargs
4208     // (prototype-less calls or calls to functions containing ellipsis (...) in
4209     // the declaration) %al is used as hidden argument to specify the number
4210     // of SSE registers used. The contents of %al do not need to match exactly
4211     // the number of registers, but must be an upper bound on the number of SSE
4212     // registers used and is in the range 0 - 8 inclusive.
4213 
4214     // Count the number of XMM registers allocated.
4215     static const MCPhysReg XMMArgRegs[] = {
4216       X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
4217       X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
4218     };
4219     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
4220     assert((Subtarget.hasSSE1() || !NumXMMRegs)
4221            && "SSE registers cannot be used when SSE is disabled");
4222     RegsToPass.push_back(std::make_pair(Register(X86::AL),
4223                                         DAG.getConstant(NumXMMRegs, dl,
4224                                                         MVT::i8)));
4225   }
4226 
4227   if (isVarArg && IsMustTail) {
4228     const auto &Forwards = X86Info->getForwardedMustTailRegParms();
4229     for (const auto &F : Forwards) {
4230       SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
4231       RegsToPass.push_back(std::make_pair(F.PReg, Val));
4232     }
4233   }
4234 
4235   // For tail calls lower the arguments to the 'real' stack slots.  Sibcalls
4236   // don't need this because the eligibility check rejects calls that require
4237   // shuffling arguments passed in memory.
4238   if (!IsSibcall && isTailCall) {
4239     // Force all the incoming stack arguments to be loaded from the stack
4240     // before any new outgoing arguments are stored to the stack, because the
4241     // outgoing stack slots may alias the incoming argument stack slots, and
4242     // the alias isn't otherwise explicit. This is slightly more conservative
4243     // than necessary, because it means that each store effectively depends
4244     // on every argument instead of just those arguments it would clobber.
4245     SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
4246 
4247     SmallVector<SDValue, 8> MemOpChains2;
4248     SDValue FIN;
4249     int FI = 0;
4250     for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
4251          ++I, ++OutsIndex) {
4252       CCValAssign &VA = ArgLocs[I];
4253 
4254       if (VA.isRegLoc()) {
4255         if (VA.needsCustom()) {
4256           assert((CallConv == CallingConv::X86_RegCall) &&
4257                  "Expecting custom case only in regcall calling convention");
4258           // This means that we are in the special case where one argument was
4259           // passed through two register locations - skip the next location.
4260           ++I;
4261         }
4262 
4263         continue;
4264       }
4265 
4266       assert(VA.isMemLoc());
4267       SDValue Arg = OutVals[OutsIndex];
4268       ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
4269       // Skip inalloca/preallocated arguments.  They don't require any work.
4270       if (Flags.isInAlloca() || Flags.isPreallocated())
4271         continue;
4272       // Create frame index.
4273       int32_t Offset = VA.getLocMemOffset()+FPDiff;
4274       uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
4275       FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
4276       FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
4277 
4278       if (Flags.isByVal()) {
4279         // Copy relative to framepointer.
4280         SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
4281         if (!StackPtr.getNode())
4282           StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
4283                                         getPointerTy(DAG.getDataLayout()));
4284         Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
4285                              StackPtr, Source);
4286 
4287         MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
4288                                                          ArgChain,
4289                                                          Flags, DAG, dl));
4290       } else {
4291         // Store relative to framepointer.
4292         MemOpChains2.push_back(DAG.getStore(
4293             ArgChain, dl, Arg, FIN,
4294             MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
4295       }
4296     }
4297 
4298     if (!MemOpChains2.empty())
4299       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
4300 
4301     // Store the return address to the appropriate stack slot.
4302     Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
4303                                      getPointerTy(DAG.getDataLayout()),
4304                                      RegInfo->getSlotSize(), FPDiff, dl);
4305   }
4306 
4307   // Build a sequence of copy-to-reg nodes chained together with token chain
4308   // and flag operands which copy the outgoing args into registers.
4309   SDValue InFlag;
4310   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
4311     Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
4312                              RegsToPass[i].second, InFlag);
4313     InFlag = Chain.getValue(1);
4314   }
4315 
4316   if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
4317     assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
4318     // In the 64-bit large code model, we have to make all calls
4319     // through a register, since the call instruction's 32-bit
4320     // pc-relative offset may not be large enough to hold the whole
4321     // address.
4322   } else if (Callee->getOpcode() == ISD::GlobalAddress ||
4323              Callee->getOpcode() == ISD::ExternalSymbol) {
4324     // Lower direct calls to global addresses and external symbols. Setting
4325     // ForCall to true here has the effect of removing WrapperRIP when possible
4326     // to allow direct calls to be selected without first materializing the
4327     // address into a register.
4328     Callee = LowerGlobalOrExternal(Callee, DAG, /*ForCall=*/true);
4329   } else if (Subtarget.isTarget64BitILP32() &&
4330              Callee->getValueType(0) == MVT::i32) {
4331     // Zero-extend the 32-bit Callee address to 64 bits, per the x32 ABI.
4332     Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
4333   }
4334 
4335   // Returns a chain & a flag for retval copy to use.
4336   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
4337   SmallVector<SDValue, 8> Ops;
4338 
4339   if (!IsSibcall && isTailCall && !IsMustTail) {
4340     Chain = DAG.getCALLSEQ_END(Chain,
4341                                DAG.getIntPtrConstant(NumBytesToPop, dl, true),
4342                                DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
4343     InFlag = Chain.getValue(1);
4344   }
4345 
4346   Ops.push_back(Chain);
4347   Ops.push_back(Callee);
4348 
4349   if (isTailCall)
4350     Ops.push_back(DAG.getTargetConstant(FPDiff, dl, MVT::i32));
4351 
4352   // Add argument registers to the end of the list so that they are known live
4353   // into the call.
4354   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
4355     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
4356                                   RegsToPass[i].second.getValueType()));
4357 
4358   // Add a register mask operand representing the call-preserved registers.
4359   const uint32_t *Mask = [&]() {
4360     auto AdaptedCC = CallConv;
4361     // If HasNCSR is asserted (attribute NoCallerSavedRegisters exists),
4362     // use X86_INTR calling convention because it has the same CSR mask
4363     // (same preserved registers).
4364     if (HasNCSR)
4365       AdaptedCC = (CallingConv::ID)CallingConv::X86_INTR;
4366     // If NoCalleeSavedRegisters is requested, then use GHC since it happens
4367     // to use the CSR_NoRegs_RegMask.
4368     if (CB && CB->hasFnAttr("no_callee_saved_registers"))
4369       AdaptedCC = (CallingConv::ID)CallingConv::GHC;
4370     return RegInfo->getCallPreservedMask(MF, AdaptedCC);
4371   }();
4372   assert(Mask && "Missing call preserved mask for calling convention");
4373 
4374   // If this is an invoke in a 32-bit function using a funclet-based
4375   // personality, assume the function clobbers all registers. If an exception
4376   // is thrown, the runtime will not restore CSRs.
4377   // FIXME: Model this more precisely so that we can register allocate across
4378   // the normal edge and spill and fill across the exceptional edge.
4379   if (!Is64Bit && CLI.CB && isa<InvokeInst>(CLI.CB)) {
4380     const Function &CallerFn = MF.getFunction();
4381     EHPersonality Pers =
4382         CallerFn.hasPersonalityFn()
4383             ? classifyEHPersonality(CallerFn.getPersonalityFn())
4384             : EHPersonality::Unknown;
4385     if (isFuncletEHPersonality(Pers))
4386       Mask = RegInfo->getNoPreservedMask();
4387   }
4388 
4389   // Define a new register mask from the existing mask.
4390   uint32_t *RegMask = nullptr;
4391 
4392   // In some calling conventions we need to remove the used physical registers
4393   // from the reg mask.
4394   if (CallConv == CallingConv::X86_RegCall || HasNCSR) {
4395     const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
4396 
4397     // Allocate a new Reg Mask and copy Mask.
4398     RegMask = MF.allocateRegMask();
4399     unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs());
4400     memcpy(RegMask, Mask, sizeof(RegMask[0]) * RegMaskSize);
4401 
4402     // Make sure all sub registers of the argument registers are reset
4403     // in the RegMask.
4404     for (auto const &RegPair : RegsToPass)
4405       for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true);
4406            SubRegs.isValid(); ++SubRegs)
4407         RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
4408 
4409     // Create the RegMask Operand according to our updated mask.
4410     Ops.push_back(DAG.getRegisterMask(RegMask));
4411   } else {
4412     // Create the RegMask Operand according to the static mask.
4413     Ops.push_back(DAG.getRegisterMask(Mask));
4414   }
4415 
4416   if (InFlag.getNode())
4417     Ops.push_back(InFlag);
4418 
4419   if (isTailCall) {
4420     // We used to do:
4421     //// If this is the first return lowered for this function, add the regs
4422     //// to the liveout set for the function.
4423     // This isn't right, although it's probably harmless on x86; liveouts
4424     // should be computed from returns not tail calls.  Consider a void
4425     // function making a tail call to a function returning int.
4426     MF.getFrameInfo().setHasTailCall();
4427     SDValue Ret = DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
4428     DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
4429     return Ret;
4430   }
4431 
4432   if (HasNoCfCheck && IsCFProtectionSupported && IsIndirectCall) {
4433     Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops);
4434   } else if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
4435     // Calls with a "clang.arc.attachedcall" bundle are special. They should be
4436     // expanded to the call, directly followed by a special marker sequence and
4437     // a call to a ObjC library function. Use the CALL_RVMARKER to do that.
4438     assert(!isTailCall &&
4439            "tail calls cannot be marked with clang.arc.attachedcall");
4440     assert(Is64Bit && "clang.arc.attachedcall is only supported in 64bit mode");
4441 
4442     // Add a target constant to select the ObjC runtime call just before the
4443     // call target. RuntimeCallType == 0 selects objc_retainAutoreleasedReturnValue,
4444     // RuntimeCallType == 1 selects objc_unsafeClaimAutoreleasedReturnValue when
4445     // expanding the pseudo.
4446     unsigned RuntimeCallType =
4447         objcarc::hasAttachedCallOpBundle(CLI.CB, true) ? 0 : 1;
4448     Ops.insert(Ops.begin() + 1,
4449                DAG.getTargetConstant(RuntimeCallType, dl, MVT::i32));
4450     Chain = DAG.getNode(X86ISD::CALL_RVMARKER, dl, NodeTys, Ops);
4451   } else {
4452     Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
4453   }
4454 
4455   InFlag = Chain.getValue(1);
4456   DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
4457   DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
4458 
4459   // Save heapallocsite metadata.
4460   if (CLI.CB)
4461     if (MDNode *HeapAlloc = CLI.CB->getMetadata("heapallocsite"))
4462       DAG.addHeapAllocSite(Chain.getNode(), HeapAlloc);
4463 
4464   // Create the CALLSEQ_END node.
4465   unsigned NumBytesForCalleeToPop;
4466   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
4467                        DAG.getTarget().Options.GuaranteedTailCallOpt))
4468     NumBytesForCalleeToPop = NumBytes;    // Callee pops everything
4469   else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
4470            !Subtarget.getTargetTriple().isOSMSVCRT() &&
4471            SR == StackStructReturn)
4472     // If this is a call to a struct-return function, the callee
4473     // pops the hidden struct pointer, so we have to push it back.
4474     // This is common for Darwin/X86, Linux & Mingw32 targets.
4475     // For MSVC Win32 targets, the caller pops the hidden struct pointer.
4476     NumBytesForCalleeToPop = 4;
4477   else
4478     NumBytesForCalleeToPop = 0;  // Callee pops nothing.
4479 
4480   // Returns a flag for retval copy to use.
4481   if (!IsSibcall) {
4482     Chain = DAG.getCALLSEQ_END(Chain,
4483                                DAG.getIntPtrConstant(NumBytesToPop, dl, true),
4484                                DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
4485                                                      true),
4486                                InFlag, dl);
4487     InFlag = Chain.getValue(1);
4488   }
4489 
4490   // Handle result values, copying them out of physregs into vregs that we
4491   // return.
4492   return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
4493                          InVals, RegMask);
4494 }
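// Rough sketch (illustrative, not exhaustive) of the nodes LowerCall emits
// above for a simple 64-bit SysV call `f(x)` with x passed in a register:
//   t0: CALLSEQ_START(Chain, 0, 0)
//   t1: CopyToReg(t0, RDI, x)                       ; produces glue
//   t2: X86ISD::CALL(t1, @f, Register:RDI, RegMask, glue)
//   t3: CALLSEQ_END(t2, 0, 0, glue)
// followed by LowerCallResult copying any return value out of RAX or XMM0.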
4495 
4496 //===----------------------------------------------------------------------===//
4497 //                Fast Calling Convention (tail call) implementation
4498 //===----------------------------------------------------------------------===//
4499 
4500 //  Like stdcall, the callee cleans up the arguments, except that ECX is
4501 //  reserved for storing the tail-called function address. Only 2 registers are
4502 //  free for argument passing (inreg). Tail call optimization is performed
4503 //  provided:
4504 //                * tailcallopt is enabled
4505 //                * caller/callee are fastcc
4506 //  On X86_64 architecture with GOT-style position independent code only local
4507 //  (within module) calls are supported at the moment.
4508 //  To keep the stack aligned according to the platform ABI, the function
4509 //  GetAlignedArgumentStackSize ensures that the argument delta is always a
4510 //  multiple of the stack alignment. (Dynamic linkers need this - Darwin's dyld, for example.)
4511 //  If the tail-called callee has more arguments than the caller, the caller
4512 //  needs to make sure that there is room to move the RETADDR to. This is
4513 //  achieved by reserving an area the size of the argument delta right after the
4514 //  original RETADDR, but before the saved framepointer or the spilled registers
4515 //  e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
4516 //  stack layout:
4517 //    arg1
4518 //    arg2
4519 //    RETADDR
4520 //    [ new RETADDR
4521 //      move area ]
4522 //    (possible EBP)
4523 //    ESI
4524 //    EDI
4525 //    local1 ..
4526 
4527 /// Align the stack size, e.g. to 16n + 12 bytes, to satisfy a 16-byte
4528 /// alignment requirement.
4529 unsigned
4530 X86TargetLowering::GetAlignedArgumentStackSize(const unsigned StackSize,
4531                                                SelectionDAG &DAG) const {
4532   const Align StackAlignment = Subtarget.getFrameLowering()->getStackAlign();
4533   const uint64_t SlotSize = Subtarget.getRegisterInfo()->getSlotSize();
4534   assert(StackSize % SlotSize == 0 &&
4535          "StackSize must be a multiple of SlotSize");
4536   return alignTo(StackSize + SlotSize, StackAlignment) - SlotSize;
4537 }
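// Worked example (illustrative) for the function above: with a 16-byte stack
// alignment and 8-byte slots (64-bit), StackSize = 40 yields
// alignTo(48, 16) - 8 = 40 and StackSize = 16 yields alignTo(24, 16) - 8 = 24,
// i.e. always 16n + 8, so that pushing the return address restores 16-byte
// alignment. With 4-byte slots (32-bit) the analogous result is 16n + 12, as
// the doc comment above says.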
4538 
4539 /// Return true if the given stack call argument is already available in the
4540 /// same position (relatively) of the caller's incoming argument stack.
4541 static
4542 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
4543                          MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
4544                          const X86InstrInfo *TII, const CCValAssign &VA) {
4545   unsigned Bytes = Arg.getValueSizeInBits() / 8;
4546 
4547   for (;;) {
4548     // Look through nodes that don't alter the bits of the incoming value.
4549     unsigned Op = Arg.getOpcode();
4550     if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
4551       Arg = Arg.getOperand(0);
4552       continue;
4553     }
4554     if (Op == ISD::TRUNCATE) {
4555       const SDValue &TruncInput = Arg.getOperand(0);
4556       if (TruncInput.getOpcode() == ISD::AssertZext &&
4557           cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
4558               Arg.getValueType()) {
4559         Arg = TruncInput.getOperand(0);
4560         continue;
4561       }
4562     }
4563     break;
4564   }
4565 
4566   int FI = INT_MAX;
4567   if (Arg.getOpcode() == ISD::CopyFromReg) {
4568     Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
4569     if (!VR.isVirtual())
4570       return false;
4571     MachineInstr *Def = MRI->getVRegDef(VR);
4572     if (!Def)
4573       return false;
4574     if (!Flags.isByVal()) {
4575       if (!TII->isLoadFromStackSlot(*Def, FI))
4576         return false;
4577     } else {
4578       unsigned Opcode = Def->getOpcode();
4579       if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
4580            Opcode == X86::LEA64_32r) &&
4581           Def->getOperand(1).isFI()) {
4582         FI = Def->getOperand(1).getIndex();
4583         Bytes = Flags.getByValSize();
4584       } else
4585         return false;
4586     }
4587   } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
4588     if (Flags.isByVal())
4589       // ByVal argument is passed in as a pointer but it's now being
4590       // dereferenced. e.g.
4591       // define @foo(%struct.X* %A) {
4592       //   tail call @bar(%struct.X* byval %A)
4593       // }
4594       return false;
4595     SDValue Ptr = Ld->getBasePtr();
4596     FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
4597     if (!FINode)
4598       return false;
4599     FI = FINode->getIndex();
4600   } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
4601     FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
4602     FI = FINode->getIndex();
4603     Bytes = Flags.getByValSize();
4604   } else
4605     return false;
4606 
4607   assert(FI != INT_MAX);
4608   if (!MFI.isFixedObjectIndex(FI))
4609     return false;
4610 
4611   if (Offset != MFI.getObjectOffset(FI))
4612     return false;
4613 
4614   // If this is not byval, check that the argument stack object is immutable.
4615   // inalloca and argument copy elision can create mutable argument stack
4616   // objects. Byval objects can be mutated, but a byval call intends to pass the
4617   // mutated memory.
4618   if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
4619     return false;
4620 
4621   if (VA.getLocVT().getFixedSizeInBits() >
4622       Arg.getValueSizeInBits().getFixedSize()) {
4623     // If the argument location is wider than the argument type, check that any
4624     // extension flags match.
4625     if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
4626         Flags.isSExt() != MFI.isObjectSExt(FI)) {
4627       return false;
4628     }
4629   }
4630 
4631   return Bytes == MFI.getObjectSize(FI);
4632 }
4633 
4634 /// Check whether the call is eligible for tail call optimization. Targets
4635 /// that want to do tail call optimization should implement this function.
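/// Roughly, two cases are accepted below: under -tailcallopt (or the tailcc /
/// swifttailcc conventions) a call is tail-callable when caller and callee use
/// a matching convention that can guarantee TCO; otherwise we look for a
/// "sibcall" that needs no ABI changes (compatible results, no dynamic stack
/// realignment, arguments already laid out in place, and so on).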
4636 bool X86TargetLowering::IsEligibleForTailCallOptimization(
4637     SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
4638     bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,
4639     const SmallVectorImpl<ISD::OutputArg> &Outs,
4640     const SmallVectorImpl<SDValue> &OutVals,
4641     const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
4642   if (!mayTailCallThisCC(CalleeCC))
4643     return false;
4644 
4645   // If -tailcallopt is specified, make fastcc functions tail-callable.
4646   MachineFunction &MF = DAG.getMachineFunction();
4647   const Function &CallerF = MF.getFunction();
4648 
4649   // If the function return type is x86_fp80 and the callee return type is not,
4650   // then the FP_EXTEND of the call result is not a nop. It's not safe to
4651   // perform a tailcall optimization here.
4652   if (CallerF.getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
4653     return false;
4654 
4655   CallingConv::ID CallerCC = CallerF.getCallingConv();
4656   bool CCMatch = CallerCC == CalleeCC;
4657   bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
4658   bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
4659   bool IsGuaranteeTCO = DAG.getTarget().Options.GuaranteedTailCallOpt ||
4660       CalleeCC == CallingConv::Tail || CalleeCC == CallingConv::SwiftTail;
4661 
4662   // Win64 functions have extra shadow space for argument homing. Don't do the
4663   // sibcall if the caller and callee have mismatched expectations for this
4664   // space.
4665   if (IsCalleeWin64 != IsCallerWin64)
4666     return false;
4667 
4668   if (IsGuaranteeTCO) {
4669     if (canGuaranteeTCO(CalleeCC) && CCMatch)
4670       return true;
4671     return false;
4672   }
4673 
4674   // Look for obvious safe cases to perform tail call optimization that do not
4675   // require ABI changes. This is what gcc calls sibcall.
4676 
4677   // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
4678   // emit a special epilogue.
4679   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4680   if (RegInfo->hasStackRealignment(MF))
4681     return false;
4682 
4683   // Also avoid sibcall optimization if either caller or callee uses struct
4684   // return semantics.
4685   if (isCalleeStructRet || isCallerStructRet)
4686     return false;
4687 
4688   // Do not sibcall optimize vararg calls unless all arguments are passed via
4689   // registers.
4690   LLVMContext &C = *DAG.getContext();
4691   if (isVarArg && !Outs.empty()) {
4692     // Optimizing for varargs on Win64 is unlikely to be safe without
4693     // additional testing.
4694     if (IsCalleeWin64 || IsCallerWin64)
4695       return false;
4696 
4697     SmallVector<CCValAssign, 16> ArgLocs;
4698     CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4699 
4700     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4701     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
4702       if (!ArgLocs[i].isRegLoc())
4703         return false;
4704   }
4705 
4706   // If the call result is in ST0 / ST1, it needs to be popped off the x87
4707   // stack.  Therefore, if it's not used by the call it is not safe to optimize
4708   // this into a sibcall.
4709   bool Unused = false;
4710   for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
4711     if (!Ins[i].Used) {
4712       Unused = true;
4713       break;
4714     }
4715   }
4716   if (Unused) {
4717     SmallVector<CCValAssign, 16> RVLocs;
4718     CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
4719     CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
4720     for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
4721       CCValAssign &VA = RVLocs[i];
4722       if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
4723         return false;
4724     }
4725   }
4726 
4727   // Check that the call results are passed in the same way.
4728   if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
4729                                   RetCC_X86, RetCC_X86))
4730     return false;
4731   // The callee has to preserve all registers the caller needs to preserve.
4732   const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
4733   const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
4734   if (!CCMatch) {
4735     const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
4736     if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4737       return false;
4738   }
4739 
4740   unsigned StackArgsSize = 0;
4741 
4742   // If the callee takes no arguments then go on to check the results of the
4743   // call.
4744   if (!Outs.empty()) {
4745     // Check if stack adjustment is needed. For now, do not do this if any
4746     // argument is passed on the stack.
4747     SmallVector<CCValAssign, 16> ArgLocs;
4748     CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4749 
4750     // Allocate shadow area for Win64
4751     if (IsCalleeWin64)
4752       CCInfo.AllocateStack(32, Align(8));
4753 
4754     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4755     StackArgsSize = CCInfo.getNextStackOffset();
4756 
4757     if (CCInfo.getNextStackOffset()) {
4758       // Check if the arguments are already laid out in the right way as
4759       // the caller's fixed stack objects.
4760       MachineFrameInfo &MFI = MF.getFrameInfo();
4761       const MachineRegisterInfo *MRI = &MF.getRegInfo();
4762       const X86InstrInfo *TII = Subtarget.getInstrInfo();
4763       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4764         CCValAssign &VA = ArgLocs[i];
4765         SDValue Arg = OutVals[i];
4766         ISD::ArgFlagsTy Flags = Outs[i].Flags;
4767         if (VA.getLocInfo() == CCValAssign::Indirect)
4768           return false;
4769         if (!VA.isRegLoc()) {
4770           if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
4771                                    MFI, MRI, TII, VA))
4772             return false;
4773         }
4774       }
4775     }
4776 
4777     bool PositionIndependent = isPositionIndependent();
4778     // If the tailcall address may be in a register, then make sure it's
4779     // possible to register allocate for it. In 32-bit, the call address can
4780     // only target EAX, EDX, or ECX since the tail call must be scheduled after
4781     // callee-saved registers are restored. These happen to be the same
4782     // registers used to pass 'inreg' arguments so watch out for those.
4783     if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
4784                                   !isa<ExternalSymbolSDNode>(Callee)) ||
4785                                  PositionIndependent)) {
4786       unsigned NumInRegs = 0;
4787       // In PIC we need an extra register to formulate the address computation
4788       // for the callee.
4789       unsigned MaxInRegs = PositionIndependent ? 2 : 3;
4790 
4791       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4792         CCValAssign &VA = ArgLocs[i];
4793         if (!VA.isRegLoc())
4794           continue;
4795         Register Reg = VA.getLocReg();
4796         switch (Reg) {
4797         default: break;
4798         case X86::EAX: case X86::EDX: case X86::ECX:
4799           if (++NumInRegs == MaxInRegs)
4800             return false;
4801           break;
4802         }
4803       }
4804     }
4805 
4806     const MachineRegisterInfo &MRI = MF.getRegInfo();
4807     if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
4808       return false;
4809   }
4810 
4811   bool CalleeWillPop =
4812       X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
4813                        MF.getTarget().Options.GuaranteedTailCallOpt);
4814 
4815   if (unsigned BytesToPop =
4816           MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
4817     // If we have bytes to pop, the callee must pop them.
4818     bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
4819     if (!CalleePopMatches)
4820       return false;
4821   } else if (CalleeWillPop && StackArgsSize > 0) {
4822     // If we don't have bytes to pop, make sure the callee doesn't pop any.
4823     return false;
4824   }
4825 
4826   return true;
4827 }
4828 
4829 FastISel *
4830 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
4831                                   const TargetLibraryInfo *libInfo) const {
4832   return X86::createFastISel(funcInfo, libInfo);
4833 }
4834 
4835 //===----------------------------------------------------------------------===//
4836 //                           Other Lowering Hooks
4837 //===----------------------------------------------------------------------===//
4838 
4839 static bool MayFoldLoad(SDValue Op) {
4840   return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
4841 }
4842 
4843 static bool MayFoldIntoStore(SDValue Op) {
4844   return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
4845 }
4846 
4847 static bool MayFoldIntoZeroExtend(SDValue Op) {
4848   if (Op.hasOneUse()) {
4849     unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
4850     return (ISD::ZERO_EXTEND == Opcode);
4851   }
4852   return false;
4853 }
4854 
4855 static bool isTargetShuffle(unsigned Opcode) {
4856   switch(Opcode) {
4857   default: return false;
4858   case X86ISD::BLENDI:
4859   case X86ISD::PSHUFB:
4860   case X86ISD::PSHUFD:
4861   case X86ISD::PSHUFHW:
4862   case X86ISD::PSHUFLW:
4863   case X86ISD::SHUFP:
4864   case X86ISD::INSERTPS:
4865   case X86ISD::EXTRQI:
4866   case X86ISD::INSERTQI:
4867   case X86ISD::VALIGN:
4868   case X86ISD::PALIGNR:
4869   case X86ISD::VSHLDQ:
4870   case X86ISD::VSRLDQ:
4871   case X86ISD::MOVLHPS:
4872   case X86ISD::MOVHLPS:
4873   case X86ISD::MOVSHDUP:
4874   case X86ISD::MOVSLDUP:
4875   case X86ISD::MOVDDUP:
4876   case X86ISD::MOVSS:
4877   case X86ISD::MOVSD:
4878   case X86ISD::UNPCKL:
4879   case X86ISD::UNPCKH:
4880   case X86ISD::VBROADCAST:
4881   case X86ISD::VPERMILPI:
4882   case X86ISD::VPERMILPV:
4883   case X86ISD::VPERM2X128:
4884   case X86ISD::SHUF128:
4885   case X86ISD::VPERMIL2:
4886   case X86ISD::VPERMI:
4887   case X86ISD::VPPERM:
4888   case X86ISD::VPERMV:
4889   case X86ISD::VPERMV3:
4890   case X86ISD::VZEXT_MOVL:
4891     return true;
4892   }
4893 }
4894 
4895 static bool isTargetShuffleVariableMask(unsigned Opcode) {
4896   switch (Opcode) {
4897   default: return false;
4898   // Target Shuffles.
4899   case X86ISD::PSHUFB:
4900   case X86ISD::VPERMILPV:
4901   case X86ISD::VPERMIL2:
4902   case X86ISD::VPPERM:
4903   case X86ISD::VPERMV:
4904   case X86ISD::VPERMV3:
4905     return true;
4906   // 'Faux' Target Shuffles.
4907   case ISD::OR:
4908   case ISD::AND:
4909   case X86ISD::ANDNP:
4910     return true;
4911   }
4912 }
4913 
4914 static bool isTargetShuffleSplat(SDValue Op) {
4915   unsigned Opcode = Op.getOpcode();
4916   if (Opcode == ISD::EXTRACT_SUBVECTOR)
4917     return isTargetShuffleSplat(Op.getOperand(0));
4918   return Opcode == X86ISD::VBROADCAST || Opcode == X86ISD::VBROADCAST_LOAD;
4919 }
4920 
4921 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
4922   MachineFunction &MF = DAG.getMachineFunction();
4923   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4924   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
4925   int ReturnAddrIndex = FuncInfo->getRAIndex();
4926 
4927   if (ReturnAddrIndex == 0) {
4928     // Set up a frame object for the return address.
4929     unsigned SlotSize = RegInfo->getSlotSize();
4930     ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
4931                                                           -(int64_t)SlotSize,
4932                                                           false);
4933     FuncInfo->setRAIndex(ReturnAddrIndex);
4934   }
4935 
4936   return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
4937 }
4938 
4939 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
4940                                        bool hasSymbolicDisplacement) {
4941   // Offset should fit into 32 bit immediate field.
4942   if (!isInt<32>(Offset))
4943     return false;
4944 
4945   // If we don't have a symbolic displacement - we don't have any extra
4946   // restrictions.
4947   if (!hasSymbolicDisplacement)
4948     return true;
4949 
4950   // FIXME: Some tweaks might be needed for medium code model.
4951   if (M != CodeModel::Small && M != CodeModel::Kernel)
4952     return false;
4953 
4954   // For the small code model we assume that the latest object is 16MB below
4955   // the end of the 31-bit boundary. We may also accept fairly large negative
4956   // constants, knowing that all objects are in the positive half of the address space.
4957   if (M == CodeModel::Small && Offset < 16*1024*1024)
4958     return true;
4959 
4960   // For the kernel code model we know that all objects reside in the negative
4961   // half of the 32-bit address space. We must not accept negative offsets,
4962   // since they might take the address out of range, but we may accept fairly large positive ones.
4963   if (M == CodeModel::Kernel && Offset >= 0)
4964     return true;
4965 
4966   return false;
4967 }
4968 
4969 /// Determines whether the callee is required to pop its own arguments.
4970 /// Callee pop is necessary to support tail calls.
4971 bool X86::isCalleePop(CallingConv::ID CallingConv,
4972                       bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
4973   // If GuaranteeTCO is true, we force some calls to be callee pop so that we
4974   // can guarantee TCO.
4975   if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
4976     return true;
4977 
4978   switch (CallingConv) {
4979   default:
4980     return false;
4981   case CallingConv::X86_StdCall:
4982   case CallingConv::X86_FastCall:
4983   case CallingConv::X86_ThisCall:
4984   case CallingConv::X86_VectorCall:
4985     return !is64Bit;
4986   }
4987 }
4988 
4989 /// Return true if the condition is a signed comparison operation.
4990 static bool isX86CCSigned(unsigned X86CC) {
4991   switch (X86CC) {
4992   default:
4993     llvm_unreachable("Invalid integer condition!");
4994   case X86::COND_E:
4995   case X86::COND_NE:
4996   case X86::COND_B:
4997   case X86::COND_A:
4998   case X86::COND_BE:
4999   case X86::COND_AE:
5000     return false;
5001   case X86::COND_G:
5002   case X86::COND_GE:
5003   case X86::COND_L:
5004   case X86::COND_LE:
5005     return true;
5006   }
5007 }
5008 
5009 static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
5010   switch (SetCCOpcode) {
5011   default: llvm_unreachable("Invalid integer condition!");
5012   case ISD::SETEQ:  return X86::COND_E;
5013   case ISD::SETGT:  return X86::COND_G;
5014   case ISD::SETGE:  return X86::COND_GE;
5015   case ISD::SETLT:  return X86::COND_L;
5016   case ISD::SETLE:  return X86::COND_LE;
5017   case ISD::SETNE:  return X86::COND_NE;
5018   case ISD::SETULT: return X86::COND_B;
5019   case ISD::SETUGT: return X86::COND_A;
5020   case ISD::SETULE: return X86::COND_BE;
5021   case ISD::SETUGE: return X86::COND_AE;
5022   }
5023 }
5024 
5025 /// Do a one-to-one translation of a ISD::CondCode to the X86-specific
5026 /// condition code, returning the condition code and the LHS/RHS of the
5027 /// comparison to make.
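/// For example, an integer SETLT against the constant 0 becomes COND_S with
/// the operands untouched, while an FP SETOLT swaps LHS/RHS below and returns
/// COND_A so that an unordered result (ZF = PF = CF = 1) does not satisfy the
/// condition.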
5028 static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
5029                                bool isFP, SDValue &LHS, SDValue &RHS,
5030                                SelectionDAG &DAG) {
5031   if (!isFP) {
5032     if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
5033       if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
5034         // X > -1   -> X == 0, jump !sign.
5035         RHS = DAG.getConstant(0, DL, RHS.getValueType());
5036         return X86::COND_NS;
5037       }
5038       if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
5039         // X < 0   -> X == 0, jump on sign.
5040         return X86::COND_S;
5041       }
5042       if (SetCCOpcode == ISD::SETGE && RHSC->isNullValue()) {
5043         // X >= 0   -> X == 0, jump on !sign.
5044         return X86::COND_NS;
5045       }
5046       if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
5047         // X < 1   -> X <= 0
5048         RHS = DAG.getConstant(0, DL, RHS.getValueType());
5049         return X86::COND_LE;
5050       }
5051     }
5052 
5053     return TranslateIntegerX86CC(SetCCOpcode);
5054   }
5055 
5056   // First determine if it is required or is profitable to flip the operands.
5057 
5058   // If LHS is a foldable load, but RHS is not, flip the condition.
5059   if (ISD::isNON_EXTLoad(LHS.getNode()) &&
5060       !ISD::isNON_EXTLoad(RHS.getNode())) {
5061     SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
5062     std::swap(LHS, RHS);
5063   }
5064 
5065   switch (SetCCOpcode) {
5066   default: break;
5067   case ISD::SETOLT:
5068   case ISD::SETOLE:
5069   case ISD::SETUGT:
5070   case ISD::SETUGE:
5071     std::swap(LHS, RHS);
5072     break;
5073   }
5074 
5075   // On a floating point condition, the flags are set as follows:
5076   // ZF  PF  CF   op
5077   //  0 | 0 | 0 | X > Y
5078   //  0 | 0 | 1 | X < Y
5079   //  1 | 0 | 0 | X == Y
5080   //  1 | 1 | 1 | unordered
5081   switch (SetCCOpcode) {
5082   default: llvm_unreachable("Condcode should be pre-legalized away");
5083   case ISD::SETUEQ:
5084   case ISD::SETEQ:   return X86::COND_E;
5085   case ISD::SETOLT:              // flipped
5086   case ISD::SETOGT:
5087   case ISD::SETGT:   return X86::COND_A;
5088   case ISD::SETOLE:              // flipped
5089   case ISD::SETOGE:
5090   case ISD::SETGE:   return X86::COND_AE;
5091   case ISD::SETUGT:              // flipped
5092   case ISD::SETULT:
5093   case ISD::SETLT:   return X86::COND_B;
5094   case ISD::SETUGE:              // flipped
5095   case ISD::SETULE:
5096   case ISD::SETLE:   return X86::COND_BE;
5097   case ISD::SETONE:
5098   case ISD::SETNE:   return X86::COND_NE;
5099   case ISD::SETUO:   return X86::COND_P;
5100   case ISD::SETO:    return X86::COND_NP;
5101   case ISD::SETOEQ:
5102   case ISD::SETUNE:  return X86::COND_INVALID;
5103   }
5104 }
5105 
5106 /// Is there a floating point cmov for the specific X86 condition code?
5107 /// The current x86 ISA includes the following FP cmov instructions:
5108 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
5109 static bool hasFPCMov(unsigned X86CC) {
5110   switch (X86CC) {
5111   default:
5112     return false;
5113   case X86::COND_B:
5114   case X86::COND_BE:
5115   case X86::COND_E:
5116   case X86::COND_P:
5117   case X86::COND_A:
5118   case X86::COND_AE:
5119   case X86::COND_NE:
5120   case X86::COND_NP:
5121     return true;
5122   }
5123 }
5124 
5125 
5126 bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
5127                                            const CallInst &I,
5128                                            MachineFunction &MF,
5129                                            unsigned Intrinsic) const {
5130   Info.flags = MachineMemOperand::MONone;
5131   Info.offset = 0;
5132 
5133   const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
5134   if (!IntrData) {
5135     switch (Intrinsic) {
5136     case Intrinsic::x86_aesenc128kl:
5137     case Intrinsic::x86_aesdec128kl:
5138       Info.opc = ISD::INTRINSIC_W_CHAIN;
5139       Info.ptrVal = I.getArgOperand(1);
5140       Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
5141       Info.align = Align(1);
5142       Info.flags |= MachineMemOperand::MOLoad;
5143       return true;
5144     case Intrinsic::x86_aesenc256kl:
5145     case Intrinsic::x86_aesdec256kl:
5146       Info.opc = ISD::INTRINSIC_W_CHAIN;
5147       Info.ptrVal = I.getArgOperand(1);
5148       Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
5149       Info.align = Align(1);
5150       Info.flags |= MachineMemOperand::MOLoad;
5151       return true;
5152     case Intrinsic::x86_aesencwide128kl:
5153     case Intrinsic::x86_aesdecwide128kl:
5154       Info.opc = ISD::INTRINSIC_W_CHAIN;
5155       Info.ptrVal = I.getArgOperand(0);
5156       Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
5157       Info.align = Align(1);
5158       Info.flags |= MachineMemOperand::MOLoad;
5159       return true;
5160     case Intrinsic::x86_aesencwide256kl:
5161     case Intrinsic::x86_aesdecwide256kl:
5162       Info.opc = ISD::INTRINSIC_W_CHAIN;
5163       Info.ptrVal = I.getArgOperand(0);
5164       Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
5165       Info.align = Align(1);
5166       Info.flags |= MachineMemOperand::MOLoad;
5167       return true;
5168     }
5169     return false;
5170   }
5171 
5172   switch (IntrData->Type) {
5173   case TRUNCATE_TO_MEM_VI8:
5174   case TRUNCATE_TO_MEM_VI16:
5175   case TRUNCATE_TO_MEM_VI32: {
5176     Info.opc = ISD::INTRINSIC_VOID;
5177     Info.ptrVal = I.getArgOperand(0);
5178     MVT VT  = MVT::getVT(I.getArgOperand(1)->getType());
5179     MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
5180     if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
5181       ScalarVT = MVT::i8;
5182     else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
5183       ScalarVT = MVT::i16;
5184     else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
5185       ScalarVT = MVT::i32;
5186 
5187     Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
5188     Info.align = Align(1);
5189     Info.flags |= MachineMemOperand::MOStore;
5190     break;
5191   }
5192   case GATHER:
5193   case GATHER_AVX2: {
5194     Info.opc = ISD::INTRINSIC_W_CHAIN;
5195     Info.ptrVal = nullptr;
5196     MVT DataVT = MVT::getVT(I.getType());
5197     MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
5198     unsigned NumElts = std::min(DataVT.getVectorNumElements(),
5199                                 IndexVT.getVectorNumElements());
5200     Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
5201     Info.align = Align(1);
5202     Info.flags |= MachineMemOperand::MOLoad;
5203     break;
5204   }
5205   case SCATTER: {
5206     Info.opc = ISD::INTRINSIC_VOID;
5207     Info.ptrVal = nullptr;
5208     MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
5209     MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
5210     unsigned NumElts = std::min(DataVT.getVectorNumElements(),
5211                                 IndexVT.getVectorNumElements());
5212     Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
5213     Info.align = Align(1);
5214     Info.flags |= MachineMemOperand::MOStore;
5215     break;
5216   }
5217   default:
5218     return false;
5219   }
5220 
5221   return true;
5222 }
5223 
5224 /// Returns true if the target can instruction select the
5225 /// specified FP immediate natively. If false, the legalizer will
5226 /// materialize the FP immediate as a load from a constant pool.
5227 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
5228                                      bool ForCodeSize) const {
5229   for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
5230     if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
5231       return true;
5232   }
5233   return false;
5234 }
5235 
5236 bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
5237                                               ISD::LoadExtType ExtTy,
5238                                               EVT NewVT) const {
5239   assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
5240 
5241   // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
5242   // relocations must target a movq or addq instruction: don't let the load shrink.
5243   SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
5244   if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
5245     if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
5246       return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
5247 
5248   // If this is an (1) AVX vector load with (2) multiple uses and (3) all of
5249   // those uses are extracted directly into a store, then the extract + store
5250   // can be store-folded. Therefore, it's probably not worth splitting the load.
5251   EVT VT = Load->getValueType(0);
5252   if ((VT.is256BitVector() || VT.is512BitVector()) && !Load->hasOneUse()) {
5253     for (auto UI = Load->use_begin(), UE = Load->use_end(); UI != UE; ++UI) {
5254       // Skip uses of the chain value. Result 0 of the node is the load value.
5255       if (UI.getUse().getResNo() != 0)
5256         continue;
5257 
5258       // If this use is not an extract + store, it's probably worth splitting.
5259       if (UI->getOpcode() != ISD::EXTRACT_SUBVECTOR || !UI->hasOneUse() ||
5260           UI->use_begin()->getOpcode() != ISD::STORE)
5261         return true;
5262     }
5263     // All non-chain uses are extract + store.
5264     return false;
5265   }
5266 
5267   return true;
5268 }
5269 
5270 /// Returns true if it is beneficial to convert a load of a constant
5271 /// to just the constant itself.
5272 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
5273                                                           Type *Ty) const {
5274   assert(Ty->isIntegerTy());
5275 
5276   unsigned BitSize = Ty->getPrimitiveSizeInBits();
5277   if (BitSize == 0 || BitSize > 64)
5278     return false;
5279   return true;
5280 }
5281 
5282 bool X86TargetLowering::reduceSelectOfFPConstantLoads(EVT CmpOpVT) const {
5283   // If we are using XMM registers in the ABI and the condition of the select is
5284   // a floating-point compare and we have blendv or conditional move, then it is
5285   // cheaper to select instead of doing a cross-register move and creating a
5286   // load that depends on the compare result.
5287   bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128;
5288   return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
5289 }
5290 
5291 bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
5292   // TODO: It might be a win to ease or lift this restriction, but the generic
5293   // folds in DAGCombiner conflict with vector folds for an AVX512 target.
5294   if (VT.isVector() && Subtarget.hasAVX512())
5295     return false;
5296 
5297   return true;
5298 }
5299 
5300 bool X86TargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
5301                                                SDValue C) const {
5302   // TODO: We handle scalars using custom code, but generic combining could make
5303   // that unnecessary.
5304   APInt MulC;
5305   if (!ISD::isConstantSplatVector(C.getNode(), MulC))
5306     return false;
5307 
5308   // Find the type this will be legalized to. Otherwise we might prematurely
5309   // convert this to shl+add/sub and then still have to type legalize those ops.
5310   // Another choice would be to defer the decision for illegal types until
5311   // after type legalization. But constant splat vectors of i64 can't make it
5312   // through type legalization on 32-bit targets so we would need to special
5313   // case vXi64.
5314   while (getTypeAction(Context, VT) != TypeLegal)
5315     VT = getTypeToTransformTo(Context, VT);
5316 
5317   // If vector multiply is legal, assume that's faster than shl + add/sub.
5318   // TODO: Multiply is a complex op with higher latency and lower throughput in
5319   //       most implementations, so this check could be loosened based on type
5320   //       and/or a CPU attribute.
5321   if (isOperationLegal(ISD::MUL, VT))
5322     return false;
5323 
5324   // shl+add, shl+sub, shl+add+neg
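  // For example, x * 7 -> (x << 3) - x, x * 9 -> (x << 3) + x, and
  // x * -3 -> -((x << 1) + x).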
5325   return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
5326          (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
5327 }
5328 
5329 bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
5330                                                 unsigned Index) const {
5331   if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
5332     return false;
5333 
5334   // Mask vectors support all subregister combinations and operations that
5335   // extract half of vector.
5336   if (ResVT.getVectorElementType() == MVT::i1)
5337     return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
5338                           (Index == ResVT.getVectorNumElements()));
5339 
5340   return (Index % ResVT.getVectorNumElements()) == 0;
5341 }
5342 
5343 bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
5344   unsigned Opc = VecOp.getOpcode();
5345 
5346   // Assume target opcodes can't be scalarized.
5347   // TODO - do we have any exceptions?
5348   if (Opc >= ISD::BUILTIN_OP_END)
5349     return false;
5350 
5351   // If the vector op is not supported, try to convert to scalar.
5352   EVT VecVT = VecOp.getValueType();
5353   if (!isOperationLegalOrCustomOrPromote(Opc, VecVT))
5354     return true;
5355 
5356   // If the vector op is supported, but the scalar op is not, the transform may
5357   // not be worthwhile.
5358   EVT ScalarVT = VecVT.getScalarType();
5359   return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
5360 }
5361 
5362 bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT,
5363                                              bool) const {
5364   // TODO: Allow vectors?
5365   if (VT.isVector())
5366     return false;
5367   return VT.isSimple() || !isOperationExpand(Opcode, VT);
5368 }
5369 
5370 bool X86TargetLowering::isCheapToSpeculateCttz() const {
5371   // Speculate cttz only if we can directly use TZCNT.
5372   return Subtarget.hasBMI();
5373 }
5374 
5375 bool X86TargetLowering::isCheapToSpeculateCtlz() const {
5376   // Speculate ctlz only if we can directly use LZCNT.
5377   return Subtarget.hasLZCNT();
5378 }
5379 
5380 bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
5381                                                 const SelectionDAG &DAG,
5382                                                 const MachineMemOperand &MMO) const {
5383   if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
5384       BitcastVT.getVectorElementType() == MVT::i1)
5385     return false;
5386 
5387   if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
5388     return false;
5389 
5390   // If both types are legal vectors, it's always ok to convert them.
5391   if (LoadVT.isVector() && BitcastVT.isVector() &&
5392       isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
5393     return true;
5394 
5395   return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
5396 }
5397 
5398 bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
5399                                          const SelectionDAG &DAG) const {
5400   // Do not merge stores up to the float/vector register size (128 bits) if
5401   // the NoImplicitFloat attribute is set.
5402   bool NoFloat = DAG.getMachineFunction().getFunction().hasFnAttribute(
5403       Attribute::NoImplicitFloat);
5404 
5405   if (NoFloat) {
5406     unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
5407     return (MemVT.getSizeInBits() <= MaxIntSize);
5408   }
5409   // Make sure we don't merge greater than our preferred vector
5410   // width.
5411   if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
5412     return false;
5413 
5414   return true;
5415 }
5416 
5417 bool X86TargetLowering::isCtlzFast() const {
5418   return Subtarget.hasFastLZCNT();
5419 }
5420 
5421 bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
5422     const Instruction &AndI) const {
5423   return true;
5424 }
5425 
5426 bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
5427   EVT VT = Y.getValueType();
5428 
5429   if (VT.isVector())
5430     return false;
5431 
5432   if (!Subtarget.hasBMI())
5433     return false;
5434 
5435   // There are only 32-bit and 64-bit forms for 'andn'.
5436   if (VT != MVT::i32 && VT != MVT::i64)
5437     return false;
5438 
5439   return !isa<ConstantSDNode>(Y);
5440 }
5441 
5442 bool X86TargetLowering::hasAndNot(SDValue Y) const {
5443   EVT VT = Y.getValueType();
5444 
5445   if (!VT.isVector())
5446     return hasAndNotCompare(Y);
5447 
5448   // Vector.
5449 
5450   if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
5451     return false;
5452 
5453   if (VT == MVT::v4i32)
5454     return true;
5455 
5456   return Subtarget.hasSSE2();
5457 }
5458 
5459 bool X86TargetLowering::hasBitTest(SDValue X, SDValue Y) const {
5460   return X.getValueType().isScalarInteger(); // 'bt'
5461 }
5462 
5463 bool X86TargetLowering::
5464     shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
5465         SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
5466         unsigned OldShiftOpcode, unsigned NewShiftOpcode,
5467         SelectionDAG &DAG) const {
5468   // Does baseline recommend not to perform the fold by default?
5469   if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
5470           X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
5471     return false;
5472   // For scalars this transform is always beneficial.
5473   if (X.getValueType().isScalarInteger())
5474     return true;
5475   // If all the shift amounts are identical, then transform is beneficial even
5476   // with rudimentary SSE2 shifts.
5477   if (DAG.isSplatValue(Y, /*AllowUndefs=*/true))
5478     return true;
5479   // If we have AVX2 with its powerful shift operations, then it's also good.
5480   if (Subtarget.hasAVX2())
5481     return true;
5482   // Pre-AVX2 vector codegen for this pattern is best for variant with 'shl'.
5483   return NewShiftOpcode == ISD::SHL;
5484 }
5485 
5486 bool X86TargetLowering::shouldFoldConstantShiftPairToMask(
5487     const SDNode *N, CombineLevel Level) const {
5488   assert(((N->getOpcode() == ISD::SHL &&
5489            N->getOperand(0).getOpcode() == ISD::SRL) ||
5490           (N->getOpcode() == ISD::SRL &&
5491            N->getOperand(0).getOpcode() == ISD::SHL)) &&
5492          "Expected shift-shift mask");
5493   EVT VT = N->getValueType(0);
5494   if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
5495       (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
5496     // Only fold if the shift values are equal - so it folds to AND.
5497     // TODO - we should fold if either is a non-uniform vector but we don't do
5498     // the fold for non-splats yet.
5499     return N->getOperand(1) == N->getOperand(0).getOperand(1);
5500   }
5501   return TargetLoweringBase::shouldFoldConstantShiftPairToMask(N, Level);
5502 }
5503 
5504 bool X86TargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const {
5505   EVT VT = Y.getValueType();
5506 
5507   // For vectors, we don't have a preference, but we probably want a mask.
5508   if (VT.isVector())
5509     return false;
5510 
5511   // 64-bit shifts on 32-bit targets produce really bad bloated code.
5512   if (VT == MVT::i64 && !Subtarget.is64Bit())
5513     return false;
5514 
5515   return true;
5516 }
5517 
5518 bool X86TargetLowering::shouldExpandShift(SelectionDAG &DAG,
5519                                           SDNode *N) const {
5520   if (DAG.getMachineFunction().getFunction().hasMinSize() &&
5521       !Subtarget.isOSWindows())
5522     return false;
5523   return true;
5524 }
5525 
5526 bool X86TargetLowering::shouldSplatInsEltVarIndex(EVT VT) const {
5527   // Any legal vector type can be splatted more efficiently than
5528   // loading/spilling from memory.
5529   return isTypeLegal(VT);
5530 }
5531 
5532 MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
5533   MVT VT = MVT::getIntegerVT(NumBits);
5534   if (isTypeLegal(VT))
5535     return VT;
5536 
5537   // PMOVMSKB can handle this.
5538   if (NumBits == 128 && isTypeLegal(MVT::v16i8))
5539     return MVT::v16i8;
5540 
5541   // VPMOVMSKB can handle this.
5542   if (NumBits == 256 && isTypeLegal(MVT::v32i8))
5543     return MVT::v32i8;
5544 
5545   // TODO: Allow 64-bit type for 32-bit target.
5546   // TODO: 512-bit types should be allowed, but make sure that those
5547   // cases are handled in combineVectorSizedSetCCEquality().
5548 
5549   return MVT::INVALID_SIMPLE_VALUE_TYPE;
5550 }
5551 
5552 /// Val is the undef sentinel value or equal to the specified value.
5553 static bool isUndefOrEqual(int Val, int CmpVal) {
5554   return ((Val == SM_SentinelUndef) || (Val == CmpVal));
5555 }
5556 
5557 /// Return true if every element in Mask is the undef sentinel value or equal to
5558 /// the specified value.
5559 static bool isUndefOrEqual(ArrayRef<int> Mask, int CmpVal) {
5560   return llvm::all_of(Mask, [CmpVal](int M) {
5561     return (M == SM_SentinelUndef) || (M == CmpVal);
5562   });
5563 }
5564 
5565 /// Val is either the undef or zero sentinel value.
5566 static bool isUndefOrZero(int Val) {
5567   return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
5568 }
5569 
5570 /// Return true if every element in Mask, beginning from position Pos and ending
5571 /// in Pos+Size is the undef sentinel value.
5572 static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
5573   return llvm::all_of(Mask.slice(Pos, Size),
5574                       [](int M) { return M == SM_SentinelUndef; });
5575 }
5576 
5577 /// Return true if the mask creates a vector whose lower half is undefined.
5578 static bool isUndefLowerHalf(ArrayRef<int> Mask) {
5579   unsigned NumElts = Mask.size();
5580   return isUndefInRange(Mask, 0, NumElts / 2);
5581 }
5582 
5583 /// Return true if the mask creates a vector whose upper half is undefined.
5584 static bool isUndefUpperHalf(ArrayRef<int> Mask) {
5585   unsigned NumElts = Mask.size();
5586   return isUndefInRange(Mask, NumElts / 2, NumElts / 2);
5587 }
5588 
5589 /// Return true if Val falls within the half-open range [Low, Hi).
5590 static bool isInRange(int Val, int Low, int Hi) {
5591   return (Val >= Low && Val < Hi);
5592 }
5593 
5594 /// Return true if the value of any element in Mask falls within the specified
5595 /// half-open range [Low, Hi).
5596 static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
5597   return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });
5598 }
5599 
5600 /// Return true if the value of any element in Mask is the zero sentinel value.
5601 static bool isAnyZero(ArrayRef<int> Mask) {
5602   return llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
5603 }
5604 
5605 /// Return true if the value of any element in Mask is the zero or undef
5606 /// sentinel values.
5607 static bool isAnyZeroOrUndef(ArrayRef<int> Mask) {
5608   return llvm::any_of(Mask, [](int M) {
5609     return M == SM_SentinelZero || M == SM_SentinelUndef;
5610   });
5611 }
5612 
5613 /// Return true if Val is undef or if its value falls within the
5614 /// specified half-open range [Low, Hi).
5615 static bool isUndefOrInRange(int Val, int Low, int Hi) {
5616   return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
5617 }
5618 
5619 /// Return true if every element in Mask is undef or if its value
5620 /// falls within the specified half-open range [Low, Hi).
5621 static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
5622   return llvm::all_of(
5623       Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); });
5624 }
5625 
5626 /// Return true if Val is undef, zero or if its value falls within the
5627 /// specified half-open range [Low, Hi).
5628 static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
5629   return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
5630 }
5631 
5632 /// Return true if every element in Mask is undef, zero or if its value
5633 /// falls within the specified half-open range [Low, Hi).
5634 static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
5635   return llvm::all_of(
5636       Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); });
5637 }
5638 
5639 /// Return true if every element in Mask, beginning
5640 /// from position Pos and ending in Pos + Size, falls within the specified
5641 /// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
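/// For example, Mask = <4, -1, 6, 7> with Pos = 0, Size = 4 and Low = 4
/// matches the sequence 4, 5, 6, 7 because the undef element is ignored.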
5642 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
5643                                        unsigned Size, int Low, int Step = 1) {
5644   for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
5645     if (!isUndefOrEqual(Mask[i], Low))
5646       return false;
5647   return true;
5648 }
5649 
5650 /// Return true if every element in Mask, beginning
5651 /// from position Pos and ending in Pos+Size, falls within the specified
5652 /// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step), or is undef or zero.
5653 static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
5654                                              unsigned Size, int Low,
5655                                              int Step = 1) {
5656   for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
5657     if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
5658       return false;
5659   return true;
5660 }
5661 
5662 /// Return true if every element in Mask, beginning
5663 /// from position Pos and ending in Pos+Size is undef or is zero.
5664 static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
5665                                  unsigned Size) {
5666   return llvm::all_of(Mask.slice(Pos, Size),
5667                       [](int M) { return isUndefOrZero(M); });
5668 }
5669 
5670 /// Helper function to test whether a shuffle mask could be
5671 /// simplified by widening the elements being shuffled.
5672 ///
5673 /// Appends the mask for wider elements in WidenedMask if valid. Otherwise
5674 /// leaves it in an unspecified state.
5675 ///
5676 /// NOTE: This must handle normal vector shuffle masks and *target* vector
5677 /// shuffle masks. The latter have the special property of a '-2' representing
5678 /// a zero-ed lane of a vector.
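/// For example, <0, 1, 6, 7> widens to <0, 3> and <0, 1, -1, -1> widens to
/// <0, SM_SentinelUndef>, whereas <1, 2, 3, 4> cannot be widened because its
/// pairs straddle two wide elements.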
5679 static bool canWidenShuffleElements(ArrayRef<int> Mask,
5680                                     SmallVectorImpl<int> &WidenedMask) {
5681   WidenedMask.assign(Mask.size() / 2, 0);
5682   for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
5683     int M0 = Mask[i];
5684     int M1 = Mask[i + 1];
5685 
5686     // If both elements are undef, it's trivial.
5687     if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
5688       WidenedMask[i / 2] = SM_SentinelUndef;
5689       continue;
5690     }
5691 
5692     // Check for an undef mask and a mask value properly aligned to fit with
5693     // a pair of values. If we find such a case, use the non-undef mask's value.
5694     if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
5695       WidenedMask[i / 2] = M1 / 2;
5696       continue;
5697     }
5698     if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
5699       WidenedMask[i / 2] = M0 / 2;
5700       continue;
5701     }
5702 
5703     // When zeroing, we need to spread the zeroing across both lanes to widen.
5704     if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
5705       if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
5706           (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
5707         WidenedMask[i / 2] = SM_SentinelZero;
5708         continue;
5709       }
5710       return false;
5711     }
5712 
5713     // Finally check if the two mask values are adjacent and aligned with
5714     // a pair.
5715     if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
5716       WidenedMask[i / 2] = M0 / 2;
5717       continue;
5718     }
5719 
5720     // Otherwise we can't safely widen the elements used in this shuffle.
5721     return false;
5722   }
5723   assert(WidenedMask.size() == Mask.size() / 2 &&
5724          "Incorrect size of mask after widening the elements!");
5725 
5726   return true;
5727 }
5728 
5729 static bool canWidenShuffleElements(ArrayRef<int> Mask,
5730                                     const APInt &Zeroable,
5731                                     bool V2IsZero,
5732                                     SmallVectorImpl<int> &WidenedMask) {
5733   // Create an alternative mask with info about zeroable elements.
5734   // Here we do not set undef elements as zeroable.
5735   SmallVector<int, 64> ZeroableMask(Mask.begin(), Mask.end());
5736   if (V2IsZero) {
5737     assert(!Zeroable.isNullValue() && "V2's non-undef elements are used?!");
5738     for (int i = 0, Size = Mask.size(); i != Size; ++i)
5739       if (Mask[i] != SM_SentinelUndef && Zeroable[i])
5740         ZeroableMask[i] = SM_SentinelZero;
5741   }
5742   return canWidenShuffleElements(ZeroableMask, WidenedMask);
5743 }
5744 
5745 static bool canWidenShuffleElements(ArrayRef<int> Mask) {
5746   SmallVector<int, 32> WidenedMask;
5747   return canWidenShuffleElements(Mask, WidenedMask);
5748 }
5749 
5750 // Attempt to narrow/widen shuffle mask until it matches the target number of
5751 // elements.
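// For example, scaling <0, 2, -1, 3> up to 8 elements gives
// <0, 1, 4, 5, -1, -1, 6, 7>, while <0, 1, 6, 7> scales down to <0, 3>; a mask
// such as <1, 2, 3, 4> cannot be widened and the function returns false.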
5752 static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts,
5753                                  SmallVectorImpl<int> &ScaledMask) {
5754   unsigned NumSrcElts = Mask.size();
5755   assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
5756          "Illegal shuffle scale factor");
5757 
5758   // Narrowing is guaranteed to work.
5759   if (NumDstElts >= NumSrcElts) {
5760     int Scale = NumDstElts / NumSrcElts;
5761     llvm::narrowShuffleMaskElts(Scale, Mask, ScaledMask);
5762     return true;
5763   }
5764 
5765   // We have to repeat the widening until we reach the target size, but we can
5766   // split out the first widening as it sets up ScaledMask for us.
5767   if (canWidenShuffleElements(Mask, ScaledMask)) {
5768     while (ScaledMask.size() > NumDstElts) {
5769       SmallVector<int, 16> WidenedMask;
5770       if (!canWidenShuffleElements(ScaledMask, WidenedMask))
5771         return false;
5772       ScaledMask = std::move(WidenedMask);
5773     }
5774     return true;
5775   }
5776 
5777   return false;
5778 }
5779 
5780 /// Returns true if Elt is a constant zero or a floating point constant +0.0.
5781 bool X86::isZeroNode(SDValue Elt) {
5782   return isNullConstant(Elt) || isNullFPConstant(Elt);
5783 }
5784 
5785 // Build a vector of constants.
5786 // Use an UNDEF node if MaskElt == -1.
5787 // Split 64-bit constants in the 32-bit mode.
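// For example, on a 32-bit target (where i64 is not legal) a v2i64 mask
// <1, -1> with IsMask set is built as the v4i32 vector <1, 0, undef, undef>
// and then bitcast back to v2i64.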
5788 static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
5789                               const SDLoc &dl, bool IsMask = false) {
5790 
5791   SmallVector<SDValue, 32>  Ops;
5792   bool Split = false;
5793 
5794   MVT ConstVecVT = VT;
5795   unsigned NumElts = VT.getVectorNumElements();
5796   bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
5797   if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
5798     ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
5799     Split = true;
5800   }
5801 
5802   MVT EltVT = ConstVecVT.getVectorElementType();
5803   for (unsigned i = 0; i < NumElts; ++i) {
5804     bool IsUndef = Values[i] < 0 && IsMask;
5805     SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
5806       DAG.getConstant(Values[i], dl, EltVT);
5807     Ops.push_back(OpNode);
5808     if (Split)
5809       Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
5810                     DAG.getConstant(0, dl, EltVT));
5811   }
5812   SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
5813   if (Split)
5814     ConstsNode = DAG.getBitcast(VT, ConstsNode);
5815   return ConstsNode;
5816 }
5817 
5818 static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
5819                               MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
5820   assert(Bits.size() == Undefs.getBitWidth() &&
5821          "Unequal constant and undef arrays");
5822   SmallVector<SDValue, 32> Ops;
5823   bool Split = false;
5824 
5825   MVT ConstVecVT = VT;
5826   unsigned NumElts = VT.getVectorNumElements();
5827   bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
5828   if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
5829     ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
5830     Split = true;
5831   }
5832 
5833   MVT EltVT = ConstVecVT.getVectorElementType();
5834   for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
5835     if (Undefs[i]) {
5836       Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
5837       continue;
5838     }
5839     const APInt &V = Bits[i];
5840     assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
5841     if (Split) {
5842       Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
5843       Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
5844     } else if (EltVT == MVT::f32) {
5845       APFloat FV(APFloat::IEEEsingle(), V);
5846       Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
5847     } else if (EltVT == MVT::f64) {
5848       APFloat FV(APFloat::IEEEdouble(), V);
5849       Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
5850     } else {
5851       Ops.push_back(DAG.getConstant(V, dl, EltVT));
5852     }
5853   }
5854 
5855   SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
5856   return DAG.getBitcast(VT, ConstsNode);
5857 }
5858 
5859 /// Returns a vector of specified type with all zero elements.
5860 static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
5861                              SelectionDAG &DAG, const SDLoc &dl) {
5862   assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
5863           VT.getVectorElementType() == MVT::i1) &&
5864          "Unexpected vector type");
5865 
5866   // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
5867   // type. This ensures they get CSE'd. But if the integer type is not
5868   // available, use a floating-point +0.0 instead.
5869   SDValue Vec;
5870   if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
5871     Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
5872   } else if (VT.isFloatingPoint()) {
5873     Vec = DAG.getConstantFP(+0.0, dl, VT);
5874   } else if (VT.getVectorElementType() == MVT::i1) {
5875     assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
5876            "Unexpected vector type");
5877     Vec = DAG.getConstant(0, dl, VT);
5878   } else {
5879     unsigned Num32BitElts = VT.getSizeInBits() / 32;
5880     Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
5881   }
5882   return DAG.getBitcast(VT, Vec);
5883 }
5884 
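/// Extract the vectorWidth-bit chunk of Vec that contains element IdxVal.
/// IdxVal is rounded down to a chunk boundary; if Vec is a BUILD_VECTOR the
/// corresponding smaller build vector is emitted directly instead of an
/// EXTRACT_SUBVECTOR node.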
static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
                                const SDLoc &dl, unsigned vectorWidth) {
  EVT VT = Vec.getValueType();
  EVT ElVT = VT.getVectorElementType();
  unsigned Factor = VT.getSizeInBits() / vectorWidth;
  EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
                                  VT.getVectorNumElements() / Factor);

  // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR.
  unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
  assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");

  // This is the index of the first element of the vectorWidth-bit chunk
  // we want. Since ElemsPerChunk is a power of 2, we just need to clear bits.
  IdxVal &= ~(ElemsPerChunk - 1);

  // If the input is a buildvector just emit a smaller one.
  if (Vec.getOpcode() == ISD::BUILD_VECTOR)
    return DAG.getBuildVector(ResultVT, dl,
                              Vec->ops().slice(IdxVal, ElemsPerChunk));

  SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
}

/// Generate a DAG to grab 128-bits from a vector > 128 bits.  This
/// sets things up to match an AVX VEXTRACTF128 / VEXTRACTI128
/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
/// instruction or a simple subregister reference. Idx is an index in the
/// 128 bits we want.  It need not be aligned to a 128-bit boundary.  That makes
/// lowering EXTRACT_VECTOR_ELT operations easier.
static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
                                   SelectionDAG &DAG, const SDLoc &dl) {
  assert((Vec.getValueType().is256BitVector() ||
          Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
  return extractSubVector(Vec, IdxVal, DAG, dl, 128);
}

/// Generate a DAG to grab 256-bits from a 512-bit vector.
static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
                                   SelectionDAG &DAG, const SDLoc &dl) {
  assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
  return extractSubVector(Vec, IdxVal, DAG, dl, 256);
}

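/// Insert the vectorWidth-bit subvector Vec into Result at the chunk-aligned
/// index derived from IdxVal via an INSERT_SUBVECTOR node.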
static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
                               SelectionDAG &DAG, const SDLoc &dl,
                               unsigned vectorWidth) {
  assert((vectorWidth == 128 || vectorWidth == 256) &&
         "Unsupported vector width");
  // Inserting an UNDEF subvector leaves Result unchanged.
  if (Vec.isUndef())
    return Result;
  EVT VT = Vec.getValueType();
  EVT ElVT = VT.getVectorElementType();
  EVT ResultVT = Result.getValueType();

  // Insert the relevant vectorWidth bits.
  unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
  assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");

  // This is the index of the first element of the vectorWidth-bit chunk
  // we want. Since ElemsPerChunk is a power of 2, we just need to clear bits.
  IdxVal &= ~(ElemsPerChunk - 1);

  SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
}

/// Generate a DAG to put 128-bits into a vector > 128 bits.  This
/// sets things up to match an AVX VINSERTF128/VINSERTI128 or
/// AVX-512 VINSERTF32x4/VINSERTI32x4 instruction or a
/// simple superregister reference.  Idx is an index in the 128 bits
/// we want.  It need not be aligned to a 128-bit boundary.  That makes
/// lowering INSERT_VECTOR_ELT operations easier.
static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
                                  SelectionDAG &DAG, const SDLoc &dl) {
  assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
  return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
}

/// Widen a vector to a larger size with the same scalar type, with the new
/// elements either zero or undef.
static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
                              const X86Subtarget &Subtarget, SelectionDAG &DAG,
                              const SDLoc &dl) {
  assert(Vec.getValueSizeInBits().getFixedSize() < VT.getFixedSizeInBits() &&
         Vec.getValueType().getScalarType() == VT.getScalarType() &&
         "Unsupported vector widening type");
  SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
                                : DAG.getUNDEF(VT);
  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, Vec,
                     DAG.getIntPtrConstant(0, dl));
}

/// Widen a vector to a larger size with the same scalar type, with the new
/// elements either zero or undef.
static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
                              const X86Subtarget &Subtarget, SelectionDAG &DAG,
                              const SDLoc &dl, unsigned WideSizeInBits) {
  assert(Vec.getValueSizeInBits() < WideSizeInBits &&
         (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&
         "Unsupported vector widening type");
  unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits();
  MVT SVT = Vec.getSimpleValueType().getScalarType();
  MVT VT = MVT::getVectorVT(SVT, WideNumElts);
  return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
}

// Helper function to collect subvector ops that are concatenated together,
// either by ISD::CONCAT_VECTORS or an ISD::INSERT_SUBVECTOR series.
// The subvectors in Ops are guaranteed to be the same type.
static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops) {
  assert(Ops.empty() && "Expected an empty ops vector");

  if (N->getOpcode() == ISD::CONCAT_VECTORS) {
    Ops.append(N->op_begin(), N->op_end());
    return true;
  }

  if (N->getOpcode() == ISD::INSERT_SUBVECTOR) {
    SDValue Src = N->getOperand(0);
    SDValue Sub = N->getOperand(1);
    const APInt &Idx = N->getConstantOperandAPInt(2);
    EVT VT = Src.getValueType();
    EVT SubVT = Sub.getValueType();

    // TODO - Handle more general insert_subvector chains.
    if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2) &&
        Idx == (VT.getVectorNumElements() / 2)) {
      // insert_subvector(insert_subvector(undef, x, lo), y, hi)
      if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
          Src.getOperand(1).getValueType() == SubVT &&
          isNullConstant(Src.getOperand(2))) {
        Ops.push_back(Src.getOperand(1));
        Ops.push_back(Sub);
        return true;
      }
      // insert_subvector(x, extract_subvector(x, lo), hi)
      if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
          Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {
        Ops.append(2, Sub);
        return true;
      }
    }
  }

  return false;
}

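/// Split a vector into two equally sized halves, returning the low and high
/// subvectors as a pair.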
static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG,
                                               const SDLoc &dl) {
  EVT VT = Op.getValueType();
  unsigned NumElems = VT.getVectorNumElements();
  unsigned SizeInBits = VT.getSizeInBits();
  assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&
         "Can't split odd sized vector");

  SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2);
  SDValue Hi = extractSubVector(Op, NumElems / 2, DAG, dl, SizeInBits / 2);
  return std::make_pair(Lo, Hi);
}

// Split a unary integer op into 2 half-sized ops.
static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
  EVT VT = Op.getValueType();

  // Make sure we only try to split 256/512-bit types to avoid creating
  // narrow vectors.
  assert((Op.getOperand(0).getValueType().is256BitVector() ||
          Op.getOperand(0).getValueType().is512BitVector()) &&
         (VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
  assert(Op.getOperand(0).getValueType().getVectorNumElements() ==
             VT.getVectorNumElements() &&
         "Unexpected VTs!");

  SDLoc dl(Op);

  // Extract the Lo/Hi vectors.
  SDValue Lo, Hi;
  std::tie(Lo, Hi) = splitVector(Op.getOperand(0), DAG, dl);

  EVT LoVT, HiVT;
  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
                     DAG.getNode(Op.getOpcode(), dl, LoVT, Lo),
                     DAG.getNode(Op.getOpcode(), dl, HiVT, Hi));
}

/// Break a binary integer operation into 2 half-sized ops and then
/// concatenate the result back.
static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG) {
  EVT VT = Op.getValueType();

  // Sanity check that all the types match.
  assert(Op.getOperand(0).getValueType() == VT &&
         Op.getOperand(1).getValueType() == VT && "Unexpected VTs!");
  assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");

  SDLoc dl(Op);

  // Extract the LHS Lo/Hi vectors.
  SDValue LHS1, LHS2;
  std::tie(LHS1, LHS2) = splitVector(Op.getOperand(0), DAG, dl);

  // Extract the RHS Lo/Hi vectors.
  SDValue RHS1, RHS2;
  std::tie(RHS1, RHS2) = splitVector(Op.getOperand(1), DAG, dl);

  EVT LoVT, HiVT;
  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
                     DAG.getNode(Op.getOpcode(), dl, LoVT, LHS1, RHS1),
                     DAG.getNode(Op.getOpcode(), dl, HiVT, LHS2, RHS2));
}

// Helper for splitting operands of an operation to legal target size and
// applying a function on each part.
// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
// 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
// deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
// The argument Builder is a function that will be applied on each split part:
// SDValue Builder(SelectionDAG &G, const SDLoc &DL, ArrayRef<SDValue> Ops)
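//
// An illustrative (hypothetical) use, splitting a two-operand node so that
// each piece stays within the legal register width:
//   auto AddBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
//                        ArrayRef<SDValue> Ops) {
//     return DAG.getNode(ISD::ADD, DL, Ops[0].getValueType(), Ops);
//   };
//   SDValue R = SplitOpsAndApply(DAG, Subtarget, DL, VT, {Op0, Op1},
//                                AddBuilder);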
template <typename F>
SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
                         const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
                         F Builder, bool CheckBWI = true) {
  assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
  unsigned NumSubs = 1;
  if ((CheckBWI && Subtarget.useBWIRegs()) ||
      (!CheckBWI && Subtarget.useAVX512Regs())) {
    if (VT.getSizeInBits() > 512) {
      NumSubs = VT.getSizeInBits() / 512;
      assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
    }
  } else if (Subtarget.hasAVX2()) {
    if (VT.getSizeInBits() > 256) {
      NumSubs = VT.getSizeInBits() / 256;
      assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
    }
  } else {
    if (VT.getSizeInBits() > 128) {
      NumSubs = VT.getSizeInBits() / 128;
      assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
    }
  }

  if (NumSubs == 1)
    return Builder(DAG, DL, Ops);

  SmallVector<SDValue, 4> Subs;
  for (unsigned i = 0; i != NumSubs; ++i) {
    SmallVector<SDValue, 2> SubOps;
    for (SDValue Op : Ops) {
      EVT OpVT = Op.getValueType();
      unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
      unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
      SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
    }
    Subs.push_back(Builder(DAG, DL, SubOps));
  }
  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
}

/// Insert an i1-subvector into an i1-vector.
static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
                                const X86Subtarget &Subtarget) {

  SDLoc dl(Op);
  SDValue Vec = Op.getOperand(0);
  SDValue SubVec = Op.getOperand(1);
  SDValue Idx = Op.getOperand(2);
  unsigned IdxVal = Op.getConstantOperandVal(2);

  // Inserting undef is a nop. We can just return the original vector.
  if (SubVec.isUndef())
    return Vec;

  if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
    return Op;

  MVT OpVT = Op.getSimpleValueType();
  unsigned NumElems = OpVT.getVectorNumElements();
  SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);

  // Extend to a natively supported kshift.
  MVT WideOpVT = OpVT;
  if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
    WideOpVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;

  // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
  // if necessary.
  if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
    // May need to promote to a legal type.
    Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
                     DAG.getConstant(0, dl, WideOpVT),
                     SubVec, Idx);
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
  }

  MVT SubVecVT = SubVec.getSimpleValueType();
  unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
  assert(IdxVal + SubVecNumElems <= NumElems &&
         IdxVal % SubVecVT.getSizeInBits() == 0 &&
         "Unexpected index value in INSERT_SUBVECTOR");

  SDValue Undef = DAG.getUNDEF(WideOpVT);

  if (IdxVal == 0) {
    // Zero the lower bits of Vec.
    SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
                      ZeroIdx);
    Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
    Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
    // Merge them together, SubVec should be zero extended.
    SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
                         DAG.getConstant(0, dl, WideOpVT),
                         SubVec, ZeroIdx);
    Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
  }

  SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
                       Undef, SubVec, ZeroIdx);

  if (Vec.isUndef()) {
    assert(IdxVal != 0 && "Unexpected index");
    SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
                         DAG.getTargetConstant(IdxVal, dl, MVT::i8));
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
  }

  if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
    assert(IdxVal != 0 && "Unexpected index");
    NumElems = WideOpVT.getVectorNumElements();
    unsigned ShiftLeft = NumElems - SubVecNumElems;
    unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
    SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
                         DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
    if (ShiftRight != 0)
      SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
                           DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
  }

  // Simple case when we put the subvector in the upper part.
  if (IdxVal + SubVecNumElems == NumElems) {
    SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
                         DAG.getTargetConstant(IdxVal, dl, MVT::i8));
    if (SubVecNumElems * 2 == NumElems) {
      // Special case, use legal zero extending insert_subvector. This allows
      // isel to optimize when bits are known zero.
      Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
      Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
                        DAG.getConstant(0, dl, WideOpVT),
                        Vec, ZeroIdx);
    } else {
      // Otherwise use explicit shifts to zero the bits.
      Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
                        Undef, Vec, ZeroIdx);
      NumElems = WideOpVT.getVectorNumElements();
      SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
      Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
      Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
    }
    Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
  }

  // Inserting into the middle is more complicated.

  NumElems = WideOpVT.getVectorNumElements();

  // Widen the vector if needed.
  Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);

  unsigned ShiftLeft = NumElems - SubVecNumElems;
  unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;

  // Do an optimization for the most frequently used types.
  if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
    APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
    Mask0.flipAllBits();
    SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
    SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
    Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
    SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
                         DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
    SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
                         DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
    Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);

    // Reduce to original width if needed.
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
  }

  // Clear the upper bits of the subvector and move it to its insert position.
  SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
                       DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
  SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
                       DAG.getTargetConstant(ShiftRight, dl, MVT::i8));

  // Isolate the bits below the insertion point.
  unsigned LowShift = NumElems - IdxVal;
  SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
                            DAG.getTargetConstant(LowShift, dl, MVT::i8));
  Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
                    DAG.getTargetConstant(LowShift, dl, MVT::i8));

  // Isolate the bits after the last inserted bit.
  unsigned HighShift = IdxVal + SubVecNumElems;
  SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
                             DAG.getTargetConstant(HighShift, dl, MVT::i8));
  High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
                     DAG.getTargetConstant(HighShift, dl, MVT::i8));

  // Now OR all 3 pieces together.
  Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
  SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);

  // Reduce to original width if needed.
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
}

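/// Concatenate two equally sized subvectors V1 and V2 into a single vector of
/// twice the width, with V1 in the low half and V2 in the high half.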
static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG,
                                const SDLoc &dl) {
  assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch");
  EVT SubVT = V1.getValueType();
  EVT SubSVT = SubVT.getScalarType();
  unsigned SubNumElts = SubVT.getVectorNumElements();
  unsigned SubVectorWidth = SubVT.getSizeInBits();
  EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts);
  SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth);
  return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth);
}

/// Returns a vector of specified type with all bits set.
/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
/// Then bitcast to their original type, ensuring they get CSE'd.
static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
         "Expected a 128/256/512-bit vector type");

  APInt Ones = APInt::getAllOnesValue(32);
  unsigned NumElts = VT.getSizeInBits() / 32;
  SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
  return DAG.getBitcast(VT, Vec);
}

// Convert *_EXTEND_VECTOR_INREG to *_EXTEND opcode.
static unsigned getOpcode_EXTEND(unsigned Opcode) {
  switch (Opcode) {
  case ISD::ANY_EXTEND:
  case ISD::ANY_EXTEND_VECTOR_INREG:
    return ISD::ANY_EXTEND;
  case ISD::ZERO_EXTEND:
  case ISD::ZERO_EXTEND_VECTOR_INREG:
    return ISD::ZERO_EXTEND;
  case ISD::SIGN_EXTEND:
  case ISD::SIGN_EXTEND_VECTOR_INREG:
    return ISD::SIGN_EXTEND;
  }
  llvm_unreachable("Unknown opcode");
}

// Convert *_EXTEND to *_EXTEND_VECTOR_INREG opcode.
static unsigned getOpcode_EXTEND_VECTOR_INREG(unsigned Opcode) {
  switch (Opcode) {
  case ISD::ANY_EXTEND:
  case ISD::ANY_EXTEND_VECTOR_INREG:
    return ISD::ANY_EXTEND_VECTOR_INREG;
  case ISD::ZERO_EXTEND:
  case ISD::ZERO_EXTEND_VECTOR_INREG:
    return ISD::ZERO_EXTEND_VECTOR_INREG;
  case ISD::SIGN_EXTEND:
  case ISD::SIGN_EXTEND_VECTOR_INREG:
    return ISD::SIGN_EXTEND_VECTOR_INREG;
  }
  llvm_unreachable("Unknown opcode");
}

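/// Create an any/sign/zero extension of In to the vector type VT. Only the low
/// subvector of In that supplies the extended elements is used, and the
/// *_EXTEND_VECTOR_INREG form is selected when the element counts of VT and In
/// differ.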
static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT,
                                      SDValue In, SelectionDAG &DAG) {
  EVT InVT = In.getValueType();
  assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
  assert((ISD::ANY_EXTEND == Opcode || ISD::SIGN_EXTEND == Opcode ||
          ISD::ZERO_EXTEND == Opcode) &&
         "Unknown extension opcode");

  // For 256-bit vectors, we only need the lower (128-bit) input half.
  // For 512-bit vectors, we only need the lower input half or quarter.
  if (InVT.getSizeInBits() > 128) {
    assert(VT.getSizeInBits() == InVT.getSizeInBits() &&
           "Expected VTs to be the same size!");
    unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
    In = extractSubVector(In, 0, DAG, DL,
                          std::max(128U, (unsigned)VT.getSizeInBits() / Scale));
    InVT = In.getValueType();
  }

  if (VT.getVectorNumElements() != InVT.getVectorNumElements())
    Opcode = getOpcode_EXTEND_VECTOR_INREG(Opcode);

  return DAG.getNode(Opcode, DL, VT, In);
}

// Match (xor X, -1) -> X.
// Match extract_subvector(xor X, -1) -> extract_subvector(X).
// Match concat_vectors(xor X, -1, xor Y, -1) -> concat_vectors(X, Y).
static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
  V = peekThroughBitcasts(V);
  if (V.getOpcode() == ISD::XOR &&
      ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()))
    return V.getOperand(0);
  if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
      (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
    if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
      Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), V.getValueType(),
                         Not, V.getOperand(1));
    }
  }
  SmallVector<SDValue, 2> CatOps;
  if (collectConcatOps(V.getNode(), CatOps)) {
    for (SDValue &CatOp : CatOps) {
      SDValue NotCat = IsNOT(CatOp, DAG);
      if (!NotCat) return SDValue();
      CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
    }
    return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), V.getValueType(), CatOps);
  }
  return SDValue();
}

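/// Create a shuffle mask matching an X86 PUNPCKL/PUNPCKH-style unpack of the
/// low (Lo = true) or high (Lo = false) half of each 128-bit lane, e.g. a
/// binary v8i16 low unpack gives <0, 8, 1, 9, 2, 10, 3, 11>.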
void llvm::createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask,
                                   bool Lo, bool Unary) {
  assert(VT.getScalarType().isSimple() && (VT.getSizeInBits() % 128) == 0 &&
         "Illegal vector type to unpack");
  assert(Mask.empty() && "Expected an empty shuffle mask vector");
  int NumElts = VT.getVectorNumElements();
  int NumEltsInLane = 128 / VT.getScalarSizeInBits();
  for (int i = 0; i < NumElts; ++i) {
    unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
    int Pos = (i % NumEltsInLane) / 2 + LaneStart;
    Pos += (Unary ? 0 : NumElts * (i % 2));
    Pos += (Lo ? 0 : NumEltsInLane / 2);
    Mask.push_back(Pos);
  }
}

/// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
/// imposed by AVX and specific to the unary pattern. Example:
/// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
/// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
void llvm::createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
                                   bool Lo) {
  assert(Mask.empty() && "Expected an empty shuffle mask vector");
  int NumElts = VT.getVectorNumElements();
  for (int i = 0; i < NumElts; ++i) {
    int Pos = i / 2;
    Pos += (Lo ? 0 : NumElts / 2);
    Mask.push_back(Pos);
  }
}

/// Returns a vector_shuffle node for an unpackl operation.
static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
                          SDValue V1, SDValue V2) {
  SmallVector<int, 8> Mask;
  createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
  return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
}

/// Returns a vector_shuffle node for an unpackh operation.
static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
                          SDValue V1, SDValue V2) {
  SmallVector<int, 8> Mask;
  createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
  return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
}

/// Return a vector_shuffle of the specified vector and a zero or undef vector.
/// This produces a shuffle where the low element of V2 is swizzled into the
/// zero/undef vector, landing at element Idx.
/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
                                           bool IsZero,
                                           const X86Subtarget &Subtarget,
                                           SelectionDAG &DAG) {
  MVT VT = V2.getSimpleValueType();
  SDValue V1 = IsZero
    ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
  int NumElems = VT.getVectorNumElements();
  SmallVector<int, 16> MaskVec(NumElems);
  for (int i = 0; i != NumElems; ++i)
    // If this is the insertion idx, put the low elt of V2 here.
    MaskVec[i] = (i == Idx) ? NumElems : i;
  return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
}

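/// If Ptr (possibly behind an X86ISD::Wrapper/WrapperRIP) addresses a regular
/// constant-pool entry with zero offset, return the underlying Constant,
/// otherwise return null.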
static const Constant *getTargetConstantFromBasePtr(SDValue Ptr) {
  if (Ptr.getOpcode() == X86ISD::Wrapper ||
      Ptr.getOpcode() == X86ISD::WrapperRIP)
    Ptr = Ptr.getOperand(0);

  auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
  if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
    return nullptr;

  return CNode->getConstVal();
}

static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
  if (!Load || !ISD::isNormalLoad(Load))
    return nullptr;
  return getTargetConstantFromBasePtr(Load->getBasePtr());
}

static const Constant *getTargetConstantFromNode(SDValue Op) {
  Op = peekThroughBitcasts(Op);
  return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op));
}

const Constant *
X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {
  assert(LD && "Unexpected null LoadSDNode");
  return getTargetConstantFromNode(LD);
}

// Extract raw constant bits from constant pools.
static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
                                          APInt &UndefElts,
                                          SmallVectorImpl<APInt> &EltBits,
                                          bool AllowWholeUndefs = true,
                                          bool AllowPartialUndefs = true) {
  assert(EltBits.empty() && "Expected an empty EltBits vector");

  Op = peekThroughBitcasts(Op);

  EVT VT = Op.getValueType();
  unsigned SizeInBits = VT.getSizeInBits();
  assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
  unsigned NumElts = SizeInBits / EltSizeInBits;

  // Bitcast a source array of element bits to the target size.
  auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
    unsigned NumSrcElts = UndefSrcElts.getBitWidth();
    unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
    assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
           "Constant bit sizes don't match");

    // Don't split if we don't allow undef bits.
    bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
    if (UndefSrcElts.getBoolValue() && !AllowUndefs)
      return false;

    // If we're already the right size, don't bother bitcasting.
    if (NumSrcElts == NumElts) {
      UndefElts = UndefSrcElts;
      EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
      return true;
    }

    // Extract all the undef/constant element data and pack into single bitsets.
    APInt UndefBits(SizeInBits, 0);
    APInt MaskBits(SizeInBits, 0);

    for (unsigned i = 0; i != NumSrcElts; ++i) {
      unsigned BitOffset = i * SrcEltSizeInBits;
      if (UndefSrcElts[i])
        UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
      MaskBits.insertBits(SrcEltBits[i], BitOffset);
    }

    // Split the undef/constant single bitset data into the target elements.
    UndefElts = APInt(NumElts, 0);
    EltBits.resize(NumElts, APInt(EltSizeInBits, 0));

    for (unsigned i = 0; i != NumElts; ++i) {
      unsigned BitOffset = i * EltSizeInBits;
      APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);

      // Only treat an element as UNDEF if all bits are UNDEF.
      if (UndefEltBits.isAllOnesValue()) {
        if (!AllowWholeUndefs)
          return false;
        UndefElts.setBit(i);
        continue;
      }

      // If only some bits are UNDEF then treat them as zero (or bail if not
      // supported).
      if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
        return false;

      EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
    }
    return true;
  };

  // Collect constant bits and insert into mask/undef bit masks.
  auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
                                unsigned UndefBitIndex) {
    if (!Cst)
      return false;
    if (isa<UndefValue>(Cst)) {
      Undefs.setBit(UndefBitIndex);
      return true;
    }
    if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
      Mask = CInt->getValue();
      return true;
    }
    if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
      Mask = CFP->getValueAPF().bitcastToAPInt();
      return true;
    }
    return false;
  };

  // Handle UNDEFs.
  if (Op.isUndef()) {
    APInt UndefSrcElts = APInt::getAllOnesValue(NumElts);
    SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
    return CastBitData(UndefSrcElts, SrcEltBits);
  }

  // Extract scalar constant bits.
  if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
    APInt UndefSrcElts = APInt::getNullValue(1);
    SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
    return CastBitData(UndefSrcElts, SrcEltBits);
  }
  if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
    APInt UndefSrcElts = APInt::getNullValue(1);
    APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
    SmallVector<APInt, 64> SrcEltBits(1, RawBits);
    return CastBitData(UndefSrcElts, SrcEltBits);
  }

  // Extract constant bits from build vector.
  if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
    unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
    unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;

    APInt UndefSrcElts(NumSrcElts, 0);
    SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
    for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
      const SDValue &Src = Op.getOperand(i);
      if (Src.isUndef()) {
        UndefSrcElts.setBit(i);
        continue;
      }
      auto *Cst = cast<ConstantSDNode>(Src);
      SrcEltBits[i] = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
    }
    return CastBitData(UndefSrcElts, SrcEltBits);
  }
  if (ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode())) {
    unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
    unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;

    APInt UndefSrcElts(NumSrcElts, 0);
    SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
    for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
      const SDValue &Src = Op.getOperand(i);
      if (Src.isUndef()) {
        UndefSrcElts.setBit(i);
        continue;
      }
      auto *Cst = cast<ConstantFPSDNode>(Src);
      APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
      SrcEltBits[i] = RawBits.zextOrTrunc(SrcEltSizeInBits);
    }
    return CastBitData(UndefSrcElts, SrcEltBits);
  }

  // Extract constant bits from constant pool vector.
  if (auto *Cst = getTargetConstantFromNode(Op)) {
    Type *CstTy = Cst->getType();
    unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
    if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
      return false;

    unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
    unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;

    APInt UndefSrcElts(NumSrcElts, 0);
    SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
    for (unsigned i = 0; i != NumSrcElts; ++i)
      if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
                               UndefSrcElts, i))
        return false;

    return CastBitData(UndefSrcElts, SrcEltBits);
  }

  // Extract constant bits from a broadcasted constant pool scalar.
  if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
      EltSizeInBits <= VT.getScalarSizeInBits()) {
    auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
    if (MemIntr->getMemoryVT().getScalarSizeInBits() != VT.getScalarSizeInBits())
      return false;

    SDValue Ptr = MemIntr->getBasePtr();
    if (const Constant *C = getTargetConstantFromBasePtr(Ptr)) {
      unsigned SrcEltSizeInBits = C->getType()->getScalarSizeInBits();
      unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;

      APInt UndefSrcElts(NumSrcElts, 0);
      SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
      if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
        if (UndefSrcElts[0])
          UndefSrcElts.setBits(0, NumSrcElts);
        SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
        return CastBitData(UndefSrcElts, SrcEltBits);
      }
    }
  }

  // Extract constant bits from a subvector broadcast.
  if (Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
    auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
    SDValue Ptr = MemIntr->getBasePtr();
    if (const Constant *Cst = getTargetConstantFromBasePtr(Ptr)) {
      Type *CstTy = Cst->getType();
      unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
      if (!CstTy->isVectorTy() || (SizeInBits % CstSizeInBits) != 0)
        return false;
      unsigned SubEltSizeInBits = CstTy->getScalarSizeInBits();
      unsigned NumSubElts = CstSizeInBits / SubEltSizeInBits;
      unsigned NumSubVecs = SizeInBits / CstSizeInBits;
      APInt UndefSubElts(NumSubElts, 0);
      SmallVector<APInt, 64> SubEltBits(NumSubElts * NumSubVecs,
                                        APInt(SubEltSizeInBits, 0));
      for (unsigned i = 0; i != NumSubElts; ++i) {
        if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i],
                                 UndefSubElts, i))
          return false;
        for (unsigned j = 1; j != NumSubVecs; ++j)
          SubEltBits[i + (j * NumSubElts)] = SubEltBits[i];
      }
      UndefSubElts = APInt::getSplat(NumSubVecs * UndefSubElts.getBitWidth(),
                                     UndefSubElts);
      return CastBitData(UndefSubElts, SubEltBits);
    }
  }

  // Extract a rematerialized scalar constant insertion.
  if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
      Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
      isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
    unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
    unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;

    APInt UndefSrcElts(NumSrcElts, 0);
    SmallVector<APInt, 64> SrcEltBits;
    auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
    SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits));
    SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
    return CastBitData(UndefSrcElts, SrcEltBits);
  }

  // Insert constant bits from a base and sub vector sources.
  if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) {
    // If we bitcast to larger elements we might lose track of undefs, so
    // don't allow any to be safe.
    unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
    bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits;

    APInt UndefSrcElts, UndefSubElts;
    SmallVector<APInt, 32> EltSrcBits, EltSubBits;
    if (getTargetConstantBitsFromNode(Op.getOperand(1), SrcEltSizeInBits,
                                      UndefSubElts, EltSubBits,
                                      AllowWholeUndefs && AllowUndefs,
                                      AllowPartialUndefs && AllowUndefs) &&
        getTargetConstantBitsFromNode(Op.getOperand(0), SrcEltSizeInBits,
                                      UndefSrcElts, EltSrcBits,
                                      AllowWholeUndefs && AllowUndefs,
                                      AllowPartialUndefs && AllowUndefs)) {
      unsigned BaseIdx = Op.getConstantOperandVal(2);
      UndefSrcElts.insertBits(UndefSubElts, BaseIdx);
      for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
        EltSrcBits[BaseIdx + i] = EltSubBits[i];
      return CastBitData(UndefSrcElts, EltSrcBits);
    }
  }

  // Extract constant bits from a subvector's source.
  if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
    // TODO - support extract_subvector through bitcasts.
    if (EltSizeInBits != VT.getScalarSizeInBits())
      return false;

    if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
                                      UndefElts, EltBits, AllowWholeUndefs,
                                      AllowPartialUndefs)) {
      EVT SrcVT = Op.getOperand(0).getValueType();
      unsigned NumSrcElts = SrcVT.getVectorNumElements();
      unsigned NumSubElts = VT.getVectorNumElements();
      unsigned BaseIdx = Op.getConstantOperandVal(1);
      UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
      if ((BaseIdx + NumSubElts) != NumSrcElts)
        EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
      if (BaseIdx != 0)
        EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
      return true;
    }
  }

  // Extract constant bits from shuffle node sources.
  if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {
    // TODO - support shuffle through bitcasts.
    if (EltSizeInBits != VT.getScalarSizeInBits())
      return false;

    ArrayRef<int> Mask = SVN->getMask();
    if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
        llvm::any_of(Mask, [](int M) { return M < 0; }))
      return false;

    APInt UndefElts0, UndefElts1;
    SmallVector<APInt, 32> EltBits0, EltBits1;
    if (isAnyInRange(Mask, 0, NumElts) &&
        !getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
                                       UndefElts0, EltBits0, AllowWholeUndefs,
                                       AllowPartialUndefs))
      return false;
    if (isAnyInRange(Mask, NumElts, 2 * NumElts) &&
        !getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
                                       UndefElts1, EltBits1, AllowWholeUndefs,
                                       AllowPartialUndefs))
      return false;

    UndefElts = APInt::getNullValue(NumElts);
    for (int i = 0; i != (int)NumElts; ++i) {
      int M = Mask[i];
      if (M < 0) {
        UndefElts.setBit(i);
        EltBits.push_back(APInt::getNullValue(EltSizeInBits));
      } else if (M < (int)NumElts) {
        if (UndefElts0[M])
          UndefElts.setBit(i);
        EltBits.push_back(EltBits0[M]);
      } else {
        if (UndefElts1[M - NumElts])
          UndefElts.setBit(i);
        EltBits.push_back(EltBits1[M - NumElts]);
      }
    }
    return true;
  }

  return false;
}

namespace llvm {
namespace X86 {
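/// Return true if Op is a constant whose defined elements all have the same
/// value, and store that value in SplatVal. Undef elements are skipped
/// (subject to AllowPartialUndefs).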
bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) {
  APInt UndefElts;
  SmallVector<APInt, 16> EltBits;
  if (getTargetConstantBitsFromNode(Op, Op.getScalarValueSizeInBits(),
                                    UndefElts, EltBits, true,
                                    AllowPartialUndefs)) {
    int SplatIndex = -1;
    for (int i = 0, e = EltBits.size(); i != e; ++i) {
      if (UndefElts[i])
        continue;
      if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
        SplatIndex = -1;
        break;
      }
      SplatIndex = i;
    }
    if (0 <= SplatIndex) {
      SplatVal = EltBits[SplatIndex];
      return true;
    }
  }

  return false;
}
} // namespace X86
} // namespace llvm

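/// Decode a constant shuffle mask operand into its raw (zero-extended) element
/// values plus an undef-element mask. Returns false if the constant bits
/// cannot be extracted.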
static bool getTargetShuffleMaskIndices(SDValue MaskNode,
                                        unsigned MaskEltSizeInBits,
                                        SmallVectorImpl<uint64_t> &RawMask,
                                        APInt &UndefElts) {
  // Extract the raw target constant bits.
  SmallVector<APInt, 64> EltBits;
  if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
                                     EltBits, /* AllowWholeUndefs */ true,
                                     /* AllowPartialUndefs */ false))
    return false;

  // Insert the extracted elements into the mask.
  for (const APInt &Elt : EltBits)
    RawMask.push_back(Elt.getZExtValue());

  return true;
}

/// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
/// A multi-stage pack shuffle mask is created by specifying NumStages > 1.
/// Note: This ignores saturation, so inputs must be checked first.
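/// e.g. a single-stage binary v8i16 pack gives <0, 2, 4, 6, 8, 10, 12, 14>.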
static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
                                  bool Unary, unsigned NumStages = 1) {
  assert(Mask.empty() && "Expected an empty shuffle mask vector");
  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumLanes = VT.getSizeInBits() / 128;
  unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
  unsigned Offset = Unary ? 0 : NumElts;
  unsigned Repetitions = 1u << (NumStages - 1);
  unsigned Increment = 1u << NumStages;
  assert((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction");

  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
    for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
      for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
        Mask.push_back(Elt + (Lane * NumEltsPerLane));
      for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
        Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
    }
  }
}

// Split the demanded elts of a PACKSS/PACKUS node between its operands.
static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
                                APInt &DemandedLHS, APInt &DemandedRHS) {
  int NumLanes = VT.getSizeInBits() / 128;
  int NumElts = DemandedElts.getBitWidth();
  int NumInnerElts = NumElts / 2;
  int NumEltsPerLane = NumElts / NumLanes;
  int NumInnerEltsPerLane = NumInnerElts / NumLanes;

  DemandedLHS = APInt::getNullValue(NumInnerElts);
  DemandedRHS = APInt::getNullValue(NumInnerElts);

  // Map DemandedElts to the packed operands.
  for (int Lane = 0; Lane != NumLanes; ++Lane) {
    for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
      int OuterIdx = (Lane * NumEltsPerLane) + Elt;
      int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
      if (DemandedElts[OuterIdx])
        DemandedLHS.setBit(InnerIdx);
      if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
        DemandedRHS.setBit(InnerIdx);
    }
  }
}

// Split the demanded elts of a HADD/HSUB node between its operands.
static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
                                 APInt &DemandedLHS, APInt &DemandedRHS) {
  int NumLanes = VT.getSizeInBits() / 128;
  int NumElts = DemandedElts.getBitWidth();
  int NumEltsPerLane = NumElts / NumLanes;
  int HalfEltsPerLane = NumEltsPerLane / 2;

  DemandedLHS = APInt::getNullValue(NumElts);
  DemandedRHS = APInt::getNullValue(NumElts);

  // Map DemandedElts to the horizontal operands.
  for (int Idx = 0; Idx != NumElts; ++Idx) {
    if (!DemandedElts[Idx])
      continue;
    int LaneIdx = (Idx / NumEltsPerLane) * NumEltsPerLane;
    int LocalIdx = Idx % NumEltsPerLane;
    if (LocalIdx < HalfEltsPerLane) {
      DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 0);
      DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 1);
    } else {
      LocalIdx -= HalfEltsPerLane;
      DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 0);
      DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 1);
    }
  }
}

/// Calculates the shuffle mask corresponding to the target-specific opcode.
/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
/// operands in \p Ops, and returns true.
/// Sets \p IsUnary to true if only one source is used. Note that this will set
/// IsUnary for shuffles which use a single input multiple times, and in those
/// cases it will adjust the mask to only have indices within that single input.
/// It is an error to call this with non-empty Mask/Ops vectors.
static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
                                 SmallVectorImpl<SDValue> &Ops,
                                 SmallVectorImpl<int> &Mask, bool &IsUnary) {
  unsigned NumElems = VT.getVectorNumElements();
  unsigned MaskEltSize = VT.getScalarSizeInBits();
  SmallVector<uint64_t, 32> RawMask;
  APInt RawUndefs;
  uint64_t ImmN;

  assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
  assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");

  IsUnary = false;
  bool IsFakeUnary = false;
  switch (N->getOpcode()) {
  case X86ISD::BLENDI:
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
    ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
    DecodeBLENDMask(NumElems, ImmN, Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::SHUFP:
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
    ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
    DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::INSERTPS:
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
    ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
    DecodeINSERTPSMask(ImmN, Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::EXTRQI:
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
    if (isa<ConstantSDNode>(N->getOperand(1)) &&
        isa<ConstantSDNode>(N->getOperand(2))) {
      int BitLen = N->getConstantOperandVal(1);
      int BitIdx = N->getConstantOperandVal(2);
      DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
      IsUnary = true;
    }
    break;
  case X86ISD::INSERTQI:
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
    if (isa<ConstantSDNode>(N->getOperand(2)) &&
        isa<ConstantSDNode>(N->getOperand(3))) {
      int BitLen = N->getConstantOperandVal(2);
      int BitIdx = N->getConstantOperandVal(3);
      DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
      IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    }
    break;
  case X86ISD::UNPCKH:
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
    DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::UNPCKL:
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
    DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::MOVHLPS:
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
    DecodeMOVHLPSMask(NumElems, Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::MOVLHPS:
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
    DecodeMOVLHPSMask(NumElems, Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::VALIGN:
    assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
           "Only 32-bit and 64-bit elements are supported!");
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
    ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
    DecodeVALIGNMask(NumElems, ImmN, Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    Ops.push_back(N->getOperand(1));
    Ops.push_back(N->getOperand(0));
    break;
  case X86ISD::PALIGNR:
    assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
    ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
    DecodePALIGNRMask(NumElems, ImmN, Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    Ops.push_back(N->getOperand(1));
    Ops.push_back(N->getOperand(0));
    break;
  case X86ISD::VSHLDQ:
    assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
    ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
    DecodePSLLDQMask(NumElems, ImmN, Mask);
    IsUnary = true;
    break;
  case X86ISD::VSRLDQ:
    assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
    ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
    DecodePSRLDQMask(NumElems, ImmN, Mask);
    IsUnary = true;
    break;
  case X86ISD::PSHUFD:
  case X86ISD::VPERMILPI:
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
    ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
    DecodePSHUFMask(NumElems, MaskEltSize, ImmN, Mask);
    IsUnary = true;
    break;
  case X86ISD::PSHUFHW:
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
    ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
    DecodePSHUFHWMask(NumElems, ImmN, Mask);
    IsUnary = true;
    break;
  case X86ISD::PSHUFLW:
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7104     ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7105     DecodePSHUFLWMask(NumElems, ImmN, Mask);
7106     IsUnary = true;
7107     break;
7108   case X86ISD::VZEXT_MOVL:
7109     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7110     DecodeZeroMoveLowMask(NumElems, Mask);
7111     IsUnary = true;
7112     break;
7113   case X86ISD::VBROADCAST:
7114     // We only decode broadcasts of same-sized vectors; peeking through to
7115     // extracted subvectors is likely to cause hasOneUse issues with
7116     // SimplifyDemandedBits etc.
7117     if (N->getOperand(0).getValueType() == VT) {
7118       DecodeVectorBroadcast(NumElems, Mask);
7119       IsUnary = true;
7120       break;
7121     }
7122     return false;
7123   case X86ISD::VPERMILPV: {
7124     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7125     IsUnary = true;
7126     SDValue MaskNode = N->getOperand(1);
7127     if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7128                                     RawUndefs)) {
7129       DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
7130       break;
7131     }
7132     return false;
7133   }
7134   case X86ISD::PSHUFB: {
7135     assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7136     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7137     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7138     IsUnary = true;
7139     SDValue MaskNode = N->getOperand(1);
7140     if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
7141       DecodePSHUFBMask(RawMask, RawUndefs, Mask);
7142       break;
7143     }
7144     return false;
7145   }
7146   case X86ISD::VPERMI:
7147     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7148     ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7149     DecodeVPERMMask(NumElems, ImmN, Mask);
7150     IsUnary = true;
7151     break;
7152   case X86ISD::MOVSS:
7153   case X86ISD::MOVSD:
7154     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7155     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7156     DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
7157     break;
7158   case X86ISD::VPERM2X128:
7159     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7160     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7161     ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7162     DecodeVPERM2X128Mask(NumElems, ImmN, Mask);
7163     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7164     break;
7165   case X86ISD::SHUF128:
7166     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7167     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7168     ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7169     decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, Mask);
7170     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7171     break;
7172   case X86ISD::MOVSLDUP:
7173     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7174     DecodeMOVSLDUPMask(NumElems, Mask);
7175     IsUnary = true;
7176     break;
7177   case X86ISD::MOVSHDUP:
7178     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7179     DecodeMOVSHDUPMask(NumElems, Mask);
7180     IsUnary = true;
7181     break;
7182   case X86ISD::MOVDDUP:
7183     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7184     DecodeMOVDDUPMask(NumElems, Mask);
7185     IsUnary = true;
7186     break;
7187   case X86ISD::VPERMIL2: {
7188     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7189     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7190     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7191     SDValue MaskNode = N->getOperand(2);
7192     SDValue CtrlNode = N->getOperand(3);
7193     if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
7194       unsigned CtrlImm = CtrlOp->getZExtValue();
7195       if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7196                                       RawUndefs)) {
7197         DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
7198                             Mask);
7199         break;
7200       }
7201     }
7202     return false;
7203   }
7204   case X86ISD::VPPERM: {
7205     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7206     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7207     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7208     SDValue MaskNode = N->getOperand(2);
7209     if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
7210       DecodeVPPERMMask(RawMask, RawUndefs, Mask);
7211       break;
7212     }
7213     return false;
7214   }
7215   case X86ISD::VPERMV: {
7216     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7217     IsUnary = true;
7218     // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
7219     Ops.push_back(N->getOperand(1));
7220     SDValue MaskNode = N->getOperand(0);
7221     if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7222                                     RawUndefs)) {
7223       DecodeVPERMVMask(RawMask, RawUndefs, Mask);
7224       break;
7225     }
7226     return false;
7227   }
7228   case X86ISD::VPERMV3: {
7229     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7230     assert(N->getOperand(2).getValueType() == VT && "Unexpected value type");
7231     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
7232     // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
7233     Ops.push_back(N->getOperand(0));
7234     Ops.push_back(N->getOperand(2));
7235     SDValue MaskNode = N->getOperand(1);
7236     if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7237                                     RawUndefs)) {
7238       DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
7239       break;
7240     }
7241     return false;
7242   }
7243   default: llvm_unreachable("unknown target shuffle node");
7244   }
7245 
7246   // Empty mask indicates the decode failed.
7247   if (Mask.empty())
7248     return false;
7249 
7250   // Check if we're getting a shuffle mask with zeroed elements.
7251   if (!AllowSentinelZero && isAnyZero(Mask))
7252     return false;
7253 
7254   // If we have a fake unary shuffle, the shuffle mask is spread across two
7255   // inputs that are actually the same node. Re-map the mask to always point
7256   // into the first input.
7257   if (IsFakeUnary)
7258     for (int &M : Mask)
7259       if (M >= (int)Mask.size())
7260         M -= Mask.size();
7261 
7262   // If we didn't already add operands in the opcode-specific code, default to
7263   // adding 1 or 2 operands starting at 0.
7264   if (Ops.empty()) {
7265     Ops.push_back(N->getOperand(0));
7266     if (!IsUnary || IsFakeUnary)
7267       Ops.push_back(N->getOperand(1));
7268   }
7269 
7270   return true;
7271 }
7272 
7273 // Wrapper for getTargetShuffleMask that discards the IsUnary result.
7274 static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
7275                                  SmallVectorImpl<SDValue> &Ops,
7276                                  SmallVectorImpl<int> &Mask) {
7277   bool IsUnary;
7278   return getTargetShuffleMask(N, VT, AllowSentinelZero, Ops, Mask, IsUnary);
7279 }
7280 
7281 /// Compute whether each element of a shuffle is zeroable.
7282 ///
7283 /// A "zeroable" vector shuffle element is one which can be lowered to zero.
7284 /// Either it is an undef element in the shuffle mask, the element of the input
7285 /// referenced is undef, or the element of the input referenced is known to be
7286 /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
7287 /// as many lanes with this technique as possible to simplify the remaining
7288 /// shuffle.
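/// For example (illustrative): with Mask = {0, 5, 2, -1} and V2 a build vector
/// of all zeros, element 1 is reported in KnownZero (it reads lane 1 of the
/// zero vector V2) and element 3 in KnownUndef (negative mask sentinel).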
7289 static void computeZeroableShuffleElements(ArrayRef<int> Mask,
7290                                            SDValue V1, SDValue V2,
7291                                            APInt &KnownUndef, APInt &KnownZero) {
7292   int Size = Mask.size();
7293   KnownUndef = KnownZero = APInt::getNullValue(Size);
7294 
7295   V1 = peekThroughBitcasts(V1);
7296   V2 = peekThroughBitcasts(V2);
7297 
7298   bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
7299   bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
7300 
7301   int VectorSizeInBits = V1.getValueSizeInBits();
7302   int ScalarSizeInBits = VectorSizeInBits / Size;
7303   assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
7304 
7305   for (int i = 0; i < Size; ++i) {
7306     int M = Mask[i];
7307     // Handle the easy cases.
7308     if (M < 0) {
7309       KnownUndef.setBit(i);
7310       continue;
7311     }
7312     if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
7313       KnownZero.setBit(i);
7314       continue;
7315     }
7316 
7317     // Determine shuffle input and normalize the mask.
7318     SDValue V = M < Size ? V1 : V2;
7319     M %= Size;
7320 
7321     // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
7322     if (V.getOpcode() != ISD::BUILD_VECTOR)
7323       continue;
7324 
7325     // If the BUILD_VECTOR has fewer elements than the shuffle mask, then the
7326     // bitcasted portion of the (larger) source element must be UNDEF/ZERO.
7327     if ((Size % V.getNumOperands()) == 0) {
7328       int Scale = Size / V->getNumOperands();
7329       SDValue Op = V.getOperand(M / Scale);
7330       if (Op.isUndef())
7331         KnownUndef.setBit(i);
7332       if (X86::isZeroNode(Op))
7333         KnownZero.setBit(i);
7334       else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
7335         APInt Val = Cst->getAPIntValue();
7336         Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
7337         if (Val == 0)
7338           KnownZero.setBit(i);
7339       } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
7340         APInt Val = Cst->getValueAPF().bitcastToAPInt();
7341         Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
7342         if (Val == 0)
7343           KnownZero.setBit(i);
7344       }
7345       continue;
7346     }
7347 
7348     // If the BUILD_VECTOR has more elements than the shuffle mask, then all
7349     // the (smaller) source elements must be UNDEF or ZERO.
7350     if ((V.getNumOperands() % Size) == 0) {
7351       int Scale = V->getNumOperands() / Size;
7352       bool AllUndef = true;
7353       bool AllZero = true;
7354       for (int j = 0; j < Scale; ++j) {
7355         SDValue Op = V.getOperand((M * Scale) + j);
7356         AllUndef &= Op.isUndef();
7357         AllZero &= X86::isZeroNode(Op);
7358       }
7359       if (AllUndef)
7360         KnownUndef.setBit(i);
7361       if (AllZero)
7362         KnownZero.setBit(i);
7363       continue;
7364     }
7365   }
7366 }
7367 
7368 /// Decode a target shuffle mask and inputs and see if any values are
7369 /// known to be undef or zero from their inputs.
7370 /// Returns true if the target shuffle mask was decoded.
7371 /// FIXME: Merge this with computeZeroableShuffleElements?
7372 static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
7373                                          SmallVectorImpl<SDValue> &Ops,
7374                                          APInt &KnownUndef, APInt &KnownZero) {
7375   bool IsUnary;
7376   if (!isTargetShuffle(N.getOpcode()))
7377     return false;
7378 
7379   MVT VT = N.getSimpleValueType();
7380   if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
7381     return false;
7382 
7383   int Size = Mask.size();
7384   SDValue V1 = Ops[0];
7385   SDValue V2 = IsUnary ? V1 : Ops[1];
7386   KnownUndef = KnownZero = APInt::getNullValue(Size);
7387 
7388   V1 = peekThroughBitcasts(V1);
7389   V2 = peekThroughBitcasts(V2);
7390 
7391   assert((VT.getSizeInBits() % Size) == 0 &&
7392          "Illegal split of shuffle value type");
7393   unsigned EltSizeInBits = VT.getSizeInBits() / Size;
7394 
7395   // Extract known constant input data.
7396   APInt UndefSrcElts[2];
7397   SmallVector<APInt, 32> SrcEltBits[2];
7398   bool IsSrcConstant[2] = {
7399       getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
7400                                     SrcEltBits[0], true, false),
7401       getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
7402                                     SrcEltBits[1], true, false)};
7403 
7404   for (int i = 0; i < Size; ++i) {
7405     int M = Mask[i];
7406 
7407     // Already decoded as SM_SentinelZero / SM_SentinelUndef.
7408     if (M < 0) {
7409       assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!");
7410       if (SM_SentinelUndef == M)
7411         KnownUndef.setBit(i);
7412       if (SM_SentinelZero == M)
7413         KnownZero.setBit(i);
7414       continue;
7415     }
7416 
7417     // Determine shuffle input and normalize the mask.
7418     unsigned SrcIdx = M / Size;
7419     SDValue V = M < Size ? V1 : V2;
7420     M %= Size;
7421 
7422     // We are referencing an UNDEF input.
7423     if (V.isUndef()) {
7424       KnownUndef.setBit(i);
7425       continue;
7426     }
7427 
7428     // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
7429     // TODO: We currently only set UNDEF for integer types - floats use the same
7430     // registers as vectors and many of the scalar folded loads rely on the
7431     // SCALAR_TO_VECTOR pattern.
7432     if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
7433         (Size % V.getValueType().getVectorNumElements()) == 0) {
7434       int Scale = Size / V.getValueType().getVectorNumElements();
7435       int Idx = M / Scale;
7436       if (Idx != 0 && !VT.isFloatingPoint())
7437         KnownUndef.setBit(i);
7438       else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
7439         KnownZero.setBit(i);
7440       continue;
7441     }
7442 
7443     // INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF
7444     // base vectors.
7445     if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
7446       SDValue Vec = V.getOperand(0);
7447       int NumVecElts = Vec.getValueType().getVectorNumElements();
7448       if (Vec.isUndef() && Size == NumVecElts) {
7449         int Idx = V.getConstantOperandVal(2);
7450         int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements();
7451         if (M < Idx || (Idx + NumSubElts) <= M)
7452           KnownUndef.setBit(i);
7453       }
7454       continue;
7455     }
7456 
7457     // Attempt to extract from the source's constant bits.
7458     if (IsSrcConstant[SrcIdx]) {
7459       if (UndefSrcElts[SrcIdx][M])
7460         KnownUndef.setBit(i);
7461       else if (SrcEltBits[SrcIdx][M] == 0)
7462         KnownZero.setBit(i);
7463     }
7464   }
7465 
7466   assert(VT.getVectorNumElements() == (unsigned)Size &&
7467          "Different mask size from vector size!");
7468   return true;
7469 }
7470 
7471 // Replace target shuffle mask elements with known undef/zero sentinels.
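// For example (illustrative): Mask = {0, 1, 2, 3} with KnownUndef bit 0 and
// KnownZero bit 2 set becomes {SM_SentinelUndef, 1, SM_SentinelZero, 3} when
// ResolveKnownZeros is true.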
7472 static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
7473                                               const APInt &KnownUndef,
7474                                               const APInt &KnownZero,
7475                                               bool ResolveKnownZeros = true) {
7476   unsigned NumElts = Mask.size();
7477   assert(KnownUndef.getBitWidth() == NumElts &&
7478          KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");
7479 
7480   for (unsigned i = 0; i != NumElts; ++i) {
7481     if (KnownUndef[i])
7482       Mask[i] = SM_SentinelUndef;
7483     else if (ResolveKnownZeros && KnownZero[i])
7484       Mask[i] = SM_SentinelZero;
7485   }
7486 }
7487 
7488 // Extract target shuffle mask sentinel elements to known undef/zero bitmasks.
7489 static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask,
7490                                               APInt &KnownUndef,
7491                                               APInt &KnownZero) {
7492   unsigned NumElts = Mask.size();
7493   KnownUndef = KnownZero = APInt::getNullValue(NumElts);
7494 
7495   for (unsigned i = 0; i != NumElts; ++i) {
7496     int M = Mask[i];
7497     if (SM_SentinelUndef == M)
7498       KnownUndef.setBit(i);
7499     if (SM_SentinelZero == M)
7500       KnownZero.setBit(i);
7501   }
7502 }
7503 
7504 // Forward declaration (for getFauxShuffleMask recursive check).
7505 // TODO: Use DemandedElts variant.
7506 static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
7507                                    SmallVectorImpl<int> &Mask,
7508                                    const SelectionDAG &DAG, unsigned Depth,
7509                                    bool ResolveKnownElts);
7510 
7511 // Attempt to decode ops that could be represented as a shuffle mask.
7512 // The decoded shuffle mask may contain a different number of elements than
7513 // the destination value type.
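// For example (illustrative): a v2i64 X86ISD::VSHLI by 8 bits is decoded below
// as the 16-element byte mask {Z,0,1,2,3,4,5,6, Z,8,9,10,11,12,13,14} with
// Z = SM_SentinelZero, i.e. more mask elements than the 2 destination lanes.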
7514 static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
7515                                SmallVectorImpl<int> &Mask,
7516                                SmallVectorImpl<SDValue> &Ops,
7517                                const SelectionDAG &DAG, unsigned Depth,
7518                                bool ResolveKnownElts) {
7519   Mask.clear();
7520   Ops.clear();
7521 
7522   MVT VT = N.getSimpleValueType();
7523   unsigned NumElts = VT.getVectorNumElements();
7524   unsigned NumSizeInBits = VT.getSizeInBits();
7525   unsigned NumBitsPerElt = VT.getScalarSizeInBits();
7526   if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
7527     return false;
7528   assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
7529   unsigned NumSizeInBytes = NumSizeInBits / 8;
7530   unsigned NumBytesPerElt = NumBitsPerElt / 8;
7531 
7532   unsigned Opcode = N.getOpcode();
7533   switch (Opcode) {
7534   case ISD::VECTOR_SHUFFLE: {
7535     // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
7536     ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
7537     if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
7538       Mask.append(ShuffleMask.begin(), ShuffleMask.end());
7539       Ops.push_back(N.getOperand(0));
7540       Ops.push_back(N.getOperand(1));
7541       return true;
7542     }
7543     return false;
7544   }
7545   case ISD::AND:
7546   case X86ISD::ANDNP: {
7547     // Attempt to decode as a per-byte mask.
7548     APInt UndefElts;
7549     SmallVector<APInt, 32> EltBits;
7550     SDValue N0 = N.getOperand(0);
7551     SDValue N1 = N.getOperand(1);
7552     bool IsAndN = (X86ISD::ANDNP == Opcode);
7553     uint64_t ZeroMask = IsAndN ? 255 : 0;
7554     if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
7555       return false;
7556     for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
7557       if (UndefElts[i]) {
7558         Mask.push_back(SM_SentinelUndef);
7559         continue;
7560       }
7561       const APInt &ByteBits = EltBits[i];
7562       if (ByteBits != 0 && ByteBits != 255)
7563         return false;
7564       Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
7565     }
7566     Ops.push_back(IsAndN ? N1 : N0);
7567     return true;
7568   }
7569   case ISD::OR: {
7570     // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
7571     // is a valid shuffle index.
7572     SDValue N0 = peekThroughBitcasts(N.getOperand(0));
7573     SDValue N1 = peekThroughBitcasts(N.getOperand(1));
7574     if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
7575       return false;
7576     SmallVector<int, 64> SrcMask0, SrcMask1;
7577     SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
7578     if (!getTargetShuffleInputs(N0, SrcInputs0, SrcMask0, DAG, Depth + 1,
7579                                 true) ||
7580         !getTargetShuffleInputs(N1, SrcInputs1, SrcMask1, DAG, Depth + 1,
7581                                 true))
7582       return false;
7583 
7584     size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
7585     SmallVector<int, 64> Mask0, Mask1;
7586     narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
7587     narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
7588     for (int i = 0; i != (int)MaskSize; ++i) {
7589       // NOTE: Don't handle SM_SentinelUndef, as we can end up in infinite
7590       // loops converting between OR and BLEND shuffles due to
7591       // canWidenShuffleElements merging away undef elements, meaning we
7592       // fail to recognise the OR as the undef element isn't known zero.
7593       if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
7594         Mask.push_back(SM_SentinelZero);
7595       else if (Mask1[i] == SM_SentinelZero)
7596         Mask.push_back(i);
7597       else if (Mask0[i] == SM_SentinelZero)
7598         Mask.push_back(i + MaskSize);
7599       else
7600         return false;
7601     }
7602     Ops.push_back(N0);
7603     Ops.push_back(N1);
7604     return true;
7605   }
7606   case ISD::INSERT_SUBVECTOR: {
7607     SDValue Src = N.getOperand(0);
7608     SDValue Sub = N.getOperand(1);
7609     EVT SubVT = Sub.getValueType();
7610     unsigned NumSubElts = SubVT.getVectorNumElements();
7611     if (!N->isOnlyUserOf(Sub.getNode()))
7612       return false;
7613     uint64_t InsertIdx = N.getConstantOperandVal(2);
7614     // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
7615     if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
7616         Sub.getOperand(0).getValueType() == VT) {
7617       uint64_t ExtractIdx = Sub.getConstantOperandVal(1);
7618       for (int i = 0; i != (int)NumElts; ++i)
7619         Mask.push_back(i);
7620       for (int i = 0; i != (int)NumSubElts; ++i)
7621         Mask[InsertIdx + i] = NumElts + ExtractIdx + i;
7622       Ops.push_back(Src);
7623       Ops.push_back(Sub.getOperand(0));
7624       return true;
7625     }
7626     // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
7627     SmallVector<int, 64> SubMask;
7628     SmallVector<SDValue, 2> SubInputs;
7629     if (!getTargetShuffleInputs(peekThroughOneUseBitcasts(Sub), SubInputs,
7630                                 SubMask, DAG, Depth + 1, ResolveKnownElts))
7631       return false;
7632 
7633     // Subvector shuffle inputs must not be larger than the subvector.
7634     if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) {
7635           return SubVT.getFixedSizeInBits() <
7636                  SubInput.getValueSizeInBits().getFixedSize();
7637         }))
7638       return false;
7639 
7640     if (SubMask.size() != NumSubElts) {
7641       assert(((SubMask.size() % NumSubElts) == 0 ||
7642               (NumSubElts % SubMask.size()) == 0) && "Illegal submask scale");
7643       if ((NumSubElts % SubMask.size()) == 0) {
7644         int Scale = NumSubElts / SubMask.size();
7645         SmallVector<int,64> ScaledSubMask;
7646         narrowShuffleMaskElts(Scale, SubMask, ScaledSubMask);
7647         SubMask = ScaledSubMask;
7648       } else {
7649         int Scale = SubMask.size() / NumSubElts;
7650         NumSubElts = SubMask.size();
7651         NumElts *= Scale;
7652         InsertIdx *= Scale;
7653       }
7654     }
7655     Ops.push_back(Src);
7656     Ops.append(SubInputs.begin(), SubInputs.end());
7657     if (ISD::isBuildVectorAllZeros(Src.getNode()))
7658       Mask.append(NumElts, SM_SentinelZero);
7659     else
7660       for (int i = 0; i != (int)NumElts; ++i)
7661         Mask.push_back(i);
7662     for (int i = 0; i != (int)NumSubElts; ++i) {
7663       int M = SubMask[i];
7664       if (0 <= M) {
7665         int InputIdx = M / NumSubElts;
7666         M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
7667       }
7668       Mask[i + InsertIdx] = M;
7669     }
7670     return true;
7671   }
7672   case X86ISD::PINSRB:
7673   case X86ISD::PINSRW:
7674   case ISD::SCALAR_TO_VECTOR:
7675   case ISD::INSERT_VECTOR_ELT: {
7676     // Match against an insert_vector_elt/scalar_to_vector of an extract from a
7677     // vector, for matching src/dst vector types.
7678     SDValue Scl = N.getOperand(Opcode == ISD::SCALAR_TO_VECTOR ? 0 : 1);
7679 
7680     unsigned DstIdx = 0;
7681     if (Opcode != ISD::SCALAR_TO_VECTOR) {
7682       // Check we have an in-range constant insertion index.
7683       if (!isa<ConstantSDNode>(N.getOperand(2)) ||
7684           N.getConstantOperandAPInt(2).uge(NumElts))
7685         return false;
7686       DstIdx = N.getConstantOperandVal(2);
7687 
7688       // Attempt to recognise an INSERT*(VEC, 0, DstIdx) shuffle pattern.
7689       if (X86::isZeroNode(Scl)) {
7690         Ops.push_back(N.getOperand(0));
7691         for (unsigned i = 0; i != NumElts; ++i)
7692           Mask.push_back(i == DstIdx ? SM_SentinelZero : (int)i);
7693         return true;
7694       }
7695     }
7696 
7697     // Peek through trunc/aext/zext.
7698     // TODO: aext shouldn't require SM_SentinelZero padding.
7699     // TODO: handle shift of scalars.
7700     unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
7701     while (Scl.getOpcode() == ISD::TRUNCATE ||
7702            Scl.getOpcode() == ISD::ANY_EXTEND ||
7703            Scl.getOpcode() == ISD::ZERO_EXTEND) {
7704       Scl = Scl.getOperand(0);
7705       MinBitsPerElt =
7706           std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits());
7707     }
7708     if ((MinBitsPerElt % 8) != 0)
7709       return false;
7710 
7711     // Attempt to find the source vector the scalar was extracted from.
7712     SDValue SrcExtract;
7713     if ((Scl.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
7714          Scl.getOpcode() == X86ISD::PEXTRW ||
7715          Scl.getOpcode() == X86ISD::PEXTRB) &&
7716         Scl.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
7717       SrcExtract = Scl;
7718     }
7719     if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
7720       return false;
7721 
7722     SDValue SrcVec = SrcExtract.getOperand(0);
7723     EVT SrcVT = SrcVec.getValueType();
7724     if (!SrcVT.getScalarType().isByteSized())
7725       return false;
7726     unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
7727     unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
7728     unsigned DstByte = DstIdx * NumBytesPerElt;
7729     MinBitsPerElt =
7730         std::min<unsigned>(MinBitsPerElt, SrcVT.getScalarSizeInBits());
7731 
7732     // Create 'identity' byte level shuffle mask and then add inserted bytes.
7733     if (Opcode == ISD::SCALAR_TO_VECTOR) {
7734       Ops.push_back(SrcVec);
7735       Mask.append(NumSizeInBytes, SM_SentinelUndef);
7736     } else {
7737       Ops.push_back(SrcVec);
7738       Ops.push_back(N.getOperand(0));
7739       for (int i = 0; i != (int)NumSizeInBytes; ++i)
7740         Mask.push_back(NumSizeInBytes + i);
7741     }
7742 
7743     unsigned MinBytesPerElts = MinBitsPerElt / 8;
7744     MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
7745     for (unsigned i = 0; i != MinBytesPerElts; ++i)
7746       Mask[DstByte + i] = SrcByte + i;
7747     for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
7748       Mask[DstByte + i] = SM_SentinelZero;
7749     return true;
7750   }
7751   case X86ISD::PACKSS:
7752   case X86ISD::PACKUS: {
7753     SDValue N0 = N.getOperand(0);
7754     SDValue N1 = N.getOperand(1);
7755     assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
7756            N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
7757            "Unexpected input value type");
7758 
7759     APInt EltsLHS, EltsRHS;
7760     getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);
7761 
7762     // If we know input saturation won't happen (or we don't care for particular
7763     // lanes), we can treat this as a truncation shuffle.
7764     bool Offset0 = false, Offset1 = false;
7765     if (Opcode == X86ISD::PACKSS) {
7766       if ((!(N0.isUndef() || EltsLHS.isNullValue()) &&
7767            DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
7768           (!(N1.isUndef() || EltsRHS.isNullValue()) &&
7769            DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
7770         return false;
7771       // We can't easily fold ASHR into a shuffle, but if it was feeding a
7772       // PACKSS then it was likely being used for sign-extension for a
7773       // truncation, so just peek through and adjust the mask accordingly.
7774       if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) &&
7775           N0.getConstantOperandAPInt(1) == NumBitsPerElt) {
7776         Offset0 = true;
7777         N0 = N0.getOperand(0);
7778       }
7779       if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) &&
7780           N1.getConstantOperandAPInt(1) == NumBitsPerElt) {
7781         Offset1 = true;
7782         N1 = N1.getOperand(0);
7783       }
7784     } else {
7785       APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
7786       if ((!(N0.isUndef() || EltsLHS.isNullValue()) &&
7787            !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
7788           (!(N1.isUndef() || EltsRHS.isNullValue()) &&
7789            !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
7790         return false;
7791     }
7792 
7793     bool IsUnary = (N0 == N1);
7794 
7795     Ops.push_back(N0);
7796     if (!IsUnary)
7797       Ops.push_back(N1);
7798 
7799     createPackShuffleMask(VT, Mask, IsUnary);
7800 
7801     if (Offset0 || Offset1) {
7802       for (int &M : Mask)
7803         if ((Offset0 && isInRange(M, 0, NumElts)) ||
7804             (Offset1 && isInRange(M, NumElts, 2 * NumElts)))
7805           ++M;
7806     }
7807     return true;
7808   }
7809   case X86ISD::VTRUNC: {
7810     SDValue Src = N.getOperand(0);
7811     EVT SrcVT = Src.getValueType();
7812     // Truncated source must be a simple vector.
7813     if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
7814         (SrcVT.getScalarSizeInBits() % 8) != 0)
7815       return false;
7816     unsigned NumSrcElts = SrcVT.getVectorNumElements();
7817     unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
7818     unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
7819     assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation");
7820     for (unsigned i = 0; i != NumSrcElts; ++i)
7821       Mask.push_back(i * Scale);
7822     Mask.append(NumElts - NumSrcElts, SM_SentinelZero);
7823     Ops.push_back(Src);
7824     return true;
7825   }
7826   case X86ISD::VSHLI:
7827   case X86ISD::VSRLI: {
7828     uint64_t ShiftVal = N.getConstantOperandVal(1);
7829     // Out of range bit shifts are guaranteed to be zero.
7830     if (NumBitsPerElt <= ShiftVal) {
7831       Mask.append(NumElts, SM_SentinelZero);
7832       return true;
7833     }
7834 
7835     // We can only decode 'whole byte' bit shifts as shuffles.
7836     if ((ShiftVal % 8) != 0)
7837       break;
7838 
7839     uint64_t ByteShift = ShiftVal / 8;
7840     Ops.push_back(N.getOperand(0));
7841 
7842     // Clear mask to all zeros and insert the shifted byte indices.
7843     Mask.append(NumSizeInBytes, SM_SentinelZero);
7844 
7845     if (X86ISD::VSHLI == Opcode) {
7846       for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
7847         for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
7848           Mask[i + j] = i + j - ByteShift;
7849     } else {
7850       for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
7851         for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
7852           Mask[i + j - ByteShift] = i + j;
7853     }
7854     return true;
7855   }
7856   case X86ISD::VROTLI:
7857   case X86ISD::VROTRI: {
7858     // We can only decode 'whole byte' bit rotates as shuffles.
7859     uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
7860     if ((RotateVal % 8) != 0)
7861       return false;
7862     Ops.push_back(N.getOperand(0));
7863     int Offset = RotateVal / 8;
7864     Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);
7865     for (int i = 0; i != (int)NumElts; ++i) {
7866       int BaseIdx = i * NumBytesPerElt;
7867       for (int j = 0; j != (int)NumBytesPerElt; ++j) {
7868         Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
7869       }
7870     }
7871     return true;
7872   }
7873   case X86ISD::VBROADCAST: {
7874     SDValue Src = N.getOperand(0);
7875     if (!Src.getSimpleValueType().isVector()) {
7876       if (Src.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7877           !isNullConstant(Src.getOperand(1)) ||
7878           Src.getOperand(0).getValueType().getScalarType() !=
7879               VT.getScalarType())
7880         return false;
7881       Src = Src.getOperand(0);
7882     }
7883     Ops.push_back(Src);
7884     Mask.append(NumElts, 0);
7885     return true;
7886   }
7887   case ISD::ZERO_EXTEND:
7888   case ISD::ANY_EXTEND:
7889   case ISD::ZERO_EXTEND_VECTOR_INREG:
7890   case ISD::ANY_EXTEND_VECTOR_INREG: {
7891     SDValue Src = N.getOperand(0);
7892     EVT SrcVT = Src.getValueType();
7893 
7894     // Extended source must be a simple vector.
7895     if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
7896         (SrcVT.getScalarSizeInBits() % 8) != 0)
7897       return false;
7898 
7899     bool IsAnyExtend =
7900         (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
7901     DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
7902                          IsAnyExtend, Mask);
7903     Ops.push_back(Src);
7904     return true;
7905   }
7906   }
7907 
7908   return false;
7909 }
7910 
7911 /// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
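/// For example (illustrative): Inputs = {A, A} with the two-input mask
/// {0, 5, 2, 7} collapses to the single input {A} and the remapped mask
/// {0, 1, 2, 3}.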
7912 static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
7913                                               SmallVectorImpl<int> &Mask) {
7914   int MaskWidth = Mask.size();
7915   SmallVector<SDValue, 16> UsedInputs;
7916   for (int i = 0, e = Inputs.size(); i < e; ++i) {
7917     int lo = UsedInputs.size() * MaskWidth;
7918     int hi = lo + MaskWidth;
7919 
7920     // Strip UNDEF input usage.
7921     if (Inputs[i].isUndef())
7922       for (int &M : Mask)
7923         if ((lo <= M) && (M < hi))
7924           M = SM_SentinelUndef;
7925 
7926     // Check for unused inputs.
7927     if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
7928       for (int &M : Mask)
7929         if (lo <= M)
7930           M -= MaskWidth;
7931       continue;
7932     }
7933 
7934     // Check for repeated inputs.
7935     bool IsRepeat = false;
7936     for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
7937       if (UsedInputs[j] != Inputs[i])
7938         continue;
7939       for (int &M : Mask)
7940         if (lo <= M)
7941           M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
7942       IsRepeat = true;
7943       break;
7944     }
7945     if (IsRepeat)
7946       continue;
7947 
7948     UsedInputs.push_back(Inputs[i]);
7949   }
7950   Inputs = UsedInputs;
7951 }
7952 
7953 /// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs
7954 /// and then sets the SM_SentinelUndef and SM_SentinelZero values.
7955 /// Returns true if the target shuffle mask was decoded.
7956 static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
7957                                    SmallVectorImpl<SDValue> &Inputs,
7958                                    SmallVectorImpl<int> &Mask,
7959                                    APInt &KnownUndef, APInt &KnownZero,
7960                                    const SelectionDAG &DAG, unsigned Depth,
7961                                    bool ResolveKnownElts) {
7962   EVT VT = Op.getValueType();
7963   if (!VT.isSimple() || !VT.isVector())
7964     return false;
7965 
7966   if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) {
7967     if (ResolveKnownElts)
7968       resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero);
7969     return true;
7970   }
7971   if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,
7972                          ResolveKnownElts)) {
7973     resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
7974     return true;
7975   }
7976   return false;
7977 }
7978 
7979 static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
7980                                    SmallVectorImpl<int> &Mask,
7981                                    const SelectionDAG &DAG, unsigned Depth = 0,
7982                                    bool ResolveKnownElts = true) {
7983   EVT VT = Op.getValueType();
7984   if (!VT.isSimple() || !VT.isVector())
7985     return false;
7986 
7987   APInt KnownUndef, KnownZero;
7988   unsigned NumElts = Op.getValueType().getVectorNumElements();
7989   APInt DemandedElts = APInt::getAllOnesValue(NumElts);
7990   return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef,
7991                                 KnownZero, DAG, Depth, ResolveKnownElts);
7992 }
7993 
7994 /// Returns the scalar element that will make up the i'th
7995 /// element of the result of the vector shuffle.
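/// For example (illustrative): for a VECTOR_SHUFFLE of two v4i32 BUILD_VECTORs
/// with mask {6,2,1,7}, Index 0 selects mask element 6, so the search recurses
/// into the second operand and returns that BUILD_VECTOR's scalar operand 2.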
7996 static SDValue getShuffleScalarElt(SDValue Op, unsigned Index,
7997                                    SelectionDAG &DAG, unsigned Depth) {
7998   if (Depth >= SelectionDAG::MaxRecursionDepth)
7999     return SDValue(); // Limit search depth.
8000 
8001   EVT VT = Op.getValueType();
8002   unsigned Opcode = Op.getOpcode();
8003   unsigned NumElems = VT.getVectorNumElements();
8004 
8005   // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
8006   if (auto *SV = dyn_cast<ShuffleVectorSDNode>(Op)) {
8007     int Elt = SV->getMaskElt(Index);
8008 
8009     if (Elt < 0)
8010       return DAG.getUNDEF(VT.getVectorElementType());
8011 
8012     SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1);
8013     return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
8014   }
8015 
8016   // Recurse into target specific vector shuffles to find scalars.
8017   if (isTargetShuffle(Opcode)) {
8018     MVT ShufVT = VT.getSimpleVT();
8019     MVT ShufSVT = ShufVT.getVectorElementType();
8020     int NumElems = (int)ShufVT.getVectorNumElements();
8021     SmallVector<int, 16> ShuffleMask;
8022     SmallVector<SDValue, 16> ShuffleOps;
8023     if (!getTargetShuffleMask(Op.getNode(), ShufVT, true, ShuffleOps,
8024                               ShuffleMask))
8025       return SDValue();
8026 
8027     int Elt = ShuffleMask[Index];
8028     if (Elt == SM_SentinelZero)
8029       return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(Op), ShufSVT)
8030                                  : DAG.getConstantFP(+0.0, SDLoc(Op), ShufSVT);
8031     if (Elt == SM_SentinelUndef)
8032       return DAG.getUNDEF(ShufSVT);
8033 
8034     assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range");
8035     SDValue Src = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
8036     return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
8037   }
8038 
8039   // Recurse into insert_subvector base/sub vector to find scalars.
8040   if (Opcode == ISD::INSERT_SUBVECTOR) {
8041     SDValue Vec = Op.getOperand(0);
8042     SDValue Sub = Op.getOperand(1);
8043     uint64_t SubIdx = Op.getConstantOperandVal(2);
8044     unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
8045 
8046     if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
8047       return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1);
8048     return getShuffleScalarElt(Vec, Index, DAG, Depth + 1);
8049   }
8050 
8051   // Recurse into concat_vectors sub vector to find scalars.
8052   if (Opcode == ISD::CONCAT_VECTORS) {
8053     EVT SubVT = Op.getOperand(0).getValueType();
8054     unsigned NumSubElts = SubVT.getVectorNumElements();
8055     uint64_t SubIdx = Index / NumSubElts;
8056     uint64_t SubElt = Index % NumSubElts;
8057     return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, Depth + 1);
8058   }
8059 
8060   // Recurse into extract_subvector src vector to find scalars.
8061   if (Opcode == ISD::EXTRACT_SUBVECTOR) {
8062     SDValue Src = Op.getOperand(0);
8063     uint64_t SrcIdx = Op.getConstantOperandVal(1);
8064     return getShuffleScalarElt(Src, Index + SrcIdx, DAG, Depth + 1);
8065   }
8066 
8067   // We only peek through bitcasts of the same vector width.
8068   if (Opcode == ISD::BITCAST) {
8069     SDValue Src = Op.getOperand(0);
8070     EVT SrcVT = Src.getValueType();
8071     if (SrcVT.isVector() && SrcVT.getVectorNumElements() == NumElems)
8072       return getShuffleScalarElt(Src, Index, DAG, Depth + 1);
8073     return SDValue();
8074   }
8075 
8076   // Actual nodes that may contain scalar elements.
8077 
8078   // For insert_vector_elt - either return the index matching scalar or recurse
8079   // into the base vector.
8080   if (Opcode == ISD::INSERT_VECTOR_ELT &&
8081       isa<ConstantSDNode>(Op.getOperand(2))) {
8082     if (Op.getConstantOperandAPInt(2) == Index)
8083       return Op.getOperand(1);
8084     return getShuffleScalarElt(Op.getOperand(0), Index, DAG, Depth + 1);
8085   }
8086 
8087   if (Opcode == ISD::SCALAR_TO_VECTOR)
8088     return (Index == 0) ? Op.getOperand(0)
8089                         : DAG.getUNDEF(VT.getVectorElementType());
8090 
8091   if (Opcode == ISD::BUILD_VECTOR)
8092     return Op.getOperand(Index);
8093 
8094   return SDValue();
8095 }
8096 
8097 // Use PINSRB/PINSRW/PINSRD to create a build vector.
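// For example (illustrative): a v8i16 build vector (x, 0, y, 0, 0, 0, 0, 0)
// starts from a zero vector (NumZero != 0) and then emits INSERT_VECTOR_ELT at
// indices 0 and 2, which later select to PINSRW instructions.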
8098 static SDValue LowerBuildVectorAsInsert(SDValue Op, const APInt &NonZeroMask,
8099                                         unsigned NumNonZero, unsigned NumZero,
8100                                         SelectionDAG &DAG,
8101                                         const X86Subtarget &Subtarget) {
8102   MVT VT = Op.getSimpleValueType();
8103   unsigned NumElts = VT.getVectorNumElements();
8104   assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
8105           ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
8106          "Illegal vector insertion");
8107 
8108   SDLoc dl(Op);
8109   SDValue V;
8110   bool First = true;
8111 
8112   for (unsigned i = 0; i < NumElts; ++i) {
8113     bool IsNonZero = NonZeroMask[i];
8114     if (!IsNonZero)
8115       continue;
8116 
8117     // If the build vector contains zeros or our first insertion is not the
8118     // first index, then insert into a zero vector to break any register
8119     // dependency; else use SCALAR_TO_VECTOR.
8120     if (First) {
8121       First = false;
8122       if (NumZero || 0 != i)
8123         V = getZeroVector(VT, Subtarget, DAG, dl);
8124       else {
8125         assert(0 == i && "Expected insertion into zero-index");
8126         V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
8127         V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
8128         V = DAG.getBitcast(VT, V);
8129         continue;
8130       }
8131     }
8132     V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V, Op.getOperand(i),
8133                     DAG.getIntPtrConstant(i, dl));
8134   }
8135 
8136   return V;
8137 }
8138 
8139 /// Custom lower build_vector of v16i8.
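/// Pre-SSE4.1 sketch (illustrative): adjacent bytes are merged into an i32 as
/// (NextByte << 8) | ThisByte, truncated to i16, and inserted into a v8i16 at
/// index i/2; the final result is bitcast back to v16i8.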
8140 static SDValue LowerBuildVectorv16i8(SDValue Op, const APInt &NonZeroMask,
8141                                      unsigned NumNonZero, unsigned NumZero,
8142                                      SelectionDAG &DAG,
8143                                      const X86Subtarget &Subtarget) {
8144   if (NumNonZero > 8 && !Subtarget.hasSSE41())
8145     return SDValue();
8146 
8147   // SSE4.1 - use PINSRB to insert each byte directly.
8148   if (Subtarget.hasSSE41())
8149     return LowerBuildVectorAsInsert(Op, NonZeroMask, NumNonZero, NumZero, DAG,
8150                                     Subtarget);
8151 
8152   SDLoc dl(Op);
8153   SDValue V;
8154 
8155   // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
8156   for (unsigned i = 0; i < 16; i += 2) {
8157     bool ThisIsNonZero = NonZeroMask[i];
8158     bool NextIsNonZero = NonZeroMask[i + 1];
8159     if (!ThisIsNonZero && !NextIsNonZero)
8160       continue;
8161 
8162     // FIXME: Investigate combining the first 4 bytes as a i32 instead.
8163     SDValue Elt;
8164     if (ThisIsNonZero) {
8165       if (NumZero || NextIsNonZero)
8166         Elt = DAG.getZExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
8167       else
8168         Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
8169     }
8170 
8171     if (NextIsNonZero) {
8172       SDValue NextElt = Op.getOperand(i + 1);
8173       if (i == 0 && NumZero)
8174         NextElt = DAG.getZExtOrTrunc(NextElt, dl, MVT::i32);
8175       else
8176         NextElt = DAG.getAnyExtOrTrunc(NextElt, dl, MVT::i32);
8177       NextElt = DAG.getNode(ISD::SHL, dl, MVT::i32, NextElt,
8178                             DAG.getConstant(8, dl, MVT::i8));
8179       if (ThisIsNonZero)
8180         Elt = DAG.getNode(ISD::OR, dl, MVT::i32, NextElt, Elt);
8181       else
8182         Elt = NextElt;
8183     }
8184 
8185     // If our first insertion is not the first index or zeros are needed, then
8186     // insert into zero vector. Otherwise, use SCALAR_TO_VECTOR (leaves high
8187     // elements undefined).
8188     if (!V) {
8189       if (i != 0 || NumZero)
8190         V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
8191       else {
8192         V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Elt);
8193         V = DAG.getBitcast(MVT::v8i16, V);
8194         continue;
8195       }
8196     }
8197     Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Elt);
8198     V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, Elt,
8199                     DAG.getIntPtrConstant(i / 2, dl));
8200   }
8201 
8202   return DAG.getBitcast(MVT::v16i8, V);
8203 }
8204 
8205 /// Custom lower build_vector of v8i16.
8206 static SDValue LowerBuildVectorv8i16(SDValue Op, const APInt &NonZeroMask,
8207                                      unsigned NumNonZero, unsigned NumZero,
8208                                      SelectionDAG &DAG,
8209                                      const X86Subtarget &Subtarget) {
8210   if (NumNonZero > 4 && !Subtarget.hasSSE41())
8211     return SDValue();
8212 
8213   // Use PINSRW to insert each i16 element directly.
8214   return LowerBuildVectorAsInsert(Op, NonZeroMask, NumNonZero, NumZero, DAG,
8215                                   Subtarget);
8216 }
8217 
8218 /// Custom lower build_vector of v4i32 or v4f32.
8219 static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
8220                                      const X86Subtarget &Subtarget) {
8221   // If this is a splat of a pair of elements, use MOVDDUP (unless the target
8222   // has XOP; in that case defer lowering to potentially use VPERMIL2PS).
8223   // Because we're creating a less complicated build vector here, we may enable
8224   // further folding of the MOVDDUP via shuffle transforms.
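  // For example (illustrative): build_vector (a, b, a, b) becomes
  // build_vector (a, b, undef, undef), which is bitcast to v2f64, duplicated
  // with MOVDDUP, and bitcast back to the original type.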
8225   if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
8226       Op.getOperand(0) == Op.getOperand(2) &&
8227       Op.getOperand(1) == Op.getOperand(3) &&
8228       Op.getOperand(0) != Op.getOperand(1)) {
8229     SDLoc DL(Op);
8230     MVT VT = Op.getSimpleValueType();
8231     MVT EltVT = VT.getVectorElementType();
8232     // Create a new build vector with the first 2 elements followed by undef
8233     // padding, bitcast to v2f64, duplicate, and bitcast back.
8234     SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
8235                        DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
8236     SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
8237     SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
8238     return DAG.getBitcast(VT, Dup);
8239   }
8240 
8241   // Find all zeroable elements.
8242   std::bitset<4> Zeroable, Undefs;
8243   for (int i = 0; i < 4; ++i) {
8244     SDValue Elt = Op.getOperand(i);
8245     Undefs[i] = Elt.isUndef();
8246     Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
8247   }
8248   assert(Zeroable.size() - Zeroable.count() > 1 &&
8249          "We expect at least two non-zero elements!");
8250 
8251   // We only know how to deal with build_vector nodes where elements are either
8252   // zeroable or extract_vector_elt with constant index.
8253   SDValue FirstNonZero;
8254   unsigned FirstNonZeroIdx;
8255   for (unsigned i = 0; i < 4; ++i) {
8256     if (Zeroable[i])
8257       continue;
8258     SDValue Elt = Op.getOperand(i);
8259     if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8260         !isa<ConstantSDNode>(Elt.getOperand(1)))
8261       return SDValue();
8262     // Make sure that this node is extracting from a 128-bit vector.
8263     MVT VT = Elt.getOperand(0).getSimpleValueType();
8264     if (!VT.is128BitVector())
8265       return SDValue();
8266     if (!FirstNonZero.getNode()) {
8267       FirstNonZero = Elt;
8268       FirstNonZeroIdx = i;
8269     }
8270   }
8271 
8272   assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
8273   SDValue V1 = FirstNonZero.getOperand(0);
8274   MVT VT = V1.getSimpleValueType();
8275 
8276   // See if this build_vector can be lowered as a blend with zero.
8277   SDValue Elt;
8278   unsigned EltMaskIdx, EltIdx;
8279   int Mask[4];
8280   for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
8281     if (Zeroable[EltIdx]) {
8282       // The zero vector will be on the right hand side.
8283       Mask[EltIdx] = EltIdx+4;
8284       continue;
8285     }
8286 
8287     Elt = Op->getOperand(EltIdx);
8288     // By construction, Elt is an EXTRACT_VECTOR_ELT with constant index.
8289     EltMaskIdx = Elt.getConstantOperandVal(1);
8290     if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
8291       break;
8292     Mask[EltIdx] = EltIdx;
8293   }
8294 
8295   if (EltIdx == 4) {
8296     // Let the shuffle legalizer deal with blend operations.
8297     SDValue VZeroOrUndef = (Zeroable == Undefs)
8298                                ? DAG.getUNDEF(VT)
8299                                : getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
8300     if (V1.getSimpleValueType() != VT)
8301       V1 = DAG.getBitcast(VT, V1);
8302     return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
8303   }
8304 
8305   // See if we can lower this build_vector to an INSERTPS.
8306   if (!Subtarget.hasSSE41())
8307     return SDValue();
8308 
8309   SDValue V2 = Elt.getOperand(0);
8310   if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
8311     V1 = SDValue();
8312 
8313   bool CanFold = true;
8314   for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
8315     if (Zeroable[i])
8316       continue;
8317 
8318     SDValue Current = Op->getOperand(i);
8319     SDValue SrcVector = Current->getOperand(0);
8320     if (!V1.getNode())
8321       V1 = SrcVector;
8322     CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
8323   }
8324 
8325   if (!CanFold)
8326     return SDValue();
8327 
8328   assert(V1.getNode() && "Expected at least two non-zero elements!");
8329   if (V1.getSimpleValueType() != MVT::v4f32)
8330     V1 = DAG.getBitcast(MVT::v4f32, V1);
8331   if (V2.getSimpleValueType() != MVT::v4f32)
8332     V2 = DAG.getBitcast(MVT::v4f32, V2);
8333 
8334   // Ok, we can emit an INSERTPS instruction.
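  // The immediate encodes the source element index in bits [7:6], the
  // destination element index in bits [5:4] and the zero mask in bits [3:0].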
8335   unsigned ZMask = Zeroable.to_ulong();
8336 
8337   unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
8338   assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
8339   SDLoc DL(Op);
8340   SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
8341                                DAG.getIntPtrConstant(InsertPSMask, DL, true));
8342   return DAG.getBitcast(VT, Result);
8343 }
8344 
8345 /// Return a vector logical shift node.
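/// The value is bitcast to v16i8 and shifted by whole bytes with
/// VSHLDQ/VSRLDQ (i.e. PSLLDQ/PSRLDQ), then bitcast back to \p VT.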
8346 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
8347                          SelectionDAG &DAG, const TargetLowering &TLI,
8348                          const SDLoc &dl) {
8349   assert(VT.is128BitVector() && "Unknown type for VShift");
8350   MVT ShVT = MVT::v16i8;
8351   unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
8352   SrcOp = DAG.getBitcast(ShVT, SrcOp);
8353   assert(NumBits % 8 == 0 && "Only support byte sized shifts");
8354   SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8);
8355   return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
8356 }
8357 
8358 static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
8359                                       SelectionDAG &DAG) {
8360 
8361   // Check if the scalar load can be widened into a vector load, and if
8362   // the address is "base + cst", see if the cst can be "absorbed" into
8363   // the shuffle mask.
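  // For example (hypothetical frame index and offsets), a 4-byte load from
  // FI#0 + 8 that is splatted into a v4f32 can be widened to a 16-byte aligned
  // v4f32 load from FI#0 followed by a shuffle with mask <2, 2, 2, 2>.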
8364   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
8365     SDValue Ptr = LD->getBasePtr();
8366     if (!ISD::isNormalLoad(LD) || !LD->isSimple())
8367       return SDValue();
8368     EVT PVT = LD->getValueType(0);
8369     if (PVT != MVT::i32 && PVT != MVT::f32)
8370       return SDValue();
8371 
8372     int FI = -1;
8373     int64_t Offset = 0;
8374     if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
8375       FI = FINode->getIndex();
8376       Offset = 0;
8377     } else if (DAG.isBaseWithConstantOffset(Ptr) &&
8378                isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
8379       FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
8380       Offset = Ptr.getConstantOperandVal(1);
8381       Ptr = Ptr.getOperand(0);
8382     } else {
8383       return SDValue();
8384     }
8385 
8386     // FIXME: 256-bit vector instructions don't require a strict alignment,
8387     // improve this code to support it better.
8388     Align RequiredAlign(VT.getSizeInBits() / 8);
8389     SDValue Chain = LD->getChain();
8390     // Make sure the stack object alignment is at least 16 or 32.
8391     MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
8392     MaybeAlign InferredAlign = DAG.InferPtrAlign(Ptr);
8393     if (!InferredAlign || *InferredAlign < RequiredAlign) {
8394       if (MFI.isFixedObjectIndex(FI)) {
8395         // Can't change the alignment. FIXME: It's possible to compute
8396         // the exact stack offset and reference FI + adjusted offset instead;
8397         // if someone *really* cares about this, that's the way to implement it.
8398         return SDValue();
8399       } else {
8400         MFI.setObjectAlignment(FI, RequiredAlign);
8401       }
8402     }
8403 
8404     // (Offset % 16 or 32) must be a multiple of 4. The address is then
8405     // Ptr + (Offset & ~15).
8406     if (Offset < 0)
8407       return SDValue();
8408     if ((Offset % RequiredAlign.value()) & 3)
8409       return SDValue();
8410     int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1);
8411     if (StartOffset) {
8412       SDLoc DL(Ptr);
8413       Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
8414                         DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
8415     }
8416 
8417     int EltNo = (Offset - StartOffset) >> 2;
8418     unsigned NumElems = VT.getVectorNumElements();
8419 
8420     EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
8421     SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
8422                              LD->getPointerInfo().getWithOffset(StartOffset));
8423 
8424     SmallVector<int, 8> Mask(NumElems, EltNo);
8425 
8426     return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
8427   }
8428 
8429   return SDValue();
8430 }
8431 
8432 // Recurse to find a LoadSDNode source and the accumulated ByteOffset.
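// For example, (i32 (trunc (srl (i64 (load p)), 32))) resolves to the i64 load
// with an accumulated ByteOffset of 4.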
8433 static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
8434   if (ISD::isNON_EXTLoad(Elt.getNode())) {
8435     auto *BaseLd = cast<LoadSDNode>(Elt);
8436     if (!BaseLd->isSimple())
8437       return false;
8438     Ld = BaseLd;
8439     ByteOffset = 0;
8440     return true;
8441   }
8442 
8443   switch (Elt.getOpcode()) {
8444   case ISD::BITCAST:
8445   case ISD::TRUNCATE:
8446   case ISD::SCALAR_TO_VECTOR:
8447     return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
8448   case ISD::SRL:
8449     if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
8450       uint64_t Idx = IdxC->getZExtValue();
8451       if ((Idx % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
8452         ByteOffset += Idx / 8;
8453         return true;
8454       }
8455     }
8456     break;
8457   case ISD::EXTRACT_VECTOR_ELT:
8458     if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
8459       SDValue Src = Elt.getOperand(0);
8460       unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
8461       unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
8462       if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
8463           findEltLoadSrc(Src, Ld, ByteOffset)) {
8464         uint64_t Idx = IdxC->getZExtValue();
8465         ByteOffset += Idx * (SrcSizeInBits / 8);
8466         return true;
8467       }
8468     }
8469     break;
8470   }
8471 
8472   return false;
8473 }
8474 
8475 /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
8476 /// elements can be replaced by a single large load which has the same value as
8477 /// a build_vector or insert_subvector whose loaded operands are 'Elts'.
8478 ///
8479 /// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
8480 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
8481                                         const SDLoc &DL, SelectionDAG &DAG,
8482                                         const X86Subtarget &Subtarget,
8483                                         bool isAfterLegalize) {
8484   if ((VT.getScalarSizeInBits() % 8) != 0)
8485     return SDValue();
8486 
8487   unsigned NumElems = Elts.size();
8488 
8489   int LastLoadedElt = -1;
8490   APInt LoadMask = APInt::getNullValue(NumElems);
8491   APInt ZeroMask = APInt::getNullValue(NumElems);
8492   APInt UndefMask = APInt::getNullValue(NumElems);
8493 
8494   SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
8495   SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
8496 
8497   // For each element in the initializer, see if we've found a load, zero or an
8498   // undef.
8499   for (unsigned i = 0; i < NumElems; ++i) {
8500     SDValue Elt = peekThroughBitcasts(Elts[i]);
8501     if (!Elt.getNode())
8502       return SDValue();
8503     if (Elt.isUndef()) {
8504       UndefMask.setBit(i);
8505       continue;
8506     }
8507     if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode())) {
8508       ZeroMask.setBit(i);
8509       continue;
8510     }
8511 
8512     // Each loaded element must be the correct fractional portion of the
8513     // requested vector load.
8514     unsigned EltSizeInBits = Elt.getValueSizeInBits();
8515     if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
8516       return SDValue();
8517 
8518     if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
8519       return SDValue();
8520     unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
8521     if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
8522       return SDValue();
8523 
8524     LoadMask.setBit(i);
8525     LastLoadedElt = i;
8526   }
8527   assert((ZeroMask.countPopulation() + UndefMask.countPopulation() +
8528           LoadMask.countPopulation()) == NumElems &&
8529          "Incomplete element masks");
8530 
8531   // Handle Special Cases - all undef or undef/zero.
8532   if (UndefMask.countPopulation() == NumElems)
8533     return DAG.getUNDEF(VT);
8534   if ((ZeroMask.countPopulation() + UndefMask.countPopulation()) == NumElems)
8535     return VT.isInteger() ? DAG.getConstant(0, DL, VT)
8536                           : DAG.getConstantFP(0.0, DL, VT);
8537 
8538   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8539   int FirstLoadedElt = LoadMask.countTrailingZeros();
8540   SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
8541   EVT EltBaseVT = EltBase.getValueType();
8542   assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
8543          "Register/Memory size mismatch");
8544   LoadSDNode *LDBase = Loads[FirstLoadedElt];
8545   assert(LDBase && "Did not find base load for merging consecutive loads");
8546   unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
8547   unsigned BaseSizeInBytes = BaseSizeInBits / 8;
8548   int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
8549   int LoadSizeInBits = NumLoadedElts * BaseSizeInBits;
8550   assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
8551 
8552   // TODO: Support offsetting the base load.
8553   if (ByteOffsets[FirstLoadedElt] != 0)
8554     return SDValue();
8555 
8556   // Check to see if the element's load is consecutive to the base load
8557   // or offset from a previous (already checked) load.
8558   auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
8559     LoadSDNode *Ld = Loads[EltIdx];
8560     int64_t ByteOffset = ByteOffsets[EltIdx];
8561     if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
8562       int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
8563       return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
8564               Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
8565     }
8566     return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
8567                                               EltIdx - FirstLoadedElt);
8568   };
8569 
8570   // Consecutive loads can contain UNDEFs but not ZERO elements.
8571   // Consecutive loads with UNDEF and ZERO elements require an
8572   // additional shuffle stage to clear the ZERO elements.
8573   bool IsConsecutiveLoad = true;
8574   bool IsConsecutiveLoadWithZeros = true;
8575   for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
8576     if (LoadMask[i]) {
8577       if (!CheckConsecutiveLoad(LDBase, i)) {
8578         IsConsecutiveLoad = false;
8579         IsConsecutiveLoadWithZeros = false;
8580         break;
8581       }
8582     } else if (ZeroMask[i]) {
8583       IsConsecutiveLoad = false;
8584     }
8585   }
8586 
8587   auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
8588     auto MMOFlags = LDBase->getMemOperand()->getFlags();
8589     assert(LDBase->isSimple() &&
8590            "Cannot merge volatile or atomic loads.");
8591     SDValue NewLd =
8592         DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
8593                     LDBase->getPointerInfo(), LDBase->getOriginalAlign(),
8594                     MMOFlags);
8595     for (auto *LD : Loads)
8596       if (LD)
8597         DAG.makeEquivalentMemoryOrdering(LD, NewLd);
8598     return NewLd;
8599   };
8600 
8601   // Check if the base load is entirely dereferenceable.
8602   bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
8603       VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());
8604 
8605   // LOAD - all consecutive load/undefs (must start/end with a load or be
8606   // entirely dereferenceable). If we have found an entire vector of loads and
8607   // undefs, then return a large load of the entire vector width starting at the
8608   // base pointer. If the vector contains zeros, then attempt to shuffle those
8609   // elements.
8610   if (FirstLoadedElt == 0 &&
8611       (NumLoadedElts == (int)NumElems || IsDereferenceable) &&
8612       (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
8613     if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
8614       return SDValue();
8615 
8616     // Don't create 256-bit non-temporal aligned loads without AVX2 as these
8617     // will lower to regular temporal loads and use the cache.
8618     if (LDBase->isNonTemporal() && LDBase->getAlignment() >= 32 &&
8619         VT.is256BitVector() && !Subtarget.hasInt256())
8620       return SDValue();
8621 
8622     if (NumElems == 1)
8623       return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
8624 
8625     if (!ZeroMask)
8626       return CreateLoad(VT, LDBase);
8627 
8628     // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
8629     // vector and a zero vector to clear out the zero elements.
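    // For example (hypothetical addresses), <load a+0, load a+4, zero,
    // load a+12> can become a single v4i32 load from 'a' shuffled with a zero
    // vector using mask <0, 1, 6, 3> to clear element 2.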
8630     if (!isAfterLegalize && VT.isVector()) {
8631       unsigned NumMaskElts = VT.getVectorNumElements();
8632       if ((NumMaskElts % NumElems) == 0) {
8633         unsigned Scale = NumMaskElts / NumElems;
8634         SmallVector<int, 4> ClearMask(NumMaskElts, -1);
8635         for (unsigned i = 0; i < NumElems; ++i) {
8636           if (UndefMask[i])
8637             continue;
8638           int Offset = ZeroMask[i] ? NumMaskElts : 0;
8639           for (unsigned j = 0; j != Scale; ++j)
8640             ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
8641         }
8642         SDValue V = CreateLoad(VT, LDBase);
8643         SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
8644                                    : DAG.getConstantFP(0.0, DL, VT);
8645         return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
8646       }
8647     }
8648   }
8649 
8650   // If the upper half of a ymm/zmm load is undef then just load the lower half.
8651   if (VT.is256BitVector() || VT.is512BitVector()) {
8652     unsigned HalfNumElems = NumElems / 2;
8653     if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnesValue()) {
8654       EVT HalfVT =
8655           EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
8656       SDValue HalfLD =
8657           EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
8658                                    DAG, Subtarget, isAfterLegalize);
8659       if (HalfLD)
8660         return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
8661                            HalfLD, DAG.getIntPtrConstant(0, DL));
8662     }
8663   }
8664 
8665   // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
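  // For example, (v4i32 <load a+0, load a+4, zero, zero>) can become a single
  // 64-bit X86ISD::VZEXT_LOAD from 'a' that zero-fills the upper 64 bits.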
8666   if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
8667       (LoadSizeInBits == 32 || LoadSizeInBits == 64) &&
8668       ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
8669     MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
8670                                       : MVT::getIntegerVT(LoadSizeInBits);
8671     MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
8672     // Allow v4f32 on SSE1 only targets.
8673     // FIXME: Add more isel patterns so we can just use VT directly.
8674     if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
8675       VecVT = MVT::v4f32;
8676     if (TLI.isTypeLegal(VecVT)) {
8677       SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
8678       SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
8679       SDValue ResNode = DAG.getMemIntrinsicNode(
8680           X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(),
8681           LDBase->getOriginalAlign(), MachineMemOperand::MOLoad);
8682       for (auto *LD : Loads)
8683         if (LD)
8684           DAG.makeEquivalentMemoryOrdering(LD, ResNode);
8685       return DAG.getBitcast(VT, ResNode);
8686     }
8687   }
8688 
8689   // BROADCAST - match the smallest possible repetition pattern, load that
8690   // scalar/subvector element and then broadcast to the entire vector.
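  // For example, a v8i32 whose loaded elements repeat with period two,
  // <a+0, a+4, a+0, a+4, ...>, can be matched as a broadcast of a single
  // 64-bit load.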
8691   if (ZeroMask.isNullValue() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
8692       (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
8693     for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
8694       unsigned RepeatSize = SubElems * BaseSizeInBits;
8695       unsigned ScalarSize = std::min(RepeatSize, 64u);
8696       if (!Subtarget.hasAVX2() && ScalarSize < 32)
8697         continue;
8698 
8699       // Don't attempt a 1:N subvector broadcast - it should be caught by
8700       // combineConcatVectorOps, otherwise it will cause infinite loops.
8701       if (RepeatSize > ScalarSize && SubElems == 1)
8702         continue;
8703 
8704       bool Match = true;
8705       SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
8706       for (unsigned i = 0; i != NumElems && Match; ++i) {
8707         if (!LoadMask[i])
8708           continue;
8709         SDValue Elt = peekThroughBitcasts(Elts[i]);
8710         if (RepeatedLoads[i % SubElems].isUndef())
8711           RepeatedLoads[i % SubElems] = Elt;
8712         else
8713           Match &= (RepeatedLoads[i % SubElems] == Elt);
8714       }
8715 
8716       // We must have loads at both ends of the repetition.
8717       Match &= !RepeatedLoads.front().isUndef();
8718       Match &= !RepeatedLoads.back().isUndef();
8719       if (!Match)
8720         continue;
8721 
8722       EVT RepeatVT =
8723           VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
8724               ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
8725               : EVT::getFloatingPointVT(ScalarSize);
8726       if (RepeatSize > ScalarSize)
8727         RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
8728                                     RepeatSize / ScalarSize);
8729       EVT BroadcastVT =
8730           EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
8731                            VT.getSizeInBits() / ScalarSize);
8732       if (TLI.isTypeLegal(BroadcastVT)) {
8733         if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
8734                 RepeatVT, RepeatedLoads, DL, DAG, Subtarget, isAfterLegalize)) {
8735           SDValue Broadcast = RepeatLoad;
8736           if (RepeatSize > ScalarSize) {
8737             while (Broadcast.getValueSizeInBits() < VT.getSizeInBits())
8738               Broadcast = concatSubVectors(Broadcast, Broadcast, DAG, DL);
8739           } else {
8740             Broadcast =
8741                 DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, RepeatLoad);
8742           }
8743           return DAG.getBitcast(VT, Broadcast);
8744         }
8745       }
8746     }
8747   }
8748 
8749   return SDValue();
8750 }
8751 
8752 // Combine a vector op (shuffles etc.) that is equal to build_vector load1,
8753 // load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
8754 // are consecutive, non-overlapping, and in the right order.
8755 static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL,
8756                                          SelectionDAG &DAG,
8757                                          const X86Subtarget &Subtarget,
8758                                          bool isAfterLegalize) {
8759   SmallVector<SDValue, 64> Elts;
8760   for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
8761     if (SDValue Elt = getShuffleScalarElt(Op, i, DAG, 0)) {
8762       Elts.push_back(Elt);
8763       continue;
8764     }
8765     return SDValue();
8766   }
8767   assert(Elts.size() == VT.getVectorNumElements());
8768   return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
8769                                   isAfterLegalize);
8770 }
8771 
8772 static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
8773                                    unsigned SplatBitSize, LLVMContext &C) {
8774   unsigned ScalarSize = VT.getScalarSizeInBits();
8775   unsigned NumElm = SplatBitSize / ScalarSize;
8776 
8777   SmallVector<Constant *, 32> ConstantVec;
8778   for (unsigned i = 0; i < NumElm; i++) {
8779     APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i);
8780     Constant *Const;
8781     if (VT.isFloatingPoint()) {
8782       if (ScalarSize == 32) {
8783         Const = ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
8784       } else {
8785         assert(ScalarSize == 64 && "Unsupported floating point scalar size");
8786         Const = ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
8787       }
8788     } else
8789       Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
8790     ConstantVec.push_back(Const);
8791   }
8792   return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
8793 }
8794 
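/// Returns true if the uses of \p N (looking through bitcasts) suggest keeping
/// the full constant vector, e.g. so it can be folded into a target shuffle,
/// rather than replacing \p N with a broadcast of the repeated constant.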
8795 static bool isFoldableUseOfShuffle(SDNode *N) {
8796   for (auto *U : N->uses()) {
8797     unsigned Opc = U->getOpcode();
8798     // VPERMV/VPERMV3 shuffles can never fold their index operands.
8799     if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
8800       return false;
8801     if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
8802       return false;
8803     if (isTargetShuffle(Opc))
8804       return true;
8805     if (Opc == ISD::BITCAST) // Ignore bitcasts
8806       return isFoldableUseOfShuffle(U);
8807     if (N->hasOneUse())
8808       return true;
8809   }
8810   return false;
8811 }
8812 
8813 /// Attempt to use the vbroadcast instruction to generate a splat value
8814 /// from a splat BUILD_VECTOR which uses:
8815 ///  a. A single scalar load, or a constant.
8816 ///  b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
8817 ///
8818 /// The VBROADCAST node is returned when a pattern is found,
8819 /// or SDValue() otherwise.
8820 static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
8821                                            const X86Subtarget &Subtarget,
8822                                            SelectionDAG &DAG) {
8823   // VBROADCAST requires AVX.
8824   // TODO: Splats could be generated for non-AVX CPUs using SSE
8825   // instructions, but there's less potential gain for only 128-bit vectors.
8826   if (!Subtarget.hasAVX())
8827     return SDValue();
8828 
8829   MVT VT = BVOp->getSimpleValueType(0);
8830   unsigned NumElts = VT.getVectorNumElements();
8831   SDLoc dl(BVOp);
8832 
8833   assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
8834          "Unsupported vector type for broadcast.");
8835 
8836   // See if the build vector is a repeating sequence of scalars (inc. splat).
8837   SDValue Ld;
8838   BitVector UndefElements;
8839   SmallVector<SDValue, 16> Sequence;
8840   if (BVOp->getRepeatedSequence(Sequence, &UndefElements)) {
8841     assert((NumElts % Sequence.size()) == 0 && "Sequence doesn't fit.");
8842     if (Sequence.size() == 1)
8843       Ld = Sequence[0];
8844   }
8845 
8846   // Attempt to use VBROADCASTM
8847   // From this pattern:
8848   // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
8849   // b. t1 = (build_vector t0 t0)
8850   //
8851   // Create (VBROADCASTM v2i1 X)
8852   if (!Sequence.empty() && Subtarget.hasCDI()) {
8853     // If not a splat, are the upper sequence values zeroable?
8854     unsigned SeqLen = Sequence.size();
8855     bool UpperZeroOrUndef =
8856         SeqLen == 1 ||
8857         llvm::all_of(makeArrayRef(Sequence).drop_front(), [](SDValue V) {
8858           return !V || V.isUndef() || isNullConstant(V);
8859         });
8860     SDValue Op0 = Sequence[0];
8861     if (UpperZeroOrUndef && ((Op0.getOpcode() == ISD::BITCAST) ||
8862                              (Op0.getOpcode() == ISD::ZERO_EXTEND &&
8863                               Op0.getOperand(0).getOpcode() == ISD::BITCAST))) {
8864       SDValue BOperand = Op0.getOpcode() == ISD::BITCAST
8865                              ? Op0.getOperand(0)
8866                              : Op0.getOperand(0).getOperand(0);
8867       MVT MaskVT = BOperand.getSimpleValueType();
8868       MVT EltType = MVT::getIntegerVT(VT.getScalarSizeInBits() * SeqLen);
8869       if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) ||  // for broadcastmb2q
8870           (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
8871         MVT BcstVT = MVT::getVectorVT(EltType, NumElts / SeqLen);
8872         if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
8873           unsigned Scale = 512 / VT.getSizeInBits();
8874           BcstVT = MVT::getVectorVT(EltType, Scale * (NumElts / SeqLen));
8875         }
8876         SDValue Bcst = DAG.getNode(X86ISD::VBROADCASTM, dl, BcstVT, BOperand);
8877         if (BcstVT.getSizeInBits() != VT.getSizeInBits())
8878           Bcst = extractSubVector(Bcst, 0, DAG, dl, VT.getSizeInBits());
8879         return DAG.getBitcast(VT, Bcst);
8880       }
8881     }
8882   }
8883 
8884   unsigned NumUndefElts = UndefElements.count();
8885   if (!Ld || (NumElts - NumUndefElts) <= 1) {
8886     APInt SplatValue, Undef;
8887     unsigned SplatBitSize;
8888     bool HasUndef;
8889     // Check if this is a repeated constant pattern suitable for broadcasting.
8890     if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
8891         SplatBitSize > VT.getScalarSizeInBits() &&
8892         SplatBitSize < VT.getSizeInBits()) {
8893       // Avoid replacing with a broadcast when the build_vector is used by a
8894       // shuffle instruction, to preserve the present custom lowering of shuffles.
8895       if (isFoldableUseOfShuffle(BVOp))
8896         return SDValue();
8897       // Replace the BUILD_VECTOR with a broadcast of the repeated constants.
8898       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8899       LLVMContext *Ctx = DAG.getContext();
8900       MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
8901       if (Subtarget.hasAVX()) {
8902         if (SplatBitSize == 32 || SplatBitSize == 64 ||
8903             (SplatBitSize < 32 && Subtarget.hasAVX2())) {
8904           // Splatted value can fit in one INTEGER constant in constant pool.
8905           // Load the constant and broadcast it.
8906           MVT CVT = MVT::getIntegerVT(SplatBitSize);
8907           Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
8908           Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
8909           SDValue CP = DAG.getConstantPool(C, PVT);
8910           unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
8911 
8912           Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
8913           SDVTList Tys =
8914               DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other);
8915           SDValue Ops[] = {DAG.getEntryNode(), CP};
8916           MachinePointerInfo MPI =
8917               MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
8918           SDValue Brdcst = DAG.getMemIntrinsicNode(
8919               X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT, MPI, Alignment,
8920               MachineMemOperand::MOLoad);
8921           return DAG.getBitcast(VT, Brdcst);
8922         }
8923         if (SplatBitSize > 64) {
8924           // Load the vector of constants and broadcast it.
8925           Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
8926                                              *Ctx);
8927           SDValue VCP = DAG.getConstantPool(VecC, PVT);
8928           unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
8929           MVT VVT = MVT::getVectorVT(VT.getScalarType(), NumElm);
8930           Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
8931           SDVTList Tys = DAG.getVTList(VT, MVT::Other);
8932           SDValue Ops[] = {DAG.getEntryNode(), VCP};
8933           MachinePointerInfo MPI =
8934               MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
8935           return DAG.getMemIntrinsicNode(
8936               X86ISD::SUBV_BROADCAST_LOAD, dl, Tys, Ops, VVT, MPI, Alignment,
8937               MachineMemOperand::MOLoad);
8938         }
8939       }
8940     }
8941 
8942     // If we are moving a scalar into a vector (Ld must be set and all elements
8943     // but 1 are undef) and that operation is not obviously supported by
8944     // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
8945     // That's better than general shuffling and may eliminate a load to GPR and
8946     // move from scalar to vector register.
8947     if (!Ld || NumElts - NumUndefElts != 1)
8948       return SDValue();
8949     unsigned ScalarSize = Ld.getValueSizeInBits();
8950     if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
8951       return SDValue();
8952   }
8953 
8954   bool ConstSplatVal =
8955       (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
8956   bool IsLoad = ISD::isNormalLoad(Ld.getNode());
8957 
8958   // TODO: Handle broadcasts of non-constant sequences.
8959 
8960   // Make sure that all of the users of a non-constant load are from the
8961   // BUILD_VECTOR node.
8962   // FIXME: Is the use count needed for non-constant, non-load case?
8963   if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode()))
8964     return SDValue();
8965 
8966   unsigned ScalarSize = Ld.getValueSizeInBits();
8967   bool IsGE256 = (VT.getSizeInBits() >= 256);
8968 
8969   // When optimizing for size, generate up to 5 extra bytes for a broadcast
8970   // instruction to save 8 or more bytes of constant pool data.
8971   // TODO: If multiple splats are generated to load the same constant,
8972   // it may be detrimental to overall size. There needs to be a way to detect
8973   // that condition to know if this is truly a size win.
8974   bool OptForSize = DAG.shouldOptForSize();
8975 
8976   // Handle broadcasting a single constant scalar from the constant pool
8977   // into a vector.
8978   // On Sandybridge (no AVX2), it is still better to load a constant vector
8979   // from the constant pool and not to broadcast it from a scalar.
8980   // But override that restriction when optimizing for size.
8981   // TODO: Check if splatting is recommended for other AVX-capable CPUs.
8982   if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
8983     EVT CVT = Ld.getValueType();
8984     assert(!CVT.isVector() && "Must not broadcast a vector type");
8985 
8986     // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
8987     // For size optimization, also splat v2f64 and v2i64, and for size opt
8988     // with AVX2, also splat i8 and i16.
8989     // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
8990     if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
8991         (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
8992       const Constant *C = nullptr;
8993       if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
8994         C = CI->getConstantIntValue();
8995       else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
8996         C = CF->getConstantFPValue();
8997 
8998       assert(C && "Invalid constant type");
8999 
9000       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9001       SDValue CP =
9002           DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
9003       Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
9004 
9005       SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9006       SDValue Ops[] = {DAG.getEntryNode(), CP};
9007       MachinePointerInfo MPI =
9008           MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
9009       return DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
9010                                      MPI, Alignment, MachineMemOperand::MOLoad);
9011     }
9012   }
9013 
9014   // Handle AVX2 in-register broadcasts.
9015   if (!IsLoad && Subtarget.hasInt256() &&
9016       (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
9017     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
9018 
9019   // The scalar source must be a normal load.
9020   if (!IsLoad)
9021     return SDValue();
9022 
9023   // Make sure the non-chain result is only used by this build vector.
9024   if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0))
9025     return SDValue();
9026 
9027   if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
9028       (Subtarget.hasVLX() && ScalarSize == 64)) {
9029     auto *LN = cast<LoadSDNode>(Ld);
9030     SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9031     SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
9032     SDValue BCast =
9033         DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
9034                                 LN->getMemoryVT(), LN->getMemOperand());
9035     DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
9036     return BCast;
9037   }
9038 
9039   // The integer check is needed for the 64-bit into 128-bit case so it doesn't
9040   // match f64, since there is no vbroadcastsd xmm instruction.
9041   if (Subtarget.hasInt256() && Ld.getValueType().isInteger() &&
9042       (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) {
9043     auto *LN = cast<LoadSDNode>(Ld);
9044     SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9045     SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
9046     SDValue BCast =
9047         DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
9048                                 LN->getMemoryVT(), LN->getMemOperand());
9049     DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
9050     return BCast;
9051   }
9052 
9053   // Unsupported broadcast.
9054   return SDValue();
9055 }
9056 
9057 /// For an EXTRACT_VECTOR_ELT with a constant index return the real
9058 /// underlying vector and index.
9059 ///
9060 /// Modifies \p ExtractedFromVec to the real vector and returns the real
9061 /// index.
9062 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
9063                                          SDValue ExtIdx) {
9064   int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
9065   if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
9066     return Idx;
9067 
9068   // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
9069   // lowered this:
9070   //   (extract_vector_elt (v8f32 %1), Constant<6>)
9071   // to:
9072   //   (extract_vector_elt (vector_shuffle<2,u,u,u>
9073   //                           (extract_subvector (v8f32 %0), Constant<4>),
9074   //                           undef)
9075   //                       Constant<0>)
9076   // In this case the vector is the extract_subvector expression and the index
9077   // is 2, as specified by the shuffle.
9078   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
9079   SDValue ShuffleVec = SVOp->getOperand(0);
9080   MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
9081   assert(ShuffleVecVT.getVectorElementType() ==
9082          ExtractedFromVec.getSimpleValueType().getVectorElementType());
9083 
9084   int ShuffleIdx = SVOp->getMaskElt(Idx);
9085   if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
9086     ExtractedFromVec = ShuffleVec;
9087     return ShuffleIdx;
9088   }
9089   return Idx;
9090 }
9091 
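/// Try to lower a BUILD_VECTOR whose operands are mostly EXTRACT_VECTOR_ELTs
/// from at most two source vectors of the same type as a single vector
/// shuffle, with the few remaining operands inserted afterwards via
/// INSERT_VECTOR_ELT.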
9092 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
9093   MVT VT = Op.getSimpleValueType();
9094 
9095   // Skip if insert_vec_elt is not supported.
9096   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9097   if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
9098     return SDValue();
9099 
9100   SDLoc DL(Op);
9101   unsigned NumElems = Op.getNumOperands();
9102 
9103   SDValue VecIn1;
9104   SDValue VecIn2;
9105   SmallVector<unsigned, 4> InsertIndices;
9106   SmallVector<int, 8> Mask(NumElems, -1);
9107 
9108   for (unsigned i = 0; i != NumElems; ++i) {
9109     unsigned Opc = Op.getOperand(i).getOpcode();
9110 
9111     if (Opc == ISD::UNDEF)
9112       continue;
9113 
9114     if (Opc != ISD::EXTRACT_VECTOR_ELT) {
9115       // Quit if more than 1 element needs inserting.
9116       if (InsertIndices.size() > 1)
9117         return SDValue();
9118 
9119       InsertIndices.push_back(i);
9120       continue;
9121     }
9122 
9123     SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
9124     SDValue ExtIdx = Op.getOperand(i).getOperand(1);
9125 
9126     // Quit if non-constant index.
9127     if (!isa<ConstantSDNode>(ExtIdx))
9128       return SDValue();
9129     int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
9130 
9131     // Quit if extracted from vector of different type.
9132     if (ExtractedFromVec.getValueType() != VT)
9133       return SDValue();
9134 
9135     if (!VecIn1.getNode())
9136       VecIn1 = ExtractedFromVec;
9137     else if (VecIn1 != ExtractedFromVec) {
9138       if (!VecIn2.getNode())
9139         VecIn2 = ExtractedFromVec;
9140       else if (VecIn2 != ExtractedFromVec)
9141         // Quit if more than 2 vectors to shuffle
9142         return SDValue();
9143     }
9144 
9145     if (ExtractedFromVec == VecIn1)
9146       Mask[i] = Idx;
9147     else if (ExtractedFromVec == VecIn2)
9148       Mask[i] = Idx + NumElems;
9149   }
9150 
9151   if (!VecIn1.getNode())
9152     return SDValue();
9153 
9154   VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
9155   SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
9156 
9157   for (unsigned Idx : InsertIndices)
9158     NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
9159                      DAG.getIntPtrConstant(Idx, DL));
9160 
9161   return NV;
9162 }
9163 
9164 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
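// Splats become a scalar select bitcast to the mask type; otherwise the
// constant elements are packed into an immediate and the non-constant elements
// are inserted individually.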
9165 static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
9166                                      const X86Subtarget &Subtarget) {
9167 
9168   MVT VT = Op.getSimpleValueType();
9169   assert((VT.getVectorElementType() == MVT::i1) &&
9170          "Unexpected type in LowerBUILD_VECTORvXi1!");
9171 
9172   SDLoc dl(Op);
9173   if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
9174       ISD::isBuildVectorAllOnes(Op.getNode()))
9175     return Op;
9176 
9177   uint64_t Immediate = 0;
9178   SmallVector<unsigned, 16> NonConstIdx;
9179   bool IsSplat = true;
9180   bool HasConstElts = false;
9181   int SplatIdx = -1;
9182   for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
9183     SDValue In = Op.getOperand(idx);
9184     if (In.isUndef())
9185       continue;
9186     if (auto *InC = dyn_cast<ConstantSDNode>(In)) {
9187       Immediate |= (InC->getZExtValue() & 0x1) << idx;
9188       HasConstElts = true;
9189     } else {
9190       NonConstIdx.push_back(idx);
9191     }
9192     if (SplatIdx < 0)
9193       SplatIdx = idx;
9194     else if (In != Op.getOperand(SplatIdx))
9195       IsSplat = false;
9196   }
9197 
9198   // For a splat, use (select i1 splat_elt, all-ones, all-zeroes).
9199   if (IsSplat) {
9200     // The build_vector allows the scalar element to be larger than the vector
9201     // element type. We need to mask it to use as a condition unless we know
9202     // the upper bits are zero.
9203     // FIXME: Use computeKnownBits instead of checking specific opcode?
9204     SDValue Cond = Op.getOperand(SplatIdx);
9205     assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
9206     if (Cond.getOpcode() != ISD::SETCC)
9207       Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
9208                          DAG.getConstant(1, dl, MVT::i8));
9209 
9210     // Perform the select in the scalar domain so we can use cmov.
9211     if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
9212       SDValue Select = DAG.getSelect(dl, MVT::i32, Cond,
9213                                      DAG.getAllOnesConstant(dl, MVT::i32),
9214                                      DAG.getConstant(0, dl, MVT::i32));
9215       Select = DAG.getBitcast(MVT::v32i1, Select);
9216       return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select);
9217     } else {
9218       MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
9219       SDValue Select = DAG.getSelect(dl, ImmVT, Cond,
9220                                      DAG.getAllOnesConstant(dl, ImmVT),
9221                                      DAG.getConstant(0, dl, ImmVT));
9222       MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
9223       Select = DAG.getBitcast(VecVT, Select);
9224       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select,
9225                          DAG.getIntPtrConstant(0, dl));
9226     }
9227   }
9228 
9229   // Insert elements one by one.
9230   SDValue DstVec;
9231   if (HasConstElts) {
9232     if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
9233       SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32);
9234       SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32);
9235       ImmL = DAG.getBitcast(MVT::v32i1, ImmL);
9236       ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
9237       DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
9238     } else {
9239       MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
9240       SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
9241       MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
9242       DstVec = DAG.getBitcast(VecVT, Imm);
9243       DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec,
9244                            DAG.getIntPtrConstant(0, dl));
9245     }
9246   } else
9247     DstVec = DAG.getUNDEF(VT);
9248 
9249   for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
9250     unsigned InsertIdx = NonConstIdx[i];
9251     DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
9252                          Op.getOperand(InsertIdx),
9253                          DAG.getIntPtrConstant(InsertIdx, dl));
9254   }
9255   return DstVec;
9256 }
9257 
9258 LLVM_ATTRIBUTE_UNUSED static bool isHorizOp(unsigned Opcode) {
9259   switch (Opcode) {
9260   case X86ISD::PACKSS:
9261   case X86ISD::PACKUS:
9262   case X86ISD::FHADD:
9263   case X86ISD::FHSUB:
9264   case X86ISD::HADD:
9265   case X86ISD::HSUB:
9266     return true;
9267   }
9268   return false;
9269 }
9270 
9271 /// This is a helper function of LowerToHorizontalOp().
9272 /// This function checks whether the input build_vector \p N implements a
9273 /// 128-bit partial horizontal operation on a 256-bit vector, but that operation
9274 /// may not match the layout of an x86 256-bit horizontal instruction.
9275 /// In other words, if this returns true, then some extraction/insertion will
9276 /// be required to produce a valid horizontal instruction.
9277 ///
9278 /// Parameter \p Opcode defines the kind of horizontal operation to match.
9279 /// For example, if \p Opcode is equal to ISD::ADD, then this function
9280 /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
9281 /// is equal to ISD::SUB, then this function checks if this is a horizontal
9282 /// arithmetic sub.
9283 ///
9284 /// This function only analyzes elements of \p N whose indices are
9285 /// in range [BaseIdx, LastIdx).
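///
/// For example, with \p Opcode == ISD::ADD, consecutive elements of the form
///   (add (extract_vector_elt A, 0), (extract_vector_elt A, 1)),
///   (add (extract_vector_elt A, 2), (extract_vector_elt A, 3))
/// match a partial horizontal add with both inputs taken from A.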
9286 ///
9287 /// TODO: This function was originally used to match both real and fake partial
9288 /// horizontal operations, but the index-matching logic is incorrect for that.
9289 /// See the corrected implementation in isHopBuildVector(). Can we reduce this
9290 /// code because it is only used for partial h-op matching now?
9291 static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
9292                                   SelectionDAG &DAG,
9293                                   unsigned BaseIdx, unsigned LastIdx,
9294                                   SDValue &V0, SDValue &V1) {
9295   EVT VT = N->getValueType(0);
9296   assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
9297   assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
9298   assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
9299          "Invalid Vector in input!");
9300 
9301   bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
9302   bool CanFold = true;
9303   unsigned ExpectedVExtractIdx = BaseIdx;
9304   unsigned NumElts = LastIdx - BaseIdx;
9305   V0 = DAG.getUNDEF(VT);
9306   V1 = DAG.getUNDEF(VT);
9307 
9308   // Check if N implements a horizontal binop.
9309   for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
9310     SDValue Op = N->getOperand(i + BaseIdx);
9311 
9312     // Skip UNDEFs.
9313     if (Op->isUndef()) {
9314       // Update the expected vector extract index.
9315       if (i * 2 == NumElts)
9316         ExpectedVExtractIdx = BaseIdx;
9317       ExpectedVExtractIdx += 2;
9318       continue;
9319     }
9320 
9321     CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
9322 
9323     if (!CanFold)
9324       break;
9325 
9326     SDValue Op0 = Op.getOperand(0);
9327     SDValue Op1 = Op.getOperand(1);
9328 
9329     // Try to match the following pattern:
9330     // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
9331     CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
9332         Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
9333         Op0.getOperand(0) == Op1.getOperand(0) &&
9334         isa<ConstantSDNode>(Op0.getOperand(1)) &&
9335         isa<ConstantSDNode>(Op1.getOperand(1)));
9336     if (!CanFold)
9337       break;
9338 
9339     unsigned I0 = Op0.getConstantOperandVal(1);
9340     unsigned I1 = Op1.getConstantOperandVal(1);
9341 
9342     if (i * 2 < NumElts) {
9343       if (V0.isUndef()) {
9344         V0 = Op0.getOperand(0);
9345         if (V0.getValueType() != VT)
9346           return false;
9347       }
9348     } else {
9349       if (V1.isUndef()) {
9350         V1 = Op0.getOperand(0);
9351         if (V1.getValueType() != VT)
9352           return false;
9353       }
9354       if (i * 2 == NumElts)
9355         ExpectedVExtractIdx = BaseIdx;
9356     }
9357 
9358     SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
9359     if (I0 == ExpectedVExtractIdx)
9360       CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
9361     else if (IsCommutable && I1 == ExpectedVExtractIdx) {
9362       // Try to match the following dag sequence:
9363       // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
9364       CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
9365     } else
9366       CanFold = false;
9367 
9368     ExpectedVExtractIdx += 2;
9369   }
9370 
9371   return CanFold;
9372 }
9373 
9374 /// Emit a sequence of two 128-bit horizontal add/sub followed by
9375 /// a concat_vector.
9376 ///
9377 /// This is a helper function of LowerToHorizontalOp().
9378 /// This function expects two 256-bit vectors called V0 and V1.
9379 /// At first, each vector is split into two separate 128-bit vectors.
9380 /// Then, the resulting 128-bit vectors are used to implement two
9381 /// horizontal binary operations.
9382 ///
9383 /// The kind of horizontal binary operation is defined by \p X86Opcode.
9384 ///
9385 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed as inputs to
9386 /// the two new horizontal binops.
9387 /// When Mode is set, the first horizontal binop dag node takes as input
9388 /// the lower 128-bit of V0 and the upper 128-bit of V0. The second
9389 /// horizontal binop dag node takes as input the lower 128-bit of V1
9390 /// and the upper 128-bit of V1.
9391 ///   Example:
9392 ///     HADD V0_LO, V0_HI
9393 ///     HADD V1_LO, V1_HI
9394 ///
9395 /// Otherwise, the first horizontal binop dag node takes as input the lower
9396 /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
9397 /// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
9398 ///   Example:
9399 ///     HADD V0_LO, V1_LO
9400 ///     HADD V0_HI, V1_HI
9401 ///
9402 /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
9403 /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
9404 /// the upper 128-bits of the result.
9405 static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
9406                                      const SDLoc &DL, SelectionDAG &DAG,
9407                                      unsigned X86Opcode, bool Mode,
9408                                      bool isUndefLO, bool isUndefHI) {
9409   MVT VT = V0.getSimpleValueType();
9410   assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
9411          "Invalid nodes in input!");
9412 
9413   unsigned NumElts = VT.getVectorNumElements();
9414   SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
9415   SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
9416   SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
9417   SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
9418   MVT NewVT = V0_LO.getSimpleValueType();
9419 
9420   SDValue LO = DAG.getUNDEF(NewVT);
9421   SDValue HI = DAG.getUNDEF(NewVT);
9422 
9423   if (Mode) {
9424     // Don't emit a horizontal binop if the result is expected to be UNDEF.
9425     if (!isUndefLO && !V0->isUndef())
9426       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
9427     if (!isUndefHI && !V1->isUndef())
9428       HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
9429   } else {
9430     // Don't emit a horizontal binop if the result is expected to be UNDEF.
9431     if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
9432       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
9433 
9434     if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
9435       HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
9436   }
9437 
9438   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
9439 }
9440 
9441 /// Returns true iff \p BV builds a vector with the result equivalent to
9442 /// the result of ADDSUB/SUBADD operation.
9443 /// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
9444 /// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
9445 /// \p Opnd0 and \p Opnd1.
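///
/// For example, a v4f32 build_vector of
///   (fsub (extract A, 0), (extract B, 0)), (fadd (extract A, 1), (extract B, 1)),
///   (fsub (extract A, 2), (extract B, 2)), (fadd (extract A, 3), (extract B, 3))
/// matches ADDSUB with \p Opnd0 = A, \p Opnd1 = B and \p IsSubAdd == false.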
9446 static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
9447                              const X86Subtarget &Subtarget, SelectionDAG &DAG,
9448                              SDValue &Opnd0, SDValue &Opnd1,
9449                              unsigned &NumExtracts,
9450                              bool &IsSubAdd) {
9451 
9452   MVT VT = BV->getSimpleValueType(0);
9453   if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
9454     return false;
9455 
9456   unsigned NumElts = VT.getVectorNumElements();
9457   SDValue InVec0 = DAG.getUNDEF(VT);
9458   SDValue InVec1 = DAG.getUNDEF(VT);
9459 
9460   NumExtracts = 0;
9461 
9462   // Odd-numbered elements in the input build vector are obtained from
9463   // adding/subtracting two integer/float elements.
9464   // Even-numbered elements in the input build vector are obtained from
9465   // subtracting/adding two integer/float elements.
9466   unsigned Opc[2] = {0, 0};
9467   for (unsigned i = 0, e = NumElts; i != e; ++i) {
9468     SDValue Op = BV->getOperand(i);
9469 
9470     // Skip 'undef' values.
9471     unsigned Opcode = Op.getOpcode();
9472     if (Opcode == ISD::UNDEF)
9473       continue;
9474 
9475     // Early exit if we found an unexpected opcode.
9476     if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
9477       return false;
9478 
9479     SDValue Op0 = Op.getOperand(0);
9480     SDValue Op1 = Op.getOperand(1);
9481 
9482     // Try to match the following pattern:
9483     // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
9484     // Early exit if we cannot match that sequence.
9485     if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9486         Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9487         !isa<ConstantSDNode>(Op0.getOperand(1)) ||
9488         Op0.getOperand(1) != Op1.getOperand(1))
9489       return false;
9490 
9491     unsigned I0 = Op0.getConstantOperandVal(1);
9492     if (I0 != i)
9493       return false;
9494 
9495     // We found a valid add/sub node; make sure it's the same opcode as previous
9496     // elements for this parity.
9497     if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
9498       return false;
9499     Opc[i % 2] = Opcode;
9500 
9501     // Update InVec0 and InVec1.
9502     if (InVec0.isUndef()) {
9503       InVec0 = Op0.getOperand(0);
9504       if (InVec0.getSimpleValueType() != VT)
9505         return false;
9506     }
9507     if (InVec1.isUndef()) {
9508       InVec1 = Op1.getOperand(0);
9509       if (InVec1.getSimpleValueType() != VT)
9510         return false;
9511     }
9512 
9513     // Make sure that the operands of each add/sub node always
9514     // come from the same pair of vectors.
9515     if (InVec0 != Op0.getOperand(0)) {
9516       if (Opcode == ISD::FSUB)
9517         return false;
9518 
9519       // FADD is commutable. Try to commute the operands
9520       // and then test again.
9521       std::swap(Op0, Op1);
9522       if (InVec0 != Op0.getOperand(0))
9523         return false;
9524     }
9525 
9526     if (InVec1 != Op1.getOperand(0))
9527       return false;
9528 
9529     // Increment the number of extractions done.
9530     ++NumExtracts;
9531   }
9532 
9533   // Ensure we have found an opcode for both parities and that they are
9534   // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
9535   // inputs are undef.
9536   if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
9537       InVec0.isUndef() || InVec1.isUndef())
9538     return false;
9539 
9540   IsSubAdd = Opc[0] == ISD::FADD;
9541 
9542   Opnd0 = InVec0;
9543   Opnd1 = InVec1;
9544   return true;
9545 }
9546 
9547 /// Returns true if it is possible to fold MUL and an idiom that has already been
9548 /// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
9549 /// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
9550 /// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.
9551 ///
9552 /// Prior to calling this function it should be known that there is some
9553 /// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
9554 /// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
9555 /// before replacement of such SDNode with ADDSUB operation. Thus the number
9556 /// of \p Opnd0 uses is expected to be equal to 2.
9557 /// For example, this function may be called for the following IR:
9558 ///    %AB = fmul fast <2 x double> %A, %B
9559 ///    %Sub = fsub fast <2 x double> %AB, %C
9560 ///    %Add = fadd fast <2 x double> %AB, %C
9561 ///    %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
9562 ///                            <2 x i32> <i32 0, i32 3>
9563 /// There is a def for %Addsub here, which potentially can be replaced by
9564 /// X86ISD::ADDSUB operation:
9565 ///    %Addsub = X86ISD::ADDSUB %AB, %C
9566 /// and such ADDSUB can further be replaced with FMADDSUB:
9567 ///    %Addsub = FMADDSUB %A, %B, %C.
9568 ///
9569 /// The main reason why this method is called before the replacement of the
9570 /// recognized ADDSUB idiom with ADDSUB operation is that such replacement
9571 /// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
9572 /// FMADDSUB is.
9573 static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
9574                                  SelectionDAG &DAG,
9575                                  SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2,
9576                                  unsigned ExpectedUses) {
9577   if (Opnd0.getOpcode() != ISD::FMUL ||
9578       !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
9579     return false;
9580 
9581   // FIXME: These checks must match the similar ones in
9582   // DAGCombiner::visitFADDForFMACombine. It would be good to have one
9583   // function that would answer if it is Ok to fuse MUL + ADD to FMADD
9584   // or MUL + ADDSUB to FMADDSUB.
9585   const TargetOptions &Options = DAG.getTarget().Options;
9586   bool AllowFusion =
9587       (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
9588   if (!AllowFusion)
9589     return false;
9590 
9591   Opnd2 = Opnd1;
9592   Opnd1 = Opnd0.getOperand(1);
9593   Opnd0 = Opnd0.getOperand(0);
9594 
9595   return true;
9596 }
9597 
9598 /// Try to fold a build_vector that performs an 'addsub', 'fmaddsub', or
9599 /// 'fsubadd' operation into an X86ISD::ADDSUB, X86ISD::FMADDSUB, or
9600 /// X86ISD::FMSUBADD node.
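///
/// For illustration (a sketch of one shape this can match, with placeholder
/// vectors A and B), a v4f32 build_vector of the form
///    (build_vector (fsub (extract_elt A, 0), (extract_elt B, 0)),
///                  (fadd (extract_elt A, 1), (extract_elt B, 1)),
///                  (fsub (extract_elt A, 2), (extract_elt B, 2)),
///                  (fadd (extract_elt A, 3), (extract_elt B, 3)))
/// is recognized by isAddSubOrSubAdd and emitted as (X86ISD::ADDSUB A, B),
/// i.e. a single addsubps.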
9601 static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
9602                                        const X86Subtarget &Subtarget,
9603                                        SelectionDAG &DAG) {
9604   SDValue Opnd0, Opnd1;
9605   unsigned NumExtracts;
9606   bool IsSubAdd;
9607   if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts,
9608                         IsSubAdd))
9609     return SDValue();
9610 
9611   MVT VT = BV->getSimpleValueType(0);
9612   SDLoc DL(BV);
9613 
9614   // Try to generate X86ISD::FMADDSUB node here.
9615   SDValue Opnd2;
9616   if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) {
9617     unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
9618     return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
9619   }
9620 
9621   // We only support ADDSUB.
9622   if (IsSubAdd)
9623     return SDValue();
9624 
9625   // Do not generate X86ISD::ADDSUB node for 512-bit types even though
9626   // the ADDSUB idiom has been successfully recognized. There are no known
9627   // X86 targets with 512-bit ADDSUB instructions!
9628   // 512-bit ADDSUB idiom recognition was needed only as part of FMADDSUB idiom
9629   // recognition.
9630   if (VT.is512BitVector())
9631     return SDValue();
9632 
9633   return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
9634 }
9635 
9636 static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
9637                              unsigned &HOpcode, SDValue &V0, SDValue &V1) {
9638   // Initialize outputs to known values.
9639   MVT VT = BV->getSimpleValueType(0);
9640   HOpcode = ISD::DELETED_NODE;
9641   V0 = DAG.getUNDEF(VT);
9642   V1 = DAG.getUNDEF(VT);
9643 
9644   // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
9645   // half of the result is calculated independently from the 128-bit halves of
9646   // the inputs, so that makes the index-checking logic below more complicated.
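  // For example (v8f32 shown; other types follow the same per-128-bit-lane
  // pattern), HADD of inputs A and B produces:
  //   { A[0]+A[1], A[2]+A[3], B[0]+B[1], B[2]+B[3],
  //     A[4]+A[5], A[6]+A[7], B[4]+B[5], B[6]+B[7] }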
9647   unsigned NumElts = VT.getVectorNumElements();
9648   unsigned GenericOpcode = ISD::DELETED_NODE;
9649   unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
9650   unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
9651   unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
9652   for (unsigned i = 0; i != Num128BitChunks; ++i) {
9653     for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
9654       // Ignore undef elements.
9655       SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
9656       if (Op.isUndef())
9657         continue;
9658 
9659       // If there's an opcode mismatch, we're done.
9660       if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
9661         return false;
9662 
9663       // Initialize horizontal opcode.
9664       if (HOpcode == ISD::DELETED_NODE) {
9665         GenericOpcode = Op.getOpcode();
9666         switch (GenericOpcode) {
9667         case ISD::ADD: HOpcode = X86ISD::HADD; break;
9668         case ISD::SUB: HOpcode = X86ISD::HSUB; break;
9669         case ISD::FADD: HOpcode = X86ISD::FHADD; break;
9670         case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
9671         default: return false;
9672         }
9673       }
9674 
9675       SDValue Op0 = Op.getOperand(0);
9676       SDValue Op1 = Op.getOperand(1);
9677       if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9678           Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9679           Op0.getOperand(0) != Op1.getOperand(0) ||
9680           !isa<ConstantSDNode>(Op0.getOperand(1)) ||
9681           !isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
9682         return false;
9683 
9684       // The source vector is chosen based on which 64-bit half of the
9685       // destination vector is being calculated.
9686       if (j < NumEltsIn64Bits) {
9687         if (V0.isUndef())
9688           V0 = Op0.getOperand(0);
9689       } else {
9690         if (V1.isUndef())
9691           V1 = Op0.getOperand(0);
9692       }
9693 
9694       SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
9695       if (SourceVec != Op0.getOperand(0))
9696         return false;
9697 
9698       // op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
9699       unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
9700       unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
9701       unsigned ExpectedIndex = i * NumEltsIn128Bits +
9702                                (j % NumEltsIn64Bits) * 2;
9703       if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
9704         continue;
9705 
9706       // If this is not a commutative op, this does not match.
9707       if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
9708         return false;
9709 
9710       // Addition is commutative, so try swapping the extract indexes.
9711       // op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
9712       if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
9713         continue;
9714 
9715       // Extract indexes do not match horizontal requirement.
9716       return false;
9717     }
9718   }
9719   // We matched. Opcode and operands are returned by reference as arguments.
9720   return true;
9721 }
9722 
9723 static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
9724                                     SelectionDAG &DAG, unsigned HOpcode,
9725                                     SDValue V0, SDValue V1) {
9726   // If either input vector is not the same size as the build vector,
9727   // extract/insert the low bits to the correct size.
9728   // This is free (examples: zmm --> xmm, xmm --> ymm).
9729   MVT VT = BV->getSimpleValueType(0);
9730   unsigned Width = VT.getSizeInBits();
9731   if (V0.getValueSizeInBits() > Width)
9732     V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), Width);
9733   else if (V0.getValueSizeInBits() < Width)
9734     V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, SDLoc(BV), Width);
9735 
9736   if (V1.getValueSizeInBits() > Width)
9737     V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), Width);
9738   else if (V1.getValueSizeInBits() < Width)
9739     V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, SDLoc(BV), Width);
9740 
9741   unsigned NumElts = VT.getVectorNumElements();
9742   APInt DemandedElts = APInt::getAllOnesValue(NumElts);
9743   for (unsigned i = 0; i != NumElts; ++i)
9744     if (BV->getOperand(i).isUndef())
9745       DemandedElts.clearBit(i);
9746 
9747   // If we don't need the upper xmm, then perform as an xmm hop.
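  // For example (illustrative), a v8f32 build_vector whose upper four elements
  // are undef can be performed as a v4f32 horizontal op on the low xmm halves
  // of V0/V1, widened back to ymm with an undef upper half.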
9748   unsigned HalfNumElts = NumElts / 2;
9749   if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
9750     MVT HalfVT = VT.getHalfNumVectorElementsVT();
9751     V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), 128);
9752     V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), 128);
9753     SDValue Half = DAG.getNode(HOpcode, SDLoc(BV), HalfVT, V0, V1);
9754     return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, SDLoc(BV), 256);
9755   }
9756 
9757   return DAG.getNode(HOpcode, SDLoc(BV), VT, V0, V1);
9758 }
9759 
9760 /// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
9761 static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
9762                                    const X86Subtarget &Subtarget,
9763                                    SelectionDAG &DAG) {
9764   // We need at least 2 non-undef elements to make this worthwhile by default.
9765   unsigned NumNonUndefs =
9766       count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
9767   if (NumNonUndefs < 2)
9768     return SDValue();
9769 
9770   // There are 4 sets of horizontal math operations distinguished by type:
9771   // int/FP at 128-bit/256-bit. Each type was introduced with a different
9772   // subtarget feature. Try to match those "native" patterns first.
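  // (For reference: SSE3 provides haddps/hsubps/haddpd/hsubpd, SSSE3 provides
  // phaddw/phaddd/phsubw/phsubd, and AVX/AVX2 add the 256-bit FP/integer
  // forms respectively.)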
9773   MVT VT = BV->getSimpleValueType(0);
9774   if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
9775       ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
9776       ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
9777       ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
9778     unsigned HOpcode;
9779     SDValue V0, V1;
9780     if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
9781       return getHopForBuildVector(BV, DAG, HOpcode, V0, V1);
9782   }
9783 
9784   // Try harder to match 256-bit ops by using extract/concat.
9785   if (!Subtarget.hasAVX() || !VT.is256BitVector())
9786     return SDValue();
9787 
9788   // Count the number of UNDEF operands in the input build_vector.
9789   unsigned NumElts = VT.getVectorNumElements();
9790   unsigned Half = NumElts / 2;
9791   unsigned NumUndefsLO = 0;
9792   unsigned NumUndefsHI = 0;
9793   for (unsigned i = 0, e = Half; i != e; ++i)
9794     if (BV->getOperand(i)->isUndef())
9795       NumUndefsLO++;
9796 
9797   for (unsigned i = Half, e = NumElts; i != e; ++i)
9798     if (BV->getOperand(i)->isUndef())
9799       NumUndefsHI++;
9800 
9801   SDLoc DL(BV);
9802   SDValue InVec0, InVec1;
9803   if (VT == MVT::v8i32 || VT == MVT::v16i16) {
9804     SDValue InVec2, InVec3;
9805     unsigned X86Opcode;
9806     bool CanFold = true;
9807 
9808     if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
9809         isHorizontalBinOpPart(BV, ISD::ADD, DAG, Half, NumElts, InVec2,
9810                               InVec3) &&
9811         ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
9812         ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
9813       X86Opcode = X86ISD::HADD;
9814     else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, Half, InVec0,
9815                                    InVec1) &&
9816              isHorizontalBinOpPart(BV, ISD::SUB, DAG, Half, NumElts, InVec2,
9817                                    InVec3) &&
9818              ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
9819              ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
9820       X86Opcode = X86ISD::HSUB;
9821     else
9822       CanFold = false;
9823 
9824     if (CanFold) {
9825       // Do not try to expand this build_vector into a pair of horizontal
9826       // add/sub if we can emit a pair of scalar add/sub.
9827       if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
9828         return SDValue();
9829 
9830       // Convert this build_vector into a pair of horizontal binops followed by
9831       // a concat vector. We must adjust the outputs from the partial horizontal
9832       // matching calls above to account for undefined vector halves.
9833       SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
9834       SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
9835       assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
9836       bool isUndefLO = NumUndefsLO == Half;
9837       bool isUndefHI = NumUndefsHI == Half;
9838       return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
9839                                    isUndefHI);
9840     }
9841   }
9842 
9843   if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
9844       VT == MVT::v16i16) {
9845     unsigned X86Opcode;
9846     if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
9847       X86Opcode = X86ISD::HADD;
9848     else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, NumElts, InVec0,
9849                                    InVec1))
9850       X86Opcode = X86ISD::HSUB;
9851     else if (isHorizontalBinOpPart(BV, ISD::FADD, DAG, 0, NumElts, InVec0,
9852                                    InVec1))
9853       X86Opcode = X86ISD::FHADD;
9854     else if (isHorizontalBinOpPart(BV, ISD::FSUB, DAG, 0, NumElts, InVec0,
9855                                    InVec1))
9856       X86Opcode = X86ISD::FHSUB;
9857     else
9858       return SDValue();
9859 
9860     // Don't try to expand this build_vector into a pair of horizontal add/sub
9861     // if we can simply emit a pair of scalar add/sub.
9862     if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
9863       return SDValue();
9864 
9865     // Convert this build_vector into two horizontal add/sub followed by
9866     // a concat vector.
9867     bool isUndefLO = NumUndefsLO == Half;
9868     bool isUndefHI = NumUndefsHI == Half;
9869     return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
9870                                  isUndefLO, isUndefHI);
9871   }
9872 
9873   return SDValue();
9874 }
9875 
9876 static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
9877                           SelectionDAG &DAG);
9878 
9879 /// If a BUILD_VECTOR's source elements all apply the same bit operation and
9880 /// one of their operands is constant, lower to a pair of BUILD_VECTOR and
9881 /// just apply the bit to the vectors.
9882 /// NOTE: It's not in our interest to grow this into a general purpose
9883 /// vectorizer, but enough scalar bit operations are created by the later
9884 /// legalization + scalarization stages to need basic support.
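///
/// For illustration (a sketch of one matched shape, with placeholder scalars
/// X0..X3), a v4i32 build_vector of the form
///    (build_vector (shl X0, 2), (shl X1, 2), (shl X2, 2), (shl X3, 2))
/// becomes
///    (shl (build_vector X0, X1, X2, X3), (build_vector 2, 2, 2, 2))
/// and the resulting vector shift is then lowered immediately via LowerShift.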
9885 static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
9886                                        const X86Subtarget &Subtarget,
9887                                        SelectionDAG &DAG) {
9888   SDLoc DL(Op);
9889   MVT VT = Op->getSimpleValueType(0);
9890   unsigned NumElems = VT.getVectorNumElements();
9891   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9892 
9893   // Check that all elements have the same opcode.
9894   // TODO: Should we allow UNDEFS and if so how many?
9895   unsigned Opcode = Op->getOperand(0).getOpcode();
9896   for (unsigned i = 1; i < NumElems; ++i)
9897     if (Opcode != Op->getOperand(i).getOpcode())
9898       return SDValue();
9899 
9900   // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
9901   bool IsShift = false;
9902   switch (Opcode) {
9903   default:
9904     return SDValue();
9905   case ISD::SHL:
9906   case ISD::SRL:
9907   case ISD::SRA:
9908     IsShift = true;
9909     break;
9910   case ISD::AND:
9911   case ISD::XOR:
9912   case ISD::OR:
9913     // Don't do this if the buildvector is a splat - we'd replace one
9914     // constant with an entire vector.
9915     if (Op->getSplatValue())
9916       return SDValue();
9917     if (!TLI.isOperationLegalOrPromote(Opcode, VT))
9918       return SDValue();
9919     break;
9920   }
9921 
9922   SmallVector<SDValue, 4> LHSElts, RHSElts;
9923   for (SDValue Elt : Op->ops()) {
9924     SDValue LHS = Elt.getOperand(0);
9925     SDValue RHS = Elt.getOperand(1);
9926 
9927     // We expect the canonicalized RHS operand to be the constant.
9928     if (!isa<ConstantSDNode>(RHS))
9929       return SDValue();
9930 
9931     // Extend shift amounts.
9932     if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) {
9933       if (!IsShift)
9934         return SDValue();
9935       RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType());
9936     }
9937 
9938     LHSElts.push_back(LHS);
9939     RHSElts.push_back(RHS);
9940   }
9941 
9942   // Limit to shifts by uniform immediates.
9943   // TODO: Only accept vXi8/vXi64 special cases?
9944   // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
9945   if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
9946     return SDValue();
9947 
9948   SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
9949   SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
9950   SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
9951 
9952   if (!IsShift)
9953     return Res;
9954 
9955   // Immediately lower the shift to ensure the constant build vector doesn't
9956   // get converted to a constant pool before the shift is lowered.
9957   return LowerShift(Res, Subtarget, DAG);
9958 }
9959 
9960 /// Create a vector constant without a load. SSE/AVX provide the bare minimum
9961 /// functionality to do this, so it's all zeros, all ones, or some derivation
9962 /// that is cheap to calculate.
9963 static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
9964                                          const X86Subtarget &Subtarget) {
9965   SDLoc DL(Op);
9966   MVT VT = Op.getSimpleValueType();
9967 
9968   // Vectors containing all zeros can be matched by pxor and xorps.
9969   if (ISD::isBuildVectorAllZeros(Op.getNode()))
9970     return Op;
9971 
9972   // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
9973   // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
9974   // vpcmpeqd on 256-bit vectors.
9975   if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
9976     if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
9977       return Op;
9978 
9979     return getOnesVector(VT, DAG, DL);
9980   }
9981 
9982   return SDValue();
9983 }
9984 
9985 /// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
9986 /// from a vector of source values and a vector of extraction indices.
9987 /// The vectors might be manipulated to match the type of the permute op.
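///
/// For illustration, a v4i32 permute on an SSSE3-only target is performed as
/// a v16i8 PSHUFB: the i32 indices are first scaled to per-byte indices (see
/// ScaleIndices below) and the shuffle result is bitcast back to v4i32.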
9988 static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
9989                                      SDLoc &DL, SelectionDAG &DAG,
9990                                      const X86Subtarget &Subtarget) {
9991   MVT ShuffleVT = VT;
9992   EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
9993   unsigned NumElts = VT.getVectorNumElements();
9994   unsigned SizeInBits = VT.getSizeInBits();
9995 
9996   // Adjust IndicesVec to match VT size.
9997   assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
9998          "Illegal variable permute mask size");
9999   if (IndicesVec.getValueType().getVectorNumElements() > NumElts) {
10000     // Narrow/widen the indices vector to the correct size.
10001     if (IndicesVec.getValueSizeInBits() > SizeInBits)
10002       IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
10003                                     NumElts * VT.getScalarSizeInBits());
10004     else if (IndicesVec.getValueSizeInBits() < SizeInBits)
10005       IndicesVec = widenSubVector(IndicesVec, false, Subtarget, DAG,
10006                                   SDLoc(IndicesVec), SizeInBits);
10007     // Zero-extend the index elements within the vector.
10008     if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
10009       IndicesVec = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(IndicesVec),
10010                                IndicesVT, IndicesVec);
10011   }
10012   IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
10013 
10014   // Handle a SrcVec whose size doesn't match the VT size.
10015   if (SrcVec.getValueSizeInBits() != SizeInBits) {
10016     if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
10017       // Handle larger SrcVec by treating it as a larger permute.
10018       unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
10019       VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
10020       IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
10021       IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
10022                                   Subtarget, DAG, SDLoc(IndicesVec));
10023       SDValue NewSrcVec =
10024           createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
10025       if (NewSrcVec)
10026         return extractSubVector(NewSrcVec, 0, DAG, DL, SizeInBits);
10027       return SDValue();
10028     } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
10029       // Widen smaller SrcVec to match VT.
10030       SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
10031     } else
10032       return SDValue();
10033   }
10034 
10035   auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
10036     assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
10037     EVT SrcVT = Idx.getValueType();
10038     unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
10039     uint64_t IndexScale = 0;
10040     uint64_t IndexOffset = 0;
10041 
10042     // If we're scaling a smaller permute op, then we need to repeat the
10043     // indices, scaling and offsetting them as well.
10044     // e.g. v4i32 -> v16i8 (Scale = 4)
10045     // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
10046     // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
10047     for (uint64_t i = 0; i != Scale; ++i) {
10048       IndexScale |= Scale << (i * NumDstBits);
10049       IndexOffset |= i << (i * NumDstBits);
10050     }
10051 
10052     Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
10053                       DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
10054     Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
10055                       DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
10056     return Idx;
10057   };
10058 
10059   unsigned Opcode = 0;
10060   switch (VT.SimpleTy) {
10061   default:
10062     break;
10063   case MVT::v16i8:
10064     if (Subtarget.hasSSSE3())
10065       Opcode = X86ISD::PSHUFB;
10066     break;
10067   case MVT::v8i16:
10068     if (Subtarget.hasVLX() && Subtarget.hasBWI())
10069       Opcode = X86ISD::VPERMV;
10070     else if (Subtarget.hasSSSE3()) {
10071       Opcode = X86ISD::PSHUFB;
10072       ShuffleVT = MVT::v16i8;
10073     }
10074     break;
10075   case MVT::v4f32:
10076   case MVT::v4i32:
10077     if (Subtarget.hasAVX()) {
10078       Opcode = X86ISD::VPERMILPV;
10079       ShuffleVT = MVT::v4f32;
10080     } else if (Subtarget.hasSSSE3()) {
10081       Opcode = X86ISD::PSHUFB;
10082       ShuffleVT = MVT::v16i8;
10083     }
10084     break;
10085   case MVT::v2f64:
10086   case MVT::v2i64:
10087     if (Subtarget.hasAVX()) {
10088       // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
10089       IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
10090       Opcode = X86ISD::VPERMILPV;
10091       ShuffleVT = MVT::v2f64;
10092     } else if (Subtarget.hasSSE41()) {
10093       // SSE41 can compare v2i64 - select between indices 0 and 1.
10094       return DAG.getSelectCC(
10095           DL, IndicesVec,
10096           getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
10097           DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
10098           DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
10099           ISD::CondCode::SETEQ);
10100     }
10101     break;
10102   case MVT::v32i8:
10103     if (Subtarget.hasVLX() && Subtarget.hasVBMI())
10104       Opcode = X86ISD::VPERMV;
10105     else if (Subtarget.hasXOP()) {
10106       SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
10107       SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
10108       SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
10109       SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
10110       return DAG.getNode(
10111           ISD::CONCAT_VECTORS, DL, VT,
10112           DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
10113           DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
10114     } else if (Subtarget.hasAVX()) {
10115       SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
10116       SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
10117       SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
10118       SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
10119       auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
10120                               ArrayRef<SDValue> Ops) {
10121         // Permute Lo and Hi and then select based on index range.
10122         // This works as PSHUFB uses bits[3:0] to permute elements and we
10123         // don't care about bit[7] as it's just an index vector.
10124         SDValue Idx = Ops[2];
10125         EVT VT = Idx.getValueType();
10126         return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
10127                                DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
10128                                DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
10129                                ISD::CondCode::SETGT);
10130       };
10131       SDValue Ops[] = {LoLo, HiHi, IndicesVec};
10132       return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
10133                               PSHUFBBuilder);
10134     }
10135     break;
10136   case MVT::v16i16:
10137     if (Subtarget.hasVLX() && Subtarget.hasBWI())
10138       Opcode = X86ISD::VPERMV;
10139     else if (Subtarget.hasAVX()) {
10140       // Scale to v32i8 and perform as v32i8.
10141       IndicesVec = ScaleIndices(IndicesVec, 2);
10142       return DAG.getBitcast(
10143           VT, createVariablePermute(
10144                   MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
10145                   DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
10146     }
10147     break;
10148   case MVT::v8f32:
10149   case MVT::v8i32:
10150     if (Subtarget.hasAVX2())
10151       Opcode = X86ISD::VPERMV;
10152     else if (Subtarget.hasAVX()) {
10153       SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
10154       SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
10155                                           {0, 1, 2, 3, 0, 1, 2, 3});
10156       SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
10157                                           {4, 5, 6, 7, 4, 5, 6, 7});
10158       if (Subtarget.hasXOP())
10159         return DAG.getBitcast(
10160             VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
10161                             IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
10162       // Permute Lo and Hi and then select based on index range.
10163       // This works as VPERMILPS only uses index bits[0:1] to permute elements.
10164       SDValue Res = DAG.getSelectCC(
10165           DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
10166           DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
10167           DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
10168           ISD::CondCode::SETGT);
10169       return DAG.getBitcast(VT, Res);
10170     }
10171     break;
10172   case MVT::v4i64:
10173   case MVT::v4f64:
10174     if (Subtarget.hasAVX512()) {
10175       if (!Subtarget.hasVLX()) {
10176         MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
10177         SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
10178                                 SDLoc(SrcVec));
10179         IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
10180                                     DAG, SDLoc(IndicesVec));
10181         SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
10182                                             DAG, Subtarget);
10183         return extract256BitVector(Res, 0, DAG, DL);
10184       }
10185       Opcode = X86ISD::VPERMV;
10186     } else if (Subtarget.hasAVX()) {
10187       SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
10188       SDValue LoLo =
10189           DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
10190       SDValue HiHi =
10191           DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
10192       // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
10193       IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
10194       if (Subtarget.hasXOP())
10195         return DAG.getBitcast(
10196             VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
10197                             IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
10198       // Permute Lo and Hi and then select based on index range.
10199       // This works as VPERMILPD only uses index bit[1] to permute elements.
10200       SDValue Res = DAG.getSelectCC(
10201           DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
10202           DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
10203           DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
10204           ISD::CondCode::SETGT);
10205       return DAG.getBitcast(VT, Res);
10206     }
10207     break;
10208   case MVT::v64i8:
10209     if (Subtarget.hasVBMI())
10210       Opcode = X86ISD::VPERMV;
10211     break;
10212   case MVT::v32i16:
10213     if (Subtarget.hasBWI())
10214       Opcode = X86ISD::VPERMV;
10215     break;
10216   case MVT::v16f32:
10217   case MVT::v16i32:
10218   case MVT::v8f64:
10219   case MVT::v8i64:
10220     if (Subtarget.hasAVX512())
10221       Opcode = X86ISD::VPERMV;
10222     break;
10223   }
10224   if (!Opcode)
10225     return SDValue();
10226 
10227   assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
10228          (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
10229          "Illegal variable permute shuffle type");
10230 
10231   uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
10232   if (Scale > 1)
10233     IndicesVec = ScaleIndices(IndicesVec, Scale);
10234 
10235   EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
10236   IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
10237 
10238   SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
10239   SDValue Res = Opcode == X86ISD::VPERMV
10240                     ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
10241                     : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
10242   return DAG.getBitcast(VT, Res);
10243 }
10244 
10245 // Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
10246 // reasoned to be a permutation of a vector by indices in a non-constant vector.
10247 // (build_vector (extract_elt V, (extract_elt I, 0)),
10248 //               (extract_elt V, (extract_elt I, 1)),
10249 //                    ...
10250 // ->
10251 // (vpermv I, V)
10252 //
10253 // TODO: Handle undefs
10254 // TODO: Utilize pshufb and zero mask blending to support more efficient
10255 // construction of vectors with constant-0 elements.
10256 static SDValue
10257 LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
10258                                    const X86Subtarget &Subtarget) {
10259   SDValue SrcVec, IndicesVec;
10260   // Check for a match of the permute source vector and permute index elements.
10261   // This is done by checking that the i-th build_vector operand is of the form:
10262   // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
10263   for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
10264     SDValue Op = V.getOperand(Idx);
10265     if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
10266       return SDValue();
10267 
10268     // If this is the first extract encountered in V, set the source vector,
10269     // otherwise verify the extract is from the previously defined source
10270     // vector.
10271     if (!SrcVec)
10272       SrcVec = Op.getOperand(0);
10273     else if (SrcVec != Op.getOperand(0))
10274       return SDValue();
10275     SDValue ExtractedIndex = Op->getOperand(1);
10276     // Peek through extends.
10277     if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
10278         ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
10279       ExtractedIndex = ExtractedIndex.getOperand(0);
10280     if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
10281       return SDValue();
10282 
10283     // If this is the first extract from the index vector candidate, set the
10284     // indices vector, otherwise verify the extract is from the previously
10285     // defined indices vector.
10286     if (!IndicesVec)
10287       IndicesVec = ExtractedIndex.getOperand(0);
10288     else if (IndicesVec != ExtractedIndex.getOperand(0))
10289       return SDValue();
10290 
10291     auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
10292     if (!PermIdx || PermIdx->getAPIntValue() != Idx)
10293       return SDValue();
10294   }
10295 
10296   SDLoc DL(V);
10297   MVT VT = V.getSimpleValueType();
10298   return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
10299 }
10300 
10301 SDValue
10302 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
10303   SDLoc dl(Op);
10304 
10305   MVT VT = Op.getSimpleValueType();
10306   MVT EltVT = VT.getVectorElementType();
10307   unsigned NumElems = Op.getNumOperands();
10308 
10309   // Generate vectors for predicate vectors.
10310   if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
10311     return LowerBUILD_VECTORvXi1(Op, DAG, Subtarget);
10312 
10313   if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
10314     return VectorConstant;
10315 
10316   unsigned EVTBits = EltVT.getSizeInBits();
10317   APInt UndefMask = APInt::getNullValue(NumElems);
10318   APInt ZeroMask = APInt::getNullValue(NumElems);
10319   APInt NonZeroMask = APInt::getNullValue(NumElems);
10320   bool IsAllConstants = true;
10321   SmallSet<SDValue, 8> Values;
10322   unsigned NumConstants = NumElems;
10323   for (unsigned i = 0; i < NumElems; ++i) {
10324     SDValue Elt = Op.getOperand(i);
10325     if (Elt.isUndef()) {
10326       UndefMask.setBit(i);
10327       continue;
10328     }
10329     Values.insert(Elt);
10330     if (!isa<ConstantSDNode>(Elt) && !isa<ConstantFPSDNode>(Elt)) {
10331       IsAllConstants = false;
10332       NumConstants--;
10333     }
10334     if (X86::isZeroNode(Elt)) {
10335       ZeroMask.setBit(i);
10336     } else {
10337       NonZeroMask.setBit(i);
10338     }
10339   }
10340 
10341   // All undef vector. Return an UNDEF. All zero vectors were handled above.
10342   if (NonZeroMask == 0) {
10343     assert(UndefMask.isAllOnesValue() && "Fully undef mask expected");
10344     return DAG.getUNDEF(VT);
10345   }
10346 
10347   BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
10348 
10349   // If the upper elts of a ymm/zmm are undef/zero then we might be better off
10350   // lowering to a smaller build vector and padding with undef/zero.
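  // For example (illustrative), a v16i32 build_vector whose upper eight
  // elements are all zero or undef can be built as a v8i32 build_vector and
  // then widened (with zero or undef padding) back to v16i32.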
10351   if ((VT.is256BitVector() || VT.is512BitVector()) &&
10352       !isFoldableUseOfShuffle(BV)) {
10353     unsigned UpperElems = NumElems / 2;
10354     APInt UndefOrZeroMask = UndefMask | ZeroMask;
10355     unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.countLeadingOnes();
10356     if (NumUpperUndefsOrZeros >= UpperElems) {
10357       if (VT.is512BitVector() &&
10358           NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4)))
10359         UpperElems = NumElems - (NumElems / 4);
10360       bool UndefUpper = UndefMask.countLeadingOnes() >= UpperElems;
10361       MVT LowerVT = MVT::getVectorVT(EltVT, NumElems - UpperElems);
10362       SDValue NewBV =
10363           DAG.getBuildVector(LowerVT, dl, Op->ops().drop_back(UpperElems));
10364       return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl);
10365     }
10366   }
10367 
10368   if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
10369     return AddSub;
10370   if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
10371     return HorizontalOp;
10372   if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
10373     return Broadcast;
10374   if (SDValue BitOp = lowerBuildVectorToBitOp(BV, Subtarget, DAG))
10375     return BitOp;
10376 
10377   unsigned NumZero = ZeroMask.countPopulation();
10378   unsigned NumNonZero = NonZeroMask.countPopulation();
10379 
10380   // If we are inserting one variable into a vector of non-zero constants, try
10381   // to avoid loading each constant element as a scalar. Load the constants as a
10382   // vector and then insert the variable scalar element. If insertion is not
10383   // supported, fall back to a shuffle to get the scalar blended with the
10384   // constants. Insertion into a zero vector is handled as a special-case
10385   // somewhere below here.
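  // For example (illustrative, with %x the only variable element),
  //   <4 x float> <1.0, 2.0, %x, 4.0>
  // becomes a constant-pool load of <1.0, 2.0, undef, 4.0> followed by an
  // insert of %x at index 2 (or, for insertion above the low 128 bits of a
  // wider vector, a shuffle blend).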
10386   if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
10387       (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
10388        isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
10389     // Create an all-constant vector. The variable element in the old
10390     // build vector is replaced by undef in the constant vector. Save the
10391     // variable scalar element and its index for use in the insertelement.
10392     LLVMContext &Context = *DAG.getContext();
10393     Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
10394     SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
10395     SDValue VarElt;
10396     SDValue InsIndex;
10397     for (unsigned i = 0; i != NumElems; ++i) {
10398       SDValue Elt = Op.getOperand(i);
10399       if (auto *C = dyn_cast<ConstantSDNode>(Elt))
10400         ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
10401       else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
10402         ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
10403       else if (!Elt.isUndef()) {
10404         assert(!VarElt.getNode() && !InsIndex.getNode() &&
10405                "Expected one variable element in this vector");
10406         VarElt = Elt;
10407         InsIndex = DAG.getVectorIdxConstant(i, dl);
10408       }
10409     }
10410     Constant *CV = ConstantVector::get(ConstVecOps);
10411     SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
10412 
10413     // The constants we just created may not be legal (eg, floating point). We
10414     // must lower the vector right here because we can not guarantee that we'll
10415     // legalize it before loading it. This is also why we could not just create
10416     // a new build vector here. If the build vector contains illegal constants,
10417     // it could get split back up into a series of insert elements.
10418     // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
10419     SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
10420     MachineFunction &MF = DAG.getMachineFunction();
10421     MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
10422     SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
10423     unsigned InsertC = cast<ConstantSDNode>(InsIndex)->getZExtValue();
10424     unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
10425     if (InsertC < NumEltsInLow128Bits)
10426       return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
10427 
10428     // There's no good way to insert into the high elements of a >128-bit
10429     // vector, so use shuffles to avoid an extract/insert sequence.
10430     assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
10431     assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
10432     SmallVector<int, 8> ShuffleMask;
10433     unsigned NumElts = VT.getVectorNumElements();
10434     for (unsigned i = 0; i != NumElts; ++i)
10435       ShuffleMask.push_back(i == InsertC ? NumElts : i);
10436     SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
10437     return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
10438   }
10439 
10440   // Special case for single non-zero, non-undef, element.
10441   if (NumNonZero == 1) {
10442     unsigned Idx = NonZeroMask.countTrailingZeros();
10443     SDValue Item = Op.getOperand(Idx);
10444 
10445     // If we have a constant or non-constant insertion into the low element of
10446     // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
10447     // the rest of the elements.  This will be matched as movd/movq/movss/movsd
10448     // depending on what the source datatype is.
10449     if (Idx == 0) {
10450       if (NumZero == 0)
10451         return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
10452 
10453       if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
10454           (EltVT == MVT::i64 && Subtarget.is64Bit())) {
10455         assert((VT.is128BitVector() || VT.is256BitVector() ||
10456                 VT.is512BitVector()) &&
10457                "Expected an SSE value type!");
10458         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
10459         // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
10460         return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
10461       }
10462 
10463       // We can't directly insert an i8 or i16 into a vector, so zero extend
10464       // it to i32 first.
10465       if (EltVT == MVT::i16 || EltVT == MVT::i8) {
10466         Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
10467         MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
10468         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
10469         Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
10470         return DAG.getBitcast(VT, Item);
10471       }
10472     }
10473 
10474     // Is it a vector logical left shift?
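    // For example (illustrative), v2i64 (build_vector 0, %x) can be produced
    // by shifting (scalar_to_vector %x) left by 64 bits within the 128-bit
    // register.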
10475     if (NumElems == 2 && Idx == 1 &&
10476         X86::isZeroNode(Op.getOperand(0)) &&
10477         !X86::isZeroNode(Op.getOperand(1))) {
10478       unsigned NumBits = VT.getSizeInBits();
10479       return getVShift(true, VT,
10480                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
10481                                    VT, Op.getOperand(1)),
10482                        NumBits/2, DAG, *this, dl);
10483     }
10484 
10485     if (IsAllConstants) // Otherwise, it's better to do a constpool load.
10486       return SDValue();
10487 
10488     // Otherwise, if this is a vector with i32 or f32 elements, and the element
10489     // is a non-constant being inserted into an element other than the low one,
10490     // we can't use a constant pool load.  Instead, use SCALAR_TO_VECTOR (aka
10491     // movd/movss) to move this into the low element, then shuffle it into
10492     // place.
10493     if (EVTBits == 32) {
10494       Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
10495       return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
10496     }
10497   }
10498 
10499   // Splat is obviously ok. Let legalizer expand it to a shuffle.
10500   if (Values.size() == 1) {
10501     if (EVTBits == 32) {
10502       // Instead of a shuffle like this:
10503       // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
10504       // Check if it's possible to issue this instead.
10505       // shuffle (vload ptr), undef, <1, 1, 1, 1>
10506       unsigned Idx = NonZeroMask.countTrailingZeros();
10507       SDValue Item = Op.getOperand(Idx);
10508       if (Op.getNode()->isOnlyUserOf(Item.getNode()))
10509         return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
10510     }
10511     return SDValue();
10512   }
10513 
10514   // A vector full of immediates; various special cases are already
10515   // handled, so this is best done with a single constant-pool load.
10516   if (IsAllConstants)
10517     return SDValue();
10518 
10519   if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget))
10520     return V;
10521 
10522   // See if we can use a vector load to get all of the elements.
10523   {
10524     SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
10525     if (SDValue LD =
10526             EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
10527       return LD;
10528   }
10529 
10530   // If this is a splat of pairs of 32-bit elements, we can use a narrower
10531   // build_vector and broadcast it.
10532   // TODO: We could probably generalize this more.
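  // For example (illustrative), v8f32 <a,b,a,b,a,b,a,b> is built as a v4f32
  // <a,b,undef,undef>, bitcast to v2f64, broadcast to v4f64, and bitcast back
  // to v8f32.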
10533   if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
10534     SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
10535                        DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
10536     auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
10537       // Make sure all the even/odd operands match.
10538       for (unsigned i = 2; i != NumElems; ++i)
10539         if (Ops[i % 2] != Op.getOperand(i))
10540           return false;
10541       return true;
10542     };
10543     if (CanSplat(Op, NumElems, Ops)) {
10544       MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
10545       MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
10546       // Create a new build vector and cast to v2i64/v2f64.
10547       SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
10548                                      DAG.getBuildVector(NarrowVT, dl, Ops));
10549       // Broadcast from v2i64/v2f64 and cast to final VT.
10550       MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems / 2);
10551       return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
10552                                             NewBV));
10553     }
10554   }
10555 
10556   // For AVX-length vectors, build the individual 128-bit pieces and use
10557   // shuffles to put them in place.
10558   if (VT.getSizeInBits() > 128) {
10559     MVT HVT = MVT::getVectorVT(EltVT, NumElems / 2);
10560 
10561     // Build both the lower and upper subvector.
10562     SDValue Lower =
10563         DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
10564     SDValue Upper = DAG.getBuildVector(
10565         HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));
10566 
10567     // Recreate the wider vector with the lower and upper part.
10568     return concatSubVectors(Lower, Upper, DAG, dl);
10569   }
10570 
10571   // Let legalizer expand 2-wide build_vectors.
10572   if (EVTBits == 64) {
10573     if (NumNonZero == 1) {
10574       // One half is zero or undef.
10575       unsigned Idx = NonZeroMask.countTrailingZeros();
10576       SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
10577                                Op.getOperand(Idx));
10578       return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
10579     }
10580     return SDValue();
10581   }
10582 
10583   // If element VT is < 32 bits, convert it to inserts into a zero vector.
10584   if (EVTBits == 8 && NumElems == 16)
10585     if (SDValue V = LowerBuildVectorv16i8(Op, NonZeroMask, NumNonZero, NumZero,
10586                                           DAG, Subtarget))
10587       return V;
10588 
10589   if (EVTBits == 16 && NumElems == 8)
10590     if (SDValue V = LowerBuildVectorv8i16(Op, NonZeroMask, NumNonZero, NumZero,
10591                                           DAG, Subtarget))
10592       return V;
10593 
10594   // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
10595   if (EVTBits == 32 && NumElems == 4)
10596     if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
10597       return V;
10598 
10599   // If element VT is == 32 bits, turn it into a number of shuffles.
10600   if (NumElems == 4 && NumZero > 0) {
10601     SmallVector<SDValue, 8> Ops(NumElems);
10602     for (unsigned i = 0; i < 4; ++i) {
10603       bool isZero = !NonZeroMask[i];
10604       if (isZero)
10605         Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
10606       else
10607         Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
10608     }
10609 
10610     for (unsigned i = 0; i < 2; ++i) {
10611       switch (NonZeroMask.extractBitsAsZExtValue(2, i * 2)) {
10612         default: llvm_unreachable("Unexpected NonZero count");
10613         case 0:
10614           Ops[i] = Ops[i*2];  // Must be a zero vector.
10615           break;
10616         case 1:
10617           Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
10618           break;
10619         case 2:
10620           Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
10621           break;
10622         case 3:
10623           Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
10624           break;
10625       }
10626     }
10627 
10628     bool Reverse1 = NonZeroMask.extractBitsAsZExtValue(2, 0) == 2;
10629     bool Reverse2 = NonZeroMask.extractBitsAsZExtValue(2, 2) == 2;
10630     int MaskVec[] = {
10631       Reverse1 ? 1 : 0,
10632       Reverse1 ? 0 : 1,
10633       static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
10634       static_cast<int>(Reverse2 ? NumElems   : NumElems+1)
10635     };
10636     return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
10637   }
10638 
10639   assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
10640 
10641   // Check for a build vector that is mostly a shuffle plus a few insertions.
10642   if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
10643     return Sh;
10644 
10645   // For SSE 4.1, use insertps to put the high elements into the low element.
10646   if (Subtarget.hasSSE41()) {
10647     SDValue Result;
10648     if (!Op.getOperand(0).isUndef())
10649       Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
10650     else
10651       Result = DAG.getUNDEF(VT);
10652 
10653     for (unsigned i = 1; i < NumElems; ++i) {
10654       if (Op.getOperand(i).isUndef()) continue;
10655       Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
10656                            Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
10657     }
10658     return Result;
10659   }
10660 
10661   // Otherwise, expand into a number of unpckl*, start by extending each of
10662   // our (non-undef) elements to the full vector width with the element in the
10663   // bottom slot of the vector (which generates no code for SSE).
10664   SmallVector<SDValue, 8> Ops(NumElems);
10665   for (unsigned i = 0; i < NumElems; ++i) {
10666     if (!Op.getOperand(i).isUndef())
10667       Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
10668     else
10669       Ops[i] = DAG.getUNDEF(VT);
10670   }
10671 
10672   // Next, we iteratively mix elements, e.g. for v4f32:
10673   //   Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
10674   //         : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
10675   //   Step 2: unpcklpd X, Y ==>    <3, 2, 1, 0>
10676   for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
10677     // Generate scaled UNPCKL shuffle mask.
10678     SmallVector<int, 16> Mask;
10679     for(unsigned i = 0; i != Scale; ++i)
10680       Mask.push_back(i);
10681     for (unsigned i = 0; i != Scale; ++i)
10682       Mask.push_back(NumElems+i);
10683     Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
10684 
10685     for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
10686       Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
10687   }
10688   return Ops[0];
10689 }
10690 
10691 // 256-bit AVX can use the vinsertf128 instruction
10692 // to create 256-bit vectors from two other 128-bit ones.
10693 // TODO: Detect subvector broadcast here instead of DAG combine?
10694 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
10695                                       const X86Subtarget &Subtarget) {
10696   SDLoc dl(Op);
10697   MVT ResVT = Op.getSimpleValueType();
10698 
10699   assert((ResVT.is256BitVector() ||
10700           ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
10701 
10702   unsigned NumOperands = Op.getNumOperands();
10703   unsigned NumZero = 0;
10704   unsigned NumNonZero = 0;
10705   unsigned NonZeros = 0;
10706   for (unsigned i = 0; i != NumOperands; ++i) {
10707     SDValue SubVec = Op.getOperand(i);
10708     if (SubVec.isUndef())
10709       continue;
10710     if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
10711       ++NumZero;
10712     else {
10713       assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
10714       NonZeros |= 1 << i;
10715       ++NumNonZero;
10716     }
10717   }
10718 
10719   // If we have more than 2 non-zeros, build each half separately.
10720   if (NumNonZero > 2) {
10721     MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
10722     ArrayRef<SDUse> Ops = Op->ops();
10723     SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
10724                              Ops.slice(0, NumOperands/2));
10725     SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
10726                              Ops.slice(NumOperands/2));
10727     return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
10728   }
10729 
10730   // Otherwise, build it up through insert_subvectors.
10731   SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
10732                         : DAG.getUNDEF(ResVT);
10733 
10734   MVT SubVT = Op.getOperand(0).getSimpleValueType();
10735   unsigned NumSubElems = SubVT.getVectorNumElements();
10736   for (unsigned i = 0; i != NumOperands; ++i) {
10737     if ((NonZeros & (1 << i)) == 0)
10738       continue;
10739 
10740     Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec,
10741                       Op.getOperand(i),
10742                       DAG.getIntPtrConstant(i * NumSubElems, dl));
10743   }
10744 
10745   return Vec;
10746 }
10747 
10748 // Lower a CONCAT_VECTORS of vXi1 operands. This is often a type promotion
10749 // (by concatenating i1 zeros) of the result of a node that already zeros
10750 // all upper bits of a k-register.
10751 // TODO: Merge this with LowerAVXCONCAT_VECTORS?
10752 static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
10753                                        const X86Subtarget &Subtarget,
10754                                        SelectionDAG &DAG) {
10755   SDLoc dl(Op);
10756   MVT ResVT = Op.getSimpleValueType();
10757   unsigned NumOperands = Op.getNumOperands();
10758 
10759   assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
10760          "Unexpected number of operands in CONCAT_VECTORS");
10761 
10762   uint64_t Zeros = 0;
10763   uint64_t NonZeros = 0;
10764   for (unsigned i = 0; i != NumOperands; ++i) {
10765     SDValue SubVec = Op.getOperand(i);
10766     if (SubVec.isUndef())
10767       continue;
10768     assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
10769     if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
10770       Zeros |= (uint64_t)1 << i;
10771     else
10772       NonZeros |= (uint64_t)1 << i;
10773   }
10774 
10775   unsigned NumElems = ResVT.getVectorNumElements();
10776 
10777   // If we are inserting a non-zero vector and there are zeros in the LSBs and
10778   // undefs in the MSBs, we need to emit a KSHIFTL. The generic lowering to
10779   // insert_subvector will give us two kshifts.
10780   if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
10781       Log2_64(NonZeros) != NumOperands - 1) {
10782     MVT ShiftVT = ResVT;
10783     if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
10784       ShiftVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
10785     unsigned Idx = Log2_64(NonZeros);
10786     SDValue SubVec = Op.getOperand(Idx);
10787     unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
10788     SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ShiftVT,
10789                          DAG.getUNDEF(ShiftVT), SubVec,
10790                          DAG.getIntPtrConstant(0, dl));
10791     Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, SubVec,
10792                      DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8));
10793     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op,
10794                        DAG.getIntPtrConstant(0, dl));
10795   }
10796 
10797   // If there are zero or one non-zeros we can handle this very simply.
10798   if (NonZeros == 0 || isPowerOf2_64(NonZeros)) {
10799     SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT);
10800     if (!NonZeros)
10801       return Vec;
10802     unsigned Idx = Log2_64(NonZeros);
10803     SDValue SubVec = Op.getOperand(Idx);
10804     unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
10805     return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
10806                        DAG.getIntPtrConstant(Idx * SubVecNumElts, dl));
10807   }
10808 
10809   if (NumOperands > 2) {
10810     MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
10811     ArrayRef<SDUse> Ops = Op->ops();
10812     SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
10813                              Ops.slice(0, NumOperands/2));
10814     SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
10815                              Ops.slice(NumOperands/2));
10816     return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
10817   }
10818 
10819   assert(countPopulation(NonZeros) == 2 && "Simple cases not handled?");
10820 
10821   if (ResVT.getVectorNumElements() >= 16)
10822     return Op; // The operation is legal with KUNPCK
10823 
10824   SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT,
10825                             DAG.getUNDEF(ResVT), Op.getOperand(0),
10826                             DAG.getIntPtrConstant(0, dl));
10827   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
10828                      DAG.getIntPtrConstant(NumElems/2, dl));
10829 }
10830 
10831 static SDValue LowerCONCAT_VECTORS(SDValue Op,
10832                                    const X86Subtarget &Subtarget,
10833                                    SelectionDAG &DAG) {
10834   MVT VT = Op.getSimpleValueType();
10835   if (VT.getVectorElementType() == MVT::i1)
10836     return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
10837 
10838   assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
10839          (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
10840           Op.getNumOperands() == 4)));
10841 
10842   // AVX can use the vinsertf128 instruction to create 256-bit vectors
10843   // from two other 128-bit ones.
10844 
10845   // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
10846   return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget);
10847 }
10848 
10849 //===----------------------------------------------------------------------===//
10850 // Vector shuffle lowering
10851 //
10852 // This is an experimental code path for lowering vector shuffles on x86. It is
10853 // designed to handle arbitrary vector shuffles and blends, gracefully
10854 // degrading performance as necessary. It works hard to recognize idiomatic
10855 // shuffles and lower them to optimal instruction patterns without leaving
10856 // a framework that allows reasonably efficient handling of all vector shuffle
10857 // patterns.
10858 //===----------------------------------------------------------------------===//
10859 
10860 /// Tiny helper function to identify a no-op mask.
10861 ///
10862 /// This is a somewhat boring predicate function. It checks whether the mask
10863 /// array input, which is assumed to be a single-input shuffle mask of the kind
10864 /// used by the X86 shuffle instructions (not a fully general
10865 /// ShuffleVectorSDNode mask), requires any shuffles to occur. Both undef and an
10866 /// in-place shuffle are 'no-op's.
10867 static bool isNoopShuffleMask(ArrayRef<int> Mask) {
10868   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
10869     assert(Mask[i] >= -1 && "Out of bound mask element!");
10870     if (Mask[i] >= 0 && Mask[i] != i)
10871       return false;
10872   }
10873   return true;
10874 }
10875 
10876 /// Test whether there are elements crossing LaneSizeInBits lanes in this
10877 /// shuffle mask.
10878 ///
10879 /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
10880 /// and we routinely test for these.
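///
/// For example, with v8f32 (32-bit elements, 128-bit lanes) the mask
/// <4,5,6,7,0,1,2,3> crosses lanes, while <1,0,3,2,5,4,7,6> stays in-lane.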
10881 static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits,
10882                                       unsigned ScalarSizeInBits,
10883                                       ArrayRef<int> Mask) {
10884   assert(LaneSizeInBits && ScalarSizeInBits &&
10885          (LaneSizeInBits % ScalarSizeInBits) == 0 &&
10886          "Illegal shuffle lane size");
10887   int LaneSize = LaneSizeInBits / ScalarSizeInBits;
10888   int Size = Mask.size();
10889   for (int i = 0; i < Size; ++i)
10890     if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
10891       return true;
10892   return false;
10893 }
10894 
10895 /// Test whether there are elements crossing 128-bit lanes in this
10896 /// shuffle mask.
10897 static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
10898   return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);
10899 }
10900 
10901 /// Test whether elements in each LaneSizeInBits lane in this shuffle mask come
10902 /// from multiple lanes - this is different to isLaneCrossingShuffleMask to
10903 /// better support 'repeated mask + lane permute' style shuffles.
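///
/// For example, with v8f32 the mask <0,6,-1,-1,-1,-1,-1,-1> is multi-lane: the
/// first 128-bit lane of the result pulls from source lanes 0 and 1. By
/// contrast, <4,5,6,7,0,1,2,3> crosses lanes but each result lane reads from a
/// single source lane, so it is not multi-lane.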
10904 static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits,
10905                                    unsigned ScalarSizeInBits,
10906                                    ArrayRef<int> Mask) {
10907   assert(LaneSizeInBits && ScalarSizeInBits &&
10908          (LaneSizeInBits % ScalarSizeInBits) == 0 &&
10909          "Illegal shuffle lane size");
10910   int NumElts = Mask.size();
10911   int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits;
10912   int NumLanes = NumElts / NumEltsPerLane;
10913   if (NumLanes > 1) {
10914     for (int i = 0; i != NumLanes; ++i) {
10915       int SrcLane = -1;
10916       for (int j = 0; j != NumEltsPerLane; ++j) {
10917         int M = Mask[(i * NumEltsPerLane) + j];
10918         if (M < 0)
10919           continue;
10920         int Lane = (M % NumElts) / NumEltsPerLane;
10921         if (SrcLane >= 0 && SrcLane != Lane)
10922           return true;
10923         SrcLane = Lane;
10924       }
10925     }
10926   }
10927   return false;
10928 }
10929 
10930 /// Test whether a shuffle mask is equivalent within each sub-lane.
10931 ///
10932 /// This checks a shuffle mask to see if it is performing the same
10933 /// lane-relative shuffle in each sub-lane. This trivially implies
10934 /// that it is also not lane-crossing. It may however involve a blend from the
10935 /// same lane of a second vector.
10936 ///
10937 /// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
10938 /// non-trivial to compute in the face of undef lanes. The representation is
10939 /// suitable for use with existing 128-bit shuffles as entries from the second
10940 /// vector have been remapped to [LaneSize, 2*LaneSize).
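///
/// For example, with v8f32 and 128-bit lanes, the mask <0,1,8,9,4,5,12,13>
/// yields the repeated mask <0,1,4,5>: lane-relative elements 0,1 come from V1
/// and elements 2,3 come from the matching lane of V2 (remapped to 4,5).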
10941 static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
10942                                   ArrayRef<int> Mask,
10943                                   SmallVectorImpl<int> &RepeatedMask) {
10944   auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
10945   RepeatedMask.assign(LaneSize, -1);
10946   int Size = Mask.size();
10947   for (int i = 0; i < Size; ++i) {
10948     assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
10949     if (Mask[i] < 0)
10950       continue;
10951     if ((Mask[i] % Size) / LaneSize != i / LaneSize)
10952       // This entry crosses lanes, so there is no way to model this shuffle.
10953       return false;
10954 
10955     // Ok, handle the in-lane shuffles by detecting if and when they repeat.
10956     // Adjust second vector indices to start at LaneSize instead of Size.
10957     int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
10958                                 : Mask[i] % LaneSize + LaneSize;
10959     if (RepeatedMask[i % LaneSize] < 0)
10960       // This is the first non-undef entry in this slot of a 128-bit lane.
10961       RepeatedMask[i % LaneSize] = LocalM;
10962     else if (RepeatedMask[i % LaneSize] != LocalM)
10963       // Found a mismatch with the repeated mask.
10964       return false;
10965   }
10966   return true;
10967 }
10968 
10969 /// Test whether a shuffle mask is equivalent within each 128-bit lane.
10970 static bool
10971 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
10972                                 SmallVectorImpl<int> &RepeatedMask) {
10973   return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
10974 }
10975 
10976 static bool
10977 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
10978   SmallVector<int, 32> RepeatedMask;
10979   return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
10980 }
10981 
10982 /// Test whether a shuffle mask is equivalent within each 256-bit lane.
10983 static bool
10984 is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
10985                                 SmallVectorImpl<int> &RepeatedMask) {
10986   return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
10987 }
10988 
10989 /// Test whether a target shuffle mask is equivalent within each sub-lane.
10990 /// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
10991 static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits,
10992                                         unsigned EltSizeInBits,
10993                                         ArrayRef<int> Mask,
10994                                         SmallVectorImpl<int> &RepeatedMask) {
10995   int LaneSize = LaneSizeInBits / EltSizeInBits;
10996   RepeatedMask.assign(LaneSize, SM_SentinelUndef);
10997   int Size = Mask.size();
10998   for (int i = 0; i < Size; ++i) {
10999     assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
11000     if (Mask[i] == SM_SentinelUndef)
11001       continue;
11002     if (Mask[i] == SM_SentinelZero) {
11003       if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
11004         return false;
11005       RepeatedMask[i % LaneSize] = SM_SentinelZero;
11006       continue;
11007     }
11008     if ((Mask[i] % Size) / LaneSize != i / LaneSize)
11009       // This entry crosses lanes, so there is no way to model this shuffle.
11010       return false;
11011 
11012     // Handle the in-lane shuffles by detecting if and when they repeat. Adjust
11013     // later vector indices to start at multiples of LaneSize instead of Size.
11014     int LaneM = Mask[i] / Size;
11015     int LocalM = (Mask[i] % LaneSize) + (LaneM * LaneSize);
11016     if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
11017       // This is the first non-undef entry in this slot of a 128-bit lane.
11018       RepeatedMask[i % LaneSize] = LocalM;
11019     else if (RepeatedMask[i % LaneSize] != LocalM)
11020       // Found a mismatch with the repeated mask.
11021       return false;
11022   }
11023   return true;
11024 }
11025 
11026 /// Test whether a target shuffle mask is equivalent within each sub-lane.
11027 /// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
11028 static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
11029                                         ArrayRef<int> Mask,
11030                                         SmallVectorImpl<int> &RepeatedMask) {
11031   return isRepeatedTargetShuffleMask(LaneSizeInBits, VT.getScalarSizeInBits(),
11032                                      Mask, RepeatedMask);
11033 }
11034 
11035 /// Checks whether the vector elements referenced by two shuffle masks are
11036 /// equivalent.
11037 static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
11038                                 int Idx, int ExpectedIdx) {
11039   assert(0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx &&
11040          ExpectedIdx < MaskSize && "Out of range element index");
11041   if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode())
11042     return false;
11043 
11044   switch (Op.getOpcode()) {
11045   case ISD::BUILD_VECTOR:
11046     // If the values are build vectors, we can look through them to find
11047     // equivalent inputs that make the shuffles equivalent.
11048     // TODO: Handle MaskSize != Op.getNumOperands()?
11049     if (MaskSize == (int)Op.getNumOperands() &&
11050         MaskSize == (int)ExpectedOp.getNumOperands())
11051       return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
11052     break;
11053   case X86ISD::VBROADCAST:
11054   case X86ISD::VBROADCAST_LOAD:
11055     // TODO: Handle MaskSize != Op.getValueType().getVectorNumElements()?
11056     return (Op == ExpectedOp &&
11057             (int)Op.getValueType().getVectorNumElements() == MaskSize);
11058   case X86ISD::HADD:
11059   case X86ISD::HSUB:
11060   case X86ISD::FHADD:
11061   case X86ISD::FHSUB:
11062   case X86ISD::PACKSS:
11063   case X86ISD::PACKUS:
11064     // HOP(X,X) can refer to the elt from the lower/upper half of a lane.
11065     // TODO: Handle MaskSize != NumElts?
11066     // TODO: Handle HOP(X,Y) vs HOP(Y,X) equivalence cases.
11067     if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) {
11068       MVT VT = Op.getSimpleValueType();
11069       int NumElts = VT.getVectorNumElements();
11070       if (MaskSize == NumElts) {
11071         int NumLanes = VT.getSizeInBits() / 128;
11072         int NumEltsPerLane = NumElts / NumLanes;
11073         int NumHalfEltsPerLane = NumEltsPerLane / 2;
11074         bool SameLane =
11075             (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane);
11076         bool SameElt =
11077             (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane);
11078         return SameLane && SameElt;
11079       }
11080     }
11081     break;
11082   }
11083 
11084   return false;
11085 }
11086 
11087 /// Checks whether a shuffle mask is equivalent to an explicit list of
11088 /// arguments.
11089 ///
11090 /// This is a fast way to test a shuffle mask against a fixed pattern:
11091 ///
11092 ///   if (isShuffleEquivalent(Mask, {3, 2, 1, 0})) { ... }
11093 ///
11094 /// It returns true if the mask is exactly as wide as ExpectedMask, and each
11095 /// element of the mask is either -1 (signifying undef) or the value given in
11096 /// ExpectedMask.
11097 static bool isShuffleEquivalent(ArrayRef<int> Mask, ArrayRef<int> ExpectedMask,
11098                                 SDValue V1 = SDValue(),
11099                                 SDValue V2 = SDValue()) {
11100   int Size = Mask.size();
11101   if (Size != (int)ExpectedMask.size())
11102     return false;
11103 
11104   for (int i = 0; i < Size; ++i) {
11105     assert(Mask[i] >= -1 && "Out of bound mask element!");
11106     int MaskIdx = Mask[i];
11107     int ExpectedIdx = ExpectedMask[i];
11108     if (0 <= MaskIdx && MaskIdx != ExpectedIdx) {
11109       SDValue MaskV = MaskIdx < Size ? V1 : V2;
11110       SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
11111       MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
11112       ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
11113       if (!IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
11114         return false;
11115     }
11116   }
11117   return true;
11118 }
11119 
11120 /// Checks whether a target shuffle mask is equivalent to an explicit pattern.
11121 ///
11122 /// The masks must be exactly the same width.
11123 ///
11124 /// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
11125 /// value in ExpectedMask is always accepted. Otherwise the indices must match.
11126 ///
11127 /// SM_SentinelZero is accepted as a valid negative index but must match in
11128 /// both.
11129 static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask,
11130                                       ArrayRef<int> ExpectedMask,
11131                                       SDValue V1 = SDValue(),
11132                                       SDValue V2 = SDValue()) {
11133   int Size = Mask.size();
11134   if (Size != (int)ExpectedMask.size())
11135     return false;
11136   assert(isUndefOrZeroOrInRange(ExpectedMask, 0, 2 * Size) &&
11137          "Illegal target shuffle mask");
11138 
11139   // Check for out-of-range target shuffle mask indices.
11140   if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))
11141     return false;
11142 
11143   // Don't use V1/V2 if they're not the same size as the shuffle mask type.
11144   if (V1 && V1.getValueSizeInBits() != VT.getSizeInBits())
11145     V1 = SDValue();
11146   if (V2 && V2.getValueSizeInBits() != VT.getSizeInBits())
11147     V2 = SDValue();
11148 
11149   for (int i = 0; i < Size; ++i) {
11150     int MaskIdx = Mask[i];
11151     int ExpectedIdx = ExpectedMask[i];
11152     if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx)
11153       continue;
11154     if (0 <= MaskIdx && 0 <= ExpectedIdx) {
11155       SDValue MaskV = MaskIdx < Size ? V1 : V2;
11156       SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
11157       MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
11158       ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
11159       if (IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
11160         continue;
11161     }
11162     // TODO - handle SM_Sentinel equivalences.
11163     return false;
11164   }
11165   return true;
11166 }
11167 
11168 // Attempt to create a shuffle mask from a VSELECT condition mask.
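// For example, a v4i32 VSELECT whose constant condition is <-1,0,-1,0> maps to
// the shuffle mask <0,5,2,7>: true lanes select from the first operand, while
// false (or undef) lanes select from the second.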
11169 static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask,
11170                                          SDValue Cond) {
11171   EVT CondVT = Cond.getValueType();
11172   unsigned EltSizeInBits = CondVT.getScalarSizeInBits();
11173   unsigned NumElts = CondVT.getVectorNumElements();
11174 
11175   APInt UndefElts;
11176   SmallVector<APInt, 32> EltBits;
11177   if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits,
11178                                      true, false))
11179     return false;
11180 
11181   Mask.resize(NumElts, SM_SentinelUndef);
11182 
11183   for (int i = 0; i != (int)NumElts; ++i) {
11184     Mask[i] = i;
11185     // Arbitrarily choose from the 2nd operand if the select condition element
11186     // is undef.
11187     // TODO: Can we do better by matching patterns such as even/odd?
11188     if (UndefElts[i] || EltBits[i].isNullValue())
11189       Mask[i] += NumElts;
11190   }
11191 
11192   return true;
11193 }
11194 
11195 // Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
11196 // instructions.
11197 static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
11198   if (VT != MVT::v8i32 && VT != MVT::v8f32)
11199     return false;
11200 
11201   SmallVector<int, 8> Unpcklwd;
11202   createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
11203                           /* Unary = */ false);
11204   SmallVector<int, 8> Unpckhwd;
11205   createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
11206                           /* Unary = */ false);
11207   bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd) ||
11208                          isTargetShuffleEquivalent(VT, Mask, Unpckhwd));
11209   return IsUnpackwdMask;
11210 }
11211 
11212 static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask) {
11213   // Create 128-bit vector type based on mask size.
11214   MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
11215   MVT VT = MVT::getVectorVT(EltVT, Mask.size());
11216 
11217   // We can't assume a canonical shuffle mask, so try the commuted version too.
11218   SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
11219   ShuffleVectorSDNode::commuteMask(CommutedMask);
11220 
11221   // Match any of unary/binary or low/high.
11222   for (unsigned i = 0; i != 4; ++i) {
11223     SmallVector<int, 16> UnpackMask;
11224     createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
11225     if (isTargetShuffleEquivalent(VT, Mask, UnpackMask) ||
11226         isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask))
11227       return true;
11228   }
11229   return false;
11230 }
11231 
11232 /// Return true if a shuffle mask chooses elements identically in its top and
11233 /// bottom halves. For example, any splat mask has the same top and bottom
11234 /// halves. If an element is undefined in only one half of the mask, the halves
11235 /// are not considered identical.
11236 static bool hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) {
11237   assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
11238   unsigned HalfSize = Mask.size() / 2;
11239   for (unsigned i = 0; i != HalfSize; ++i) {
11240     if (Mask[i] != Mask[i + HalfSize])
11241       return false;
11242   }
11243   return true;
11244 }
11245 
11246 /// Get a 4-lane 8-bit shuffle immediate for a mask.
11247 ///
11248 /// This helper function produces an 8-bit shuffle immediate corresponding to
11249 /// the ubiquitous shuffle encoding scheme used in x86 instructions for
11250 /// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
11251 /// example.
11252 ///
11253 /// NB: We rely heavily on "undef" masks preserving the input lane.
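///
/// For example, the mask <1,0,3,2> encodes as
/// (2 << 6) | (3 << 4) | (0 << 2) | 1 == 0xB1, while the single-element mask
/// <2,-1,-1,-1> is splatted to 0xAA (2 in every 2-bit field).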
11254 static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
11255   assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
11256   assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
11257   assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
11258   assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
11259   assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
11260 
11261   // If the mask only uses one non-undef element, then fully 'splat' it to
11262   // improve later broadcast matching.
11263   int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
11264   assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask");
11265 
11266   int FirstElt = Mask[FirstIndex];
11267   if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }))
11268     return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;
11269 
11270   unsigned Imm = 0;
11271   Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
11272   Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
11273   Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
11274   Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
11275   return Imm;
11276 }
11277 
11278 static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
11279                                           SelectionDAG &DAG) {
11280   return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
11281 }
11282 
11283 // The shuffle result has the form:
11284 // 0*a[0], 0*a[1], ..., 0*a[n], n >= 0, where the a[] elements are in ascending order.
11285 // Each element of Zeroable corresponds to a particular element of Mask,
11286 // as described in the computeZeroableShuffleElements function.
11287 //
11288 // The function looks for a sub-mask whose non-zero elements are in
11289 // increasing order. If such a sub-mask exists, the function returns true.
11290 static bool isNonZeroElementsInOrder(const APInt &Zeroable,
11291                                      ArrayRef<int> Mask, const EVT &VectorType,
11292                                      bool &IsZeroSideLeft) {
11293   int NextElement = -1;
11294   // Check if the Mask's nonzero elements are in increasing order.
11295   for (int i = 0, e = Mask.size(); i < e; i++) {
11296     // Bail out on undef mask elements; zeroable elements are skipped below.
11297     assert(Mask[i] >= -1 && "Out of bound mask element!");
11298     if (Mask[i] < 0)
11299       return false;
11300     if (Zeroable[i])
11301       continue;
11302     // Find the lowest non zero element
11303     if (NextElement < 0) {
11304       NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
11305       IsZeroSideLeft = NextElement != 0;
11306     }
11307     // Exit if the mask's non zero elements are not in increasing order.
11308     if (NextElement != Mask[i])
11309       return false;
11310     NextElement++;
11311   }
11312   return true;
11313 }
11314 
11315 /// Try to lower a shuffle with a single PSHUFB of V1 or V2.
11316 static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
11317                                       ArrayRef<int> Mask, SDValue V1,
11318                                       SDValue V2, const APInt &Zeroable,
11319                                       const X86Subtarget &Subtarget,
11320                                       SelectionDAG &DAG) {
11321   int Size = Mask.size();
11322   int LaneSize = 128 / VT.getScalarSizeInBits();
11323   const int NumBytes = VT.getSizeInBits() / 8;
11324   const int NumEltBytes = VT.getScalarSizeInBits() / 8;
11325 
11326   assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
11327          (Subtarget.hasAVX2() && VT.is256BitVector()) ||
11328          (Subtarget.hasBWI() && VT.is512BitVector()));
11329 
11330   SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
11331   // Sign bit set in i8 mask means zero element.
11332   SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
11333 
11334   SDValue V;
11335   for (int i = 0; i < NumBytes; ++i) {
11336     int M = Mask[i / NumEltBytes];
11337     if (M < 0) {
11338       PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
11339       continue;
11340     }
11341     if (Zeroable[i / NumEltBytes]) {
11342       PSHUFBMask[i] = ZeroMask;
11343       continue;
11344     }
11345 
11346     // We can only use a single input of V1 or V2.
11347     SDValue SrcV = (M >= Size ? V2 : V1);
11348     if (V && V != SrcV)
11349       return SDValue();
11350     V = SrcV;
11351     M %= Size;
11352 
11353     // PSHUFB can't cross lanes, ensure this doesn't happen.
11354     if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
11355       return SDValue();
11356 
11357     M = M % LaneSize;
11358     M = M * NumEltBytes + (i % NumEltBytes);
11359     PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
11360   }
11361   assert(V && "Failed to find a source input");
11362 
11363   MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
11364   return DAG.getBitcast(
11365       VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
11366                       DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
11367 }
11368 
11369 static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
11370                            const X86Subtarget &Subtarget, SelectionDAG &DAG,
11371                            const SDLoc &dl);
11372 
11373 // X86 has dedicated shuffle that can be lowered to VEXPAND
11374 static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT,
11375                                     const APInt &Zeroable,
11376                                     ArrayRef<int> Mask, SDValue &V1,
11377                                     SDValue &V2, SelectionDAG &DAG,
11378                                     const X86Subtarget &Subtarget) {
11379   bool IsLeftZeroSide = true;
11380   if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
11381                                 IsLeftZeroSide))
11382     return SDValue();
11383   unsigned VEXPANDMask = (~Zeroable).getZExtValue();
11384   MVT IntegerType =
11385       MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
11386   SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
11387   unsigned NumElts = VT.getVectorNumElements();
11388   assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
11389          "Unexpected number of vector elements");
11390   SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
11391                               Subtarget, DAG, DL);
11392   SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
11393   SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
11394   return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
11395 }
11396 
11397 static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
11398                                   unsigned &UnpackOpcode, bool IsUnary,
11399                                   ArrayRef<int> TargetMask, const SDLoc &DL,
11400                                   SelectionDAG &DAG,
11401                                   const X86Subtarget &Subtarget) {
11402   int NumElts = VT.getVectorNumElements();
11403 
11404   bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
11405   for (int i = 0; i != NumElts; i += 2) {
11406     int M1 = TargetMask[i + 0];
11407     int M2 = TargetMask[i + 1];
11408     Undef1 &= (SM_SentinelUndef == M1);
11409     Undef2 &= (SM_SentinelUndef == M2);
11410     Zero1 &= isUndefOrZero(M1);
11411     Zero2 &= isUndefOrZero(M2);
11412   }
11413   assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
11414          "Zeroable shuffle detected");
11415 
11416   // Attempt to match the target mask against the unpack lo/hi mask patterns.
11417   SmallVector<int, 64> Unpckl, Unpckh;
11418   createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
11419   if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, V1,
11420                                 (IsUnary ? V1 : V2))) {
11421     UnpackOpcode = X86ISD::UNPCKL;
11422     V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
11423     V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
11424     return true;
11425   }
11426 
11427   createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
11428   if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, V1,
11429                                 (IsUnary ? V1 : V2))) {
11430     UnpackOpcode = X86ISD::UNPCKH;
11431     V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
11432     V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
11433     return true;
11434   }
11435 
11436   // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
11437   if (IsUnary && (Zero1 || Zero2)) {
11438     // Don't bother if we can blend instead.
11439     if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
11440         isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
11441       return false;
11442 
11443     bool MatchLo = true, MatchHi = true;
11444     for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
11445       int M = TargetMask[i];
11446 
11447       // Ignore if the input is known to be zero or the index is undef.
11448       if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
11449           (M == SM_SentinelUndef))
11450         continue;
11451 
11452       MatchLo &= (M == Unpckl[i]);
11453       MatchHi &= (M == Unpckh[i]);
11454     }
11455 
11456     if (MatchLo || MatchHi) {
11457       UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
11458       V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
11459       V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
11460       return true;
11461     }
11462   }
11463 
11464   // If a binary shuffle, commute and try again.
11465   if (!IsUnary) {
11466     ShuffleVectorSDNode::commuteMask(Unpckl);
11467     if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl)) {
11468       UnpackOpcode = X86ISD::UNPCKL;
11469       std::swap(V1, V2);
11470       return true;
11471     }
11472 
11473     ShuffleVectorSDNode::commuteMask(Unpckh);
11474     if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh)) {
11475       UnpackOpcode = X86ISD::UNPCKH;
11476       std::swap(V1, V2);
11477       return true;
11478     }
11479   }
11480 
11481   return false;
11482 }
11483 
11484 // X86 has dedicated unpack instructions that can handle specific blend
11485 // operations: UNPCKH and UNPCKL.
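// For v4i32, the UNPCKL mask interleaves the low halves (<0,4,1,5>) and the
// UNPCKH mask the high halves (<2,6,3,7>); wider types repeat the same pattern
// within each 128-bit lane.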
11486 static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT,
11487                                      ArrayRef<int> Mask, SDValue V1, SDValue V2,
11488                                      SelectionDAG &DAG) {
11489   SmallVector<int, 8> Unpckl;
11490   createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
11491   if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
11492     return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
11493 
11494   SmallVector<int, 8> Unpckh;
11495   createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
11496   if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
11497     return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
11498 
11499   // Commute and try again.
11500   ShuffleVectorSDNode::commuteMask(Unpckl);
11501   if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
11502     return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
11503 
11504   ShuffleVectorSDNode::commuteMask(Unpckh);
11505   if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
11506     return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
11507 
11508   return SDValue();
11509 }
11510 
11511 /// Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit)
11512 /// followed by unpack 256-bit.
11513 static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT,
11514                                         ArrayRef<int> Mask, SDValue V1,
11515                                         SDValue V2, SelectionDAG &DAG) {
11516   SmallVector<int, 32> Unpckl, Unpckh;
11517   createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true);
11518   createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);
11519 
11520   unsigned UnpackOpcode;
11521   if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
11522     UnpackOpcode = X86ISD::UNPCKL;
11523   else if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
11524     UnpackOpcode = X86ISD::UNPCKH;
11525   else
11526     return SDValue();
11527 
11528   // This is a "natural" unpack operation (rather than the 128-bit sectored
11529   // operation implemented by AVX). We need to rearrange 64-bit chunks of the
11530   // input in order to use the x86 instruction.
11531   V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1),
11532                             DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
11533   V1 = DAG.getBitcast(VT, V1);
11534   return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
11535 }
11536 
11537 // Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the
11538 // source into the lower elements and zeroing the upper elements.
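// For example, a v8i32 shuffle <0,2,4,6,z,z,z,z> whose upper four elements are
// zeroable matches a truncation of the bitcast v4i64 source: SrcVT = v4i64,
// DstVT = v4i32 (an ISD::TRUNCATE, since the truncated result is 128 bits).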
11539 static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,
11540                                  ArrayRef<int> Mask, const APInt &Zeroable,
11541                                  const X86Subtarget &Subtarget) {
11542   if (!VT.is512BitVector() && !Subtarget.hasVLX())
11543     return false;
11544 
11545   unsigned NumElts = Mask.size();
11546   unsigned EltSizeInBits = VT.getScalarSizeInBits();
11547   unsigned MaxScale = 64 / EltSizeInBits;
11548 
11549   for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
11550     unsigned SrcEltBits = EltSizeInBits * Scale;
11551     if (SrcEltBits < 32 && !Subtarget.hasBWI())
11552       continue;
11553     unsigned NumSrcElts = NumElts / Scale;
11554     if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale))
11555       continue;
11556     unsigned UpperElts = NumElts - NumSrcElts;
11557     if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnesValue())
11558       continue;
11559     SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale);
11560     SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts);
11561     DstVT = MVT::getIntegerVT(EltSizeInBits);
11562     if ((NumSrcElts * EltSizeInBits) >= 128) {
11563       // ISD::TRUNCATE
11564       DstVT = MVT::getVectorVT(DstVT, NumSrcElts);
11565     } else {
11566       // X86ISD::VTRUNC
11567       DstVT = MVT::getVectorVT(DstVT, 128 / EltSizeInBits);
11568     }
11569     return true;
11570   }
11571 
11572   return false;
11573 }
11574 
11575 // Helper to create TRUNCATE/VTRUNC nodes, optionally with zero/undef upper
11576 // element padding to the final DstVT.
11577 static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src,
11578                                   const X86Subtarget &Subtarget,
11579                                   SelectionDAG &DAG, bool ZeroUppers) {
11580   MVT SrcVT = Src.getSimpleValueType();
11581   MVT DstSVT = DstVT.getScalarType();
11582   unsigned NumDstElts = DstVT.getVectorNumElements();
11583   unsigned NumSrcElts = SrcVT.getVectorNumElements();
11584   unsigned DstEltSizeInBits = DstVT.getScalarSizeInBits();
11585 
11586   if (!DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
11587     return SDValue();
11588 
11589   // Perform a direct ISD::TRUNCATE if possible.
11590   if (NumSrcElts == NumDstElts)
11591     return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src);
11592 
11593   if (NumSrcElts > NumDstElts) {
11594     MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
11595     SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
11596     return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits());
11597   }
11598 
11599   if ((NumSrcElts * DstEltSizeInBits) >= 128) {
11600     MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
11601     SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
11602     return widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
11603                           DstVT.getSizeInBits());
11604   }
11605 
11606   // Non-VLX targets must truncate from a 512-bit type, so we need to
11607   // widen, truncate and then possibly extract the original subvector.
11608   if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) {
11609     SDValue NewSrc = widenSubVector(Src, ZeroUppers, Subtarget, DAG, DL, 512);
11610     return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers);
11611   }
11612 
11613   // Fallback to a X86ISD::VTRUNC, padding if necessary.
11614   MVT TruncVT = MVT::getVectorVT(DstSVT, 128 / DstEltSizeInBits);
11615   SDValue Trunc = DAG.getNode(X86ISD::VTRUNC, DL, TruncVT, Src);
11616   if (DstVT != TruncVT)
11617     Trunc = widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
11618                            DstVT.getSizeInBits());
11619   return Trunc;
11620 }
11621 
11622 // Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
11623 //
11624 // An example is the following:
11625 //
11626 // t0: ch = EntryToken
11627 //           t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
11628 //         t25: v4i32 = truncate t2
11629 //       t41: v8i16 = bitcast t25
11630 //       t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
11631 //       Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
11632 //     t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
11633 //   t18: v2i64 = bitcast t51
11634 //
11635 // One can just use a single vpmovdw instruction. Without avx512vl we need to
11636 // use the zmm variant and extract the lower subvector, padding with zeroes.
11637 // TODO: Merge with lowerShuffleAsVTRUNC.
11638 static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1,
11639                                      SDValue V2, ArrayRef<int> Mask,
11640                                      const APInt &Zeroable,
11641                                      const X86Subtarget &Subtarget,
11642                                      SelectionDAG &DAG) {
11643   assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
11644   if (!Subtarget.hasAVX512())
11645     return SDValue();
11646 
11647   unsigned NumElts = VT.getVectorNumElements();
11648   unsigned EltSizeInBits = VT.getScalarSizeInBits();
11649   unsigned MaxScale = 64 / EltSizeInBits;
11650   for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
11651     unsigned NumSrcElts = NumElts / Scale;
11652     unsigned UpperElts = NumElts - NumSrcElts;
11653     if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
11654         !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnesValue())
11655       continue;
11656 
11657     SDValue Src = V1;
11658     if (!Src.hasOneUse())
11659       return SDValue();
11660 
11661     Src = peekThroughOneUseBitcasts(Src);
11662     if (Src.getOpcode() != ISD::TRUNCATE ||
11663         Src.getScalarValueSizeInBits() != (EltSizeInBits * Scale))
11664       return SDValue();
11665     Src = Src.getOperand(0);
11666 
11667     // VPMOVWB is only available with avx512bw.
11668     MVT SrcVT = Src.getSimpleValueType();
11669     if (SrcVT.getVectorElementType() == MVT::i16 && VT == MVT::v16i8 &&
11670         !Subtarget.hasBWI())
11671       return SDValue();
11672 
11673     bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
11674     return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
11675   }
11676 
11677   return SDValue();
11678 }
11679 
11680 // Attempt to match binary shuffle patterns as a truncate.
11681 static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
11682                                     SDValue V2, ArrayRef<int> Mask,
11683                                     const APInt &Zeroable,
11684                                     const X86Subtarget &Subtarget,
11685                                     SelectionDAG &DAG) {
11686   assert((VT.is128BitVector() || VT.is256BitVector()) &&
11687          "Unexpected VTRUNC type");
11688   if (!Subtarget.hasAVX512())
11689     return SDValue();
11690 
11691   unsigned NumElts = VT.getVectorNumElements();
11692   unsigned EltSizeInBits = VT.getScalarSizeInBits();
11693   unsigned MaxScale = 64 / EltSizeInBits;
11694   for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
11695     // TODO: Support non-BWI VPMOVWB truncations?
11696     unsigned SrcEltBits = EltSizeInBits * Scale;
11697     if (SrcEltBits < 32 && !Subtarget.hasBWI())
11698       continue;
11699 
11700     // Match shuffle <0,Scale,2*Scale,..,undef_or_zero,undef_or_zero,...>
11701     // Bail if the V2 elements are undef.
11702     unsigned NumHalfSrcElts = NumElts / Scale;
11703     unsigned NumSrcElts = 2 * NumHalfSrcElts;
11704     if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
11705         isUndefInRange(Mask, NumHalfSrcElts, NumHalfSrcElts))
11706       continue;
11707 
11708     // The elements beyond the truncation must be undef/zero.
11709     unsigned UpperElts = NumElts - NumSrcElts;
11710     if (UpperElts > 0 &&
11711         !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnesValue())
11712       continue;
11713     bool UndefUppers =
11714         UpperElts > 0 && isUndefInRange(Mask, NumSrcElts, UpperElts);
11715 
11716     // As we're using both sources, we need to concat them together
11717     // and truncate from the double-sized source.
11718     MVT ConcatVT = MVT::getVectorVT(VT.getScalarType(), NumElts * 2);
11719     SDValue Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
11720 
11721     MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
11722     MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
11723     Src = DAG.getBitcast(SrcVT, Src);
11724     return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
11725   }
11726 
11727   return SDValue();
11728 }
11729 
11730 /// Check whether a compaction lowering can be done by dropping even
11731 /// elements and compute how many times even elements must be dropped.
11732 ///
11733 /// This handles shuffles which take every Nth element where N is a power of
11734 /// two. Example shuffle masks:
11735 ///
11736 ///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14,  0,  2,  4,  6,  8, 10, 12, 14
11737 ///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
11738 ///  N = 2:  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12
11739 ///  N = 2:  0,  4,  8, 12, 16, 20, 24, 28,  0,  4,  8, 12, 16, 20, 24, 28
11740 ///  N = 3:  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8
11741 ///  N = 3:  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24
11742 ///
11743 /// Any of these lanes can of course be undef.
11744 ///
11745 /// This routine only supports N <= 3.
11746 /// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
11747 /// for larger N.
11748 ///
11749 /// \returns N above, or the number of times even elements must be dropped if
11750 /// there is such a number. Otherwise returns zero.
11751 static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
11752                                           bool IsSingleInput) {
11753   // The modulus for the shuffle vector entries is based on whether this is
11754   // a single input or not.
11755   int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
11756   assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
11757          "We should only be called with masks with a power-of-2 size!");
11758 
11759   uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
11760 
11761   // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
11762   // and 2^3 simultaneously. This is because we may have ambiguity with
11763   // partially undef inputs.
11764   bool ViableForN[3] = {true, true, true};
11765 
11766   for (int i = 0, e = Mask.size(); i < e; ++i) {
11767     // Ignore undef lanes, we'll optimistically collapse them to the pattern we
11768     // want.
11769     if (Mask[i] < 0)
11770       continue;
11771 
11772     bool IsAnyViable = false;
11773     for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
11774       if (ViableForN[j]) {
11775         uint64_t N = j + 1;
11776 
11777         // The shuffle mask must be equal to (i * 2^N) % M.
11778         if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
11779           IsAnyViable = true;
11780         else
11781           ViableForN[j] = false;
11782       }
11783     // Early exit if we exhaust the possible powers of two.
11784     if (!IsAnyViable)
11785       break;
11786   }
11787 
11788   for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
11789     if (ViableForN[j])
11790       return j + 1;
11791 
11792   // Return 0 as there is no viable power of two.
11793   return 0;
11794 }
11795 
11796 // X86 has dedicated pack instructions that can handle specific truncation
11797 // operations: PACKSS and PACKUS.
11798 // Checks for compaction shuffle masks if MaxStages > 1.
11799 // TODO: Add support for matching multiple PACKSS/PACKUS stages.
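// For example, a v16i8 mask taking every other element of a single source
// (<0,2,4,...,14> repeated) can be matched as a one-stage PACKUSWB/PACKSSWB on
// the bitcast v8i16 input, provided the discarded high byte of each word is
// known to be zero (PACKUS) or a copy of the sign bit (PACKSS).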
11800 static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
11801                                  unsigned &PackOpcode, ArrayRef<int> TargetMask,
11802                                  SelectionDAG &DAG,
11803                                  const X86Subtarget &Subtarget,
11804                                  unsigned MaxStages = 1) {
11805   unsigned NumElts = VT.getVectorNumElements();
11806   unsigned BitSize = VT.getScalarSizeInBits();
11807   assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 &&
11808          "Illegal maximum compaction");
11809 
11810   auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) {
11811     unsigned NumSrcBits = PackVT.getScalarSizeInBits();
11812     unsigned NumPackedBits = NumSrcBits - BitSize;
11813     SDValue VV1 = DAG.getBitcast(PackVT, N1);
11814     SDValue VV2 = DAG.getBitcast(PackVT, N2);
11815     if (Subtarget.hasSSE41() || BitSize == 8) {
11816       APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits);
11817       if ((N1.isUndef() || DAG.MaskedValueIsZero(VV1, ZeroMask)) &&
11818           (N2.isUndef() || DAG.MaskedValueIsZero(VV2, ZeroMask))) {
11819         V1 = VV1;
11820         V2 = VV2;
11821         SrcVT = PackVT;
11822         PackOpcode = X86ISD::PACKUS;
11823         return true;
11824       }
11825     }
11826     if ((N1.isUndef() || DAG.ComputeNumSignBits(VV1) > NumPackedBits) &&
11827         (N2.isUndef() || DAG.ComputeNumSignBits(VV2) > NumPackedBits)) {
11828       V1 = VV1;
11829       V2 = VV2;
11830       SrcVT = PackVT;
11831       PackOpcode = X86ISD::PACKSS;
11832       return true;
11833     }
11834     return false;
11835   };
11836 
11837   // Attempt to match against wider and wider compaction patterns.
11838   for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) {
11839     MVT PackSVT = MVT::getIntegerVT(BitSize << NumStages);
11840     MVT PackVT = MVT::getVectorVT(PackSVT, NumElts >> NumStages);
11841 
11842     // Try binary shuffle.
11843     SmallVector<int, 32> BinaryMask;
11844     createPackShuffleMask(VT, BinaryMask, false, NumStages);
11845     if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, V1, V2))
11846       if (MatchPACK(V1, V2, PackVT))
11847         return true;
11848 
11849     // Try unary shuffle.
11850     SmallVector<int, 32> UnaryMask;
11851     createPackShuffleMask(VT, UnaryMask, true, NumStages);
11852     if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, V1))
11853       if (MatchPACK(V1, V1, PackVT))
11854         return true;
11855   }
11856 
11857   return false;
11858 }
11859 
11860 static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
11861                                     SDValue V1, SDValue V2, SelectionDAG &DAG,
11862                                     const X86Subtarget &Subtarget) {
11863   MVT PackVT;
11864   unsigned PackOpcode;
11865   unsigned SizeBits = VT.getSizeInBits();
11866   unsigned EltBits = VT.getScalarSizeInBits();
11867   unsigned MaxStages = Log2_32(64 / EltBits);
11868   if (!matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
11869                             Subtarget, MaxStages))
11870     return SDValue();
11871 
11872   unsigned CurrentEltBits = PackVT.getScalarSizeInBits();
11873   unsigned NumStages = Log2_32(CurrentEltBits / EltBits);
11874 
11875   // Don't lower multi-stage packs on AVX512, truncation is better.
11876   if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX())
11877     return SDValue();
11878 
11879   // Pack to the largest type possible:
11880   // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
11881   unsigned MaxPackBits = 16;
11882   if (CurrentEltBits > 16 &&
11883       (PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41()))
11884     MaxPackBits = 32;
11885 
11886   // Repeatedly pack down to the target size.
11887   SDValue Res;
11888   for (unsigned i = 0; i != NumStages; ++i) {
11889     unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
11890     unsigned NumSrcElts = SizeBits / SrcEltBits;
11891     MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
11892     MVT DstSVT = MVT::getIntegerVT(SrcEltBits / 2);
11893     MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
11894     MVT DstVT = MVT::getVectorVT(DstSVT, NumSrcElts * 2);
11895     Res = DAG.getNode(PackOpcode, DL, DstVT, DAG.getBitcast(SrcVT, V1),
11896                       DAG.getBitcast(SrcVT, V2));
11897     V1 = V2 = Res;
11898     CurrentEltBits /= 2;
11899   }
11900   assert(Res && Res.getValueType() == VT &&
11901          "Failed to lower compaction shuffle");
11902   return Res;
11903 }
11904 
11905 /// Try to emit a bitmask instruction for a shuffle.
11906 ///
11907 /// This handles cases where we can model a blend exactly as a bitmask due to
11908 /// one of the inputs being zeroable.
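///
/// For example, a v4i32 shuffle <0,1,6,7> where result elements 2 and 3 are
/// known zeroable reduces to V1 & <-1,-1,0,0>.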
11909 static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
11910                                      SDValue V2, ArrayRef<int> Mask,
11911                                      const APInt &Zeroable,
11912                                      const X86Subtarget &Subtarget,
11913                                      SelectionDAG &DAG) {
11914   MVT MaskVT = VT;
11915   MVT EltVT = VT.getVectorElementType();
11916   SDValue Zero, AllOnes;
11917   // Use f64 if i64 isn't legal.
11918   if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
11919     EltVT = MVT::f64;
11920     MaskVT = MVT::getVectorVT(EltVT, Mask.size());
11921   }
11922 
11923   MVT LogicVT = VT;
11924   if (EltVT == MVT::f32 || EltVT == MVT::f64) {
11925     Zero = DAG.getConstantFP(0.0, DL, EltVT);
11926     APFloat AllOnesValue = APFloat::getAllOnesValue(
11927         SelectionDAG::EVTToAPFloatSemantics(EltVT), EltVT.getSizeInBits());
11928     AllOnes = DAG.getConstantFP(AllOnesValue, DL, EltVT);
11929     LogicVT =
11930         MVT::getVectorVT(EltVT == MVT::f64 ? MVT::i64 : MVT::i32, Mask.size());
11931   } else {
11932     Zero = DAG.getConstant(0, DL, EltVT);
11933     AllOnes = DAG.getAllOnesConstant(DL, EltVT);
11934   }
11935 
11936   SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
11937   SDValue V;
11938   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11939     if (Zeroable[i])
11940       continue;
11941     if (Mask[i] % Size != i)
11942       return SDValue(); // Not a blend.
11943     if (!V)
11944       V = Mask[i] < Size ? V1 : V2;
11945     else if (V != (Mask[i] < Size ? V1 : V2))
11946       return SDValue(); // Can only let one input through the mask.
11947 
11948     VMaskOps[i] = AllOnes;
11949   }
11950   if (!V)
11951     return SDValue(); // No non-zeroable elements!
11952 
11953   SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
11954   VMask = DAG.getBitcast(LogicVT, VMask);
11955   V = DAG.getBitcast(LogicVT, V);
11956   SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
11957   return DAG.getBitcast(VT, And);
11958 }
11959 
11960 /// Try to emit a blend instruction for a shuffle using bit math.
11961 ///
11962 /// This is used as a fallback approach when first class blend instructions are
11963 /// unavailable. Currently it is only suitable for integer vectors, but could
11964 /// be generalized for floating point vectors if desirable.
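/// For example, a v4i32 mask <0, 5, 2, 7> becomes
/// (V1 & <-1, 0, -1, 0>) | (~<-1, 0, -1, 0> & V2).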
11965 static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
11966                                       SDValue V2, ArrayRef<int> Mask,
11967                                       SelectionDAG &DAG) {
11968   assert(VT.isInteger() && "Only supports integer vector types!");
11969   MVT EltVT = VT.getVectorElementType();
11970   SDValue Zero = DAG.getConstant(0, DL, EltVT);
11971   SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
11972   SmallVector<SDValue, 16> MaskOps;
11973   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11974     if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
11975       return SDValue(); // Shuffled input!
11976     MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
11977   }
11978 
11979   SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
11980   V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
11981   V2 = DAG.getNode(X86ISD::ANDNP, DL, VT, V1Mask, V2);
11982   return DAG.getNode(ISD::OR, DL, VT, V1, V2);
11983 }
11984 
11985 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
11986                                     SDValue PreservedSrc,
11987                                     const X86Subtarget &Subtarget,
11988                                     SelectionDAG &DAG);
11989 
11990 static bool matchShuffleAsBlend(SDValue V1, SDValue V2,
11991                                 MutableArrayRef<int> Mask,
11992                                 const APInt &Zeroable, bool &ForceV1Zero,
11993                                 bool &ForceV2Zero, uint64_t &BlendMask) {
11994   bool V1IsZeroOrUndef =
11995       V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
11996   bool V2IsZeroOrUndef =
11997       V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
11998 
11999   BlendMask = 0;
12000   ForceV1Zero = false, ForceV2Zero = false;
12001   assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask");
12002 
12003   // Attempt to generate the binary blend mask. If an input is zero then
12004   // we can use any lane.
12005   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
12006     int M = Mask[i];
12007     if (M == SM_SentinelUndef)
12008       continue;
12009     if (M == i)
12010       continue;
12011     if (M == i + Size) {
12012       BlendMask |= 1ull << i;
12013       continue;
12014     }
12015     if (Zeroable[i]) {
12016       if (V1IsZeroOrUndef) {
12017         ForceV1Zero = true;
12018         Mask[i] = i;
12019         continue;
12020       }
12021       if (V2IsZeroOrUndef) {
12022         ForceV2Zero = true;
12023         BlendMask |= 1ull << i;
12024         Mask[i] = i + Size;
12025         continue;
12026       }
12027     }
12028     return false;
12029   }
12030   return true;
12031 }
12032 
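/// Widen each set bit of a blend mask into Scale consecutive bits, e.g.
/// scaling 0b0101 by 2 produces 0b00110011.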
12033 static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size,
12034                                             int Scale) {
12035   uint64_t ScaledMask = 0;
12036   for (int i = 0; i != Size; ++i)
12037     if (BlendMask & (1ull << i))
12038       ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
12039   return ScaledMask;
12040 }
12041 
12042 /// Try to emit a blend instruction for a shuffle.
12043 ///
12044 /// This doesn't do any checks for the availability of instructions for blending
12045 /// these values. It relies on the availability of the X86ISD::BLENDI pattern to
12046 /// be matched in the backend with the type given. What it does check for is
12047 /// that the shuffle mask is a blend, or convertible into a blend with zero.
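/// For example, the v8i16 mask <0, 9, 2, 11, 4, 13, 6, 15> takes the odd
/// elements from V2, giving a BLENDI immediate of 0xAA.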
12048 static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
12049                                    SDValue V2, ArrayRef<int> Original,
12050                                    const APInt &Zeroable,
12051                                    const X86Subtarget &Subtarget,
12052                                    SelectionDAG &DAG) {
12053   uint64_t BlendMask = 0;
12054   bool ForceV1Zero = false, ForceV2Zero = false;
12055   SmallVector<int, 64> Mask(Original.begin(), Original.end());
12056   if (!matchShuffleAsBlend(V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
12057                            BlendMask))
12058     return SDValue();
12059 
12060   // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
12061   if (ForceV1Zero)
12062     V1 = getZeroVector(VT, Subtarget, DAG, DL);
12063   if (ForceV2Zero)
12064     V2 = getZeroVector(VT, Subtarget, DAG, DL);
12065 
12066   switch (VT.SimpleTy) {
12067   case MVT::v4i64:
12068   case MVT::v8i32:
12069     assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
12070     LLVM_FALLTHROUGH;
12071   case MVT::v4f64:
12072   case MVT::v8f32:
12073     assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
12074     LLVM_FALLTHROUGH;
12075   case MVT::v2f64:
12076   case MVT::v2i64:
12077   case MVT::v4f32:
12078   case MVT::v4i32:
12079   case MVT::v8i16:
12080     assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
12081     return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
12082                        DAG.getTargetConstant(BlendMask, DL, MVT::i8));
12083   case MVT::v16i16: {
12084     assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
12085     SmallVector<int, 8> RepeatedMask;
12086     if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
12087       // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
12088       assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
12089       BlendMask = 0;
12090       for (int i = 0; i < 8; ++i)
12091         if (RepeatedMask[i] >= 8)
12092           BlendMask |= 1ull << i;
12093       return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
12094                          DAG.getTargetConstant(BlendMask, DL, MVT::i8));
12095     }
12096     // Use PBLENDW for lower/upper lanes and then blend lanes.
12097     // TODO - we should allow 2 PBLENDW here and leave shuffle combine to
12098     // merge to VSELECT where useful.
12099     uint64_t LoMask = BlendMask & 0xFF;
12100     uint64_t HiMask = (BlendMask >> 8) & 0xFF;
12101     if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
12102       SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
12103                                DAG.getTargetConstant(LoMask, DL, MVT::i8));
12104       SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
12105                                DAG.getTargetConstant(HiMask, DL, MVT::i8));
12106       return DAG.getVectorShuffle(
12107           MVT::v16i16, DL, Lo, Hi,
12108           {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
12109     }
12110     LLVM_FALLTHROUGH;
12111   }
12112   case MVT::v32i8:
12113     assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
12114     LLVM_FALLTHROUGH;
12115   case MVT::v16i8: {
12116     assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
12117 
12118     // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
12119     if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
12120                                                Subtarget, DAG))
12121       return Masked;
12122 
12123     if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
12124       MVT IntegerType =
12125           MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
12126       SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
12127       return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
12128     }
12129 
12130     // If we have VPTERNLOG, we can use that as a bit blend.
12131     if (Subtarget.hasVLX())
12132       if (SDValue BitBlend =
12133               lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
12134         return BitBlend;
12135 
12136     // Scale the blend by the number of bytes per element.
12137     int Scale = VT.getScalarSizeInBits() / 8;
12138 
12139     // This form of blend is always done on bytes. Compute the byte vector
12140     // type.
12141     MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
12142 
12143     // x86 allows load folding with blendvb from the 2nd source operand. But
12144     // we are still using LLVM select here (see comment below), so that's V1.
12145     // If V2 can be load-folded and V1 cannot be load-folded, then commute to
12146     // allow that load-folding possibility.
12147     if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
12148       ShuffleVectorSDNode::commuteMask(Mask);
12149       std::swap(V1, V2);
12150     }
12151 
12152     // Compute the VSELECT mask. Note that VSELECT is really confusing in the
12153     // mix of LLVM's code generator and the x86 backend. We tell the code
12154     // generator that boolean values in the elements of an x86 vector register
12155     // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
12156     // mapping a select to operand #1, and 'false' mapping to operand #2. The
12157     // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
12158     // of the element (the remaining are ignored) and 0 in that high bit would
12159     // mean operand #1 while 1 in the high bit would mean operand #2. So while
12160     // the LLVM model for boolean values in vector elements gets the relevant
12161     // bit set, it is set backwards and over constrained relative to x86's
12162     // actual model.
12163     SmallVector<SDValue, 32> VSELECTMask;
12164     for (int i = 0, Size = Mask.size(); i < Size; ++i)
12165       for (int j = 0; j < Scale; ++j)
12166         VSELECTMask.push_back(
12167             Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
12168                         : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
12169                                           MVT::i8));
12170 
12171     V1 = DAG.getBitcast(BlendVT, V1);
12172     V2 = DAG.getBitcast(BlendVT, V2);
12173     return DAG.getBitcast(
12174         VT,
12175         DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
12176                       V1, V2));
12177   }
12178   case MVT::v16f32:
12179   case MVT::v8f64:
12180   case MVT::v8i64:
12181   case MVT::v16i32:
12182   case MVT::v32i16:
12183   case MVT::v64i8: {
12184     // Attempt to lower to a bitmask if we can. Only if not optimizing for size.
12185     bool OptForSize = DAG.shouldOptForSize();
12186     if (!OptForSize) {
12187       if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
12188                                                  Subtarget, DAG))
12189         return Masked;
12190     }
12191 
12192     // Otherwise load an immediate into a GPR, cast to k-register, and use a
12193     // masked move.
12194     MVT IntegerType =
12195         MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
12196     SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
12197     return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
12198   }
12199   default:
12200     llvm_unreachable("Not a supported integer vector type!");
12201   }
12202 }
12203 
12204 /// Try to lower as a blend of elements from two inputs followed by
12205 /// a single-input permutation.
12206 ///
12207 /// This matches the pattern where we can blend elements from two inputs and
12208 /// then reduce the shuffle to a single-input permutation.
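/// For example, the v4i32 mask <2, 4, 1, 7> can be formed as the blend
/// <4, 1, 2, 7> followed by the single-input permute <2, 0, 1, 3>.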
12209 static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
12210                                              SDValue V1, SDValue V2,
12211                                              ArrayRef<int> Mask,
12212                                              SelectionDAG &DAG,
12213                                              bool ImmBlends = false) {
12214   // We build up the blend mask while checking whether a blend is a viable way
12215   // to reduce the shuffle.
12216   SmallVector<int, 32> BlendMask(Mask.size(), -1);
12217   SmallVector<int, 32> PermuteMask(Mask.size(), -1);
12218 
12219   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
12220     if (Mask[i] < 0)
12221       continue;
12222 
12223     assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
12224 
12225     if (BlendMask[Mask[i] % Size] < 0)
12226       BlendMask[Mask[i] % Size] = Mask[i];
12227     else if (BlendMask[Mask[i] % Size] != Mask[i])
12228       return SDValue(); // Can't blend in the needed input!
12229 
12230     PermuteMask[i] = Mask[i] % Size;
12231   }
12232 
12233   // If only immediate blends, then bail if the blend mask can't be widened to
12234   // i16.
12235   unsigned EltSize = VT.getScalarSizeInBits();
12236   if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
12237     return SDValue();
12238 
12239   SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
12240   return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
12241 }
12242 
12243 /// Try to lower as an unpack of elements from two inputs followed by
12244 /// a single-input permutation.
12245 ///
12246 /// This matches the pattern where we can unpack elements from two inputs and
12247 /// then reduce the shuffle to a single-input (wider) permutation.
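/// For example, the v8i16 mask <1, 9, 3, 11, 0, 8, 2, 10> can be formed as
/// UNPCKL(V1, V2) followed by the single-input permute <2, 3, 6, 7, 0, 1, 4, 5>.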
12248 static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
12249                                              SDValue V1, SDValue V2,
12250                                              ArrayRef<int> Mask,
12251                                              SelectionDAG &DAG) {
12252   int NumElts = Mask.size();
12253   int NumLanes = VT.getSizeInBits() / 128;
12254   int NumLaneElts = NumElts / NumLanes;
12255   int NumHalfLaneElts = NumLaneElts / 2;
12256 
12257   bool MatchLo = true, MatchHi = true;
12258   SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
12259 
12260   // Determine UNPCKL/UNPCKH type and operand order.
12261   for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
12262     for (int Elt = 0; Elt != NumLaneElts; ++Elt) {
12263       int M = Mask[Lane + Elt];
12264       if (M < 0)
12265         continue;
12266 
12267       SDValue &Op = Ops[Elt & 1];
12268       if (M < NumElts && (Op.isUndef() || Op == V1))
12269         Op = V1;
12270       else if (NumElts <= M && (Op.isUndef() || Op == V2))
12271         Op = V2;
12272       else
12273         return SDValue();
12274 
12275       int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
12276       MatchLo &= isUndefOrInRange(M, Lo, Mid) ||
12277                  isUndefOrInRange(M, NumElts + Lo, NumElts + Mid);
12278       MatchHi &= isUndefOrInRange(M, Mid, Hi) ||
12279                  isUndefOrInRange(M, NumElts + Mid, NumElts + Hi);
12280       if (!MatchLo && !MatchHi)
12281         return SDValue();
12282     }
12283   }
12284   assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");
12285 
12286   // Now check that each pair of elts come from the same unpack pair
12287   // and set the permute mask based on each pair.
12288   // TODO - Investigate cases where we permute individual elements.
12289   SmallVector<int, 32> PermuteMask(NumElts, -1);
12290   for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
12291     for (int Elt = 0; Elt != NumLaneElts; Elt += 2) {
12292       int M0 = Mask[Lane + Elt + 0];
12293       int M1 = Mask[Lane + Elt + 1];
12294       if (0 <= M0 && 0 <= M1 &&
12295           (M0 % NumHalfLaneElts) != (M1 % NumHalfLaneElts))
12296         return SDValue();
12297       if (0 <= M0)
12298         PermuteMask[Lane + Elt + 0] = Lane + (2 * (M0 % NumHalfLaneElts));
12299       if (0 <= M1)
12300         PermuteMask[Lane + Elt + 1] = Lane + (2 * (M1 % NumHalfLaneElts)) + 1;
12301     }
12302   }
12303 
12304   unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
12305   SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
12306   return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
12307 }
12308 
12309 /// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
12310 /// permuting the elements of the result in place.
12311 static SDValue lowerShuffleAsByteRotateAndPermute(
12312     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12313     const X86Subtarget &Subtarget, SelectionDAG &DAG) {
12314   if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
12315       (VT.is256BitVector() && !Subtarget.hasAVX2()) ||
12316       (VT.is512BitVector() && !Subtarget.hasBWI()))
12317     return SDValue();
12318 
12319   // We don't currently support lane crossing permutes.
12320   if (is128BitLaneCrossingShuffleMask(VT, Mask))
12321     return SDValue();
12322 
12323   int Scale = VT.getScalarSizeInBits() / 8;
12324   int NumLanes = VT.getSizeInBits() / 128;
12325   int NumElts = VT.getVectorNumElements();
12326   int NumEltsPerLane = NumElts / NumLanes;
12327 
12328   // Determine range of mask elts.
12329   bool Blend1 = true;
12330   bool Blend2 = true;
12331   std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
12332   std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
12333   for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
12334     for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
12335       int M = Mask[Lane + Elt];
12336       if (M < 0)
12337         continue;
12338       if (M < NumElts) {
12339         Blend1 &= (M == (Lane + Elt));
12340         assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
12341         M = M % NumEltsPerLane;
12342         Range1.first = std::min(Range1.first, M);
12343         Range1.second = std::max(Range1.second, M);
12344       } else {
12345         M -= NumElts;
12346         Blend2 &= (M == (Lane + Elt));
12347         assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
12348         M = M % NumEltsPerLane;
12349         Range2.first = std::min(Range2.first, M);
12350         Range2.second = std::max(Range2.second, M);
12351       }
12352     }
12353   }
12354 
12355   // Bail if we don't need both elements.
12356   // TODO - it might be worth doing this for unary shuffles if the permute
12357   // can be widened.
12358   if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
12359       !(0 <= Range2.first && Range2.second < NumEltsPerLane))
12360     return SDValue();
12361 
12362   if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
12363     return SDValue();
12364 
12365   // Rotate the 2 ops so we can access both ranges, then permute the result.
12366   auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
12367     MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
12368     SDValue Rotate = DAG.getBitcast(
12369         VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
12370                         DAG.getBitcast(ByteVT, Lo),
12371                         DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8)));
12372     SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
12373     for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
12374       for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
12375         int M = Mask[Lane + Elt];
12376         if (M < 0)
12377           continue;
12378         if (M < NumElts)
12379           PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
12380         else
12381           PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
12382       }
12383     }
12384     return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
12385   };
12386 
12387   // Check if the ranges are small enough to rotate from either direction.
12388   if (Range2.second < Range1.first)
12389     return RotateAndPermute(V1, V2, Range1.first, 0);
12390   if (Range1.second < Range2.first)
12391     return RotateAndPermute(V2, V1, Range2.first, NumElts);
12392   return SDValue();
12393 }
12394 
12395 /// Generic routine to decompose a shuffle and blend into independent
12396 /// blends and permutes.
12397 ///
12398 /// This matches the extremely common pattern for handling combined
12399 /// shuffle+blend operations on newer X86 ISAs where we have very fast blend
12400 /// operations. It will try to pick the best arrangement of shuffles and
12401 /// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend.
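/// For example, when the simpler strategies below fail, a v4f32 mask
/// <3, 7, 1, 6> is split into the single-input shuffles <3, u, 1, u> and
/// <u, 3, u, 2>, which are then merged with the blend mask <0, 5, 2, 7>.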
12402 static SDValue lowerShuffleAsDecomposedShuffleMerge(
12403     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12404     const X86Subtarget &Subtarget, SelectionDAG &DAG) {
12405   int NumElts = Mask.size();
12406   int NumLanes = VT.getSizeInBits() / 128;
12407   int NumEltsPerLane = NumElts / NumLanes;
12408 
12409   // Shuffle the input elements into the desired positions in V1 and V2 and
12410   // unpack/blend them together.
12411   bool IsAlternating = true;
12412   SmallVector<int, 32> V1Mask(NumElts, -1);
12413   SmallVector<int, 32> V2Mask(NumElts, -1);
12414   SmallVector<int, 32> FinalMask(NumElts, -1);
12415   for (int i = 0; i < NumElts; ++i) {
12416     int M = Mask[i];
12417     if (M >= 0 && M < NumElts) {
12418       V1Mask[i] = M;
12419       FinalMask[i] = i;
12420       IsAlternating &= (i & 1) == 0;
12421     } else if (M >= NumElts) {
12422       V2Mask[i] = M - NumElts;
12423       FinalMask[i] = i + NumElts;
12424       IsAlternating &= (i & 1) == 1;
12425     }
12426   }
12427 
12428   // Try to lower with the simpler initial blend/unpack/rotate strategies unless
12429   // one of the input shuffles would be a no-op. We prefer to shuffle inputs as
12430   // the shuffle may be able to fold with a load or other benefit. However, when
12431   // we'll have to do 2x as many shuffles in order to achieve this, a 2-input
12432   // pre-shuffle first is a better strategy.
12433   if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
12434     // Only prefer immediate blends to unpack/rotate.
12435     if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
12436                                                           DAG, true))
12437       return BlendPerm;
12438     if (SDValue UnpackPerm = lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask,
12439                                                            DAG))
12440       return UnpackPerm;
12441     if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute(
12442             DL, VT, V1, V2, Mask, Subtarget, DAG))
12443       return RotatePerm;
12444     // Unpack/rotate failed - try again with variable blends.
12445     if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
12446                                                           DAG))
12447       return BlendPerm;
12448   }
12449 
12450   // If the final mask is an alternating blend of vXi8/vXi16, convert to an
12451   // UNPCKL(SHUFFLE, SHUFFLE) pattern.
12452   // TODO: It doesn't have to be alternating - but each lane mustn't have more
12453   // than half the elements coming from each source.
12454   if (IsAlternating && VT.getScalarSizeInBits() < 32) {
12455     V1Mask.assign(NumElts, -1);
12456     V2Mask.assign(NumElts, -1);
12457     FinalMask.assign(NumElts, -1);
12458     for (int i = 0; i != NumElts; i += NumEltsPerLane)
12459       for (int j = 0; j != NumEltsPerLane; ++j) {
12460         int M = Mask[i + j];
12461         if (M >= 0 && M < NumElts) {
12462           V1Mask[i + (j / 2)] = M;
12463           FinalMask[i + j] = i + (j / 2);
12464         } else if (M >= NumElts) {
12465           V2Mask[i + (j / 2)] = M - NumElts;
12466           FinalMask[i + j] = i + (j / 2) + NumElts;
12467         }
12468       }
12469   }
12470 
12471   V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
12472   V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
12473   return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask);
12474 }
12475 
12476 /// Try to lower a vector shuffle as a bit rotation.
12477 ///
12478 /// Look for a repeated rotation pattern in each sub group.
12479 /// Returns an ISD::ROTL element rotation amount or -1 if failed.
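/// For example, the v16i8 mask <1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14,
/// 15, 12> repeats a rotation by 3 byte elements within each 4-byte group,
/// i.e. a ROTL by 24 bits of each i32 group.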
12480 static int matchShuffleAsBitRotate(ArrayRef<int> Mask, int NumSubElts) {
12481   int NumElts = Mask.size();
12482   assert((NumElts % NumSubElts) == 0 && "Illegal shuffle mask");
12483 
12484   int RotateAmt = -1;
12485   for (int i = 0; i != NumElts; i += NumSubElts) {
12486     for (int j = 0; j != NumSubElts; ++j) {
12487       int M = Mask[i + j];
12488       if (M < 0)
12489         continue;
12490       if (!isInRange(M, i, i + NumSubElts))
12491         return -1;
12492       int Offset = (NumSubElts - (M - (i + j))) % NumSubElts;
12493       if (0 <= RotateAmt && Offset != RotateAmt)
12494         return -1;
12495       RotateAmt = Offset;
12496     }
12497   }
12498   return RotateAmt;
12499 }
12500 
12501 static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
12502                                    const X86Subtarget &Subtarget,
12503                                    ArrayRef<int> Mask) {
12504   assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
12505   assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
12506 
12507   // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
12508   int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
12509   int MaxSubElts = 64 / EltSizeInBits;
12510   for (int NumSubElts = MinSubElts; NumSubElts <= MaxSubElts; NumSubElts *= 2) {
12511     int RotateAmt = matchShuffleAsBitRotate(Mask, NumSubElts);
12512     if (RotateAmt < 0)
12513       continue;
12514 
12515     int NumElts = Mask.size();
12516     MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
12517     RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
12518     return RotateAmt * EltSizeInBits;
12519   }
12520 
12521   return -1;
12522 }
12523 
12524 /// Lower shuffle using X86ISD::VROTLI rotations.
12525 static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
12526                                        ArrayRef<int> Mask,
12527                                        const X86Subtarget &Subtarget,
12528                                        SelectionDAG &DAG) {
12529   // Only XOP + AVX512 targets have bit rotation instructions.
12530   // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
12531   bool IsLegal =
12532       (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
12533   if (!IsLegal && Subtarget.hasSSE3())
12534     return SDValue();
12535 
12536   MVT RotateVT;
12537   int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(),
12538                                           Subtarget, Mask);
12539   if (RotateAmt < 0)
12540     return SDValue();
12541 
12542   // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
12543   // expanded to OR(SRL,SHL), will be more efficient, but if they can
12544     // widen to vXi16 or more then the existing lowering will be better.
12545   if (!IsLegal) {
12546     if ((RotateAmt % 16) == 0)
12547       return SDValue();
12548     // TODO: Use getTargetVShiftByConstNode.
12549     unsigned ShlAmt = RotateAmt;
12550     unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
12551     V1 = DAG.getBitcast(RotateVT, V1);
12552     SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
12553                               DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
12554     SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
12555                               DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
12556     SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
12557     return DAG.getBitcast(VT, Rot);
12558   }
12559 
12560   SDValue Rot =
12561       DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
12562                   DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
12563   return DAG.getBitcast(VT, Rot);
12564 }
12565 
12566 /// Try to match a vector shuffle as an element rotation.
12567 ///
12568 /// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
12569 static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2,
12570                                        ArrayRef<int> Mask) {
12571   int NumElts = Mask.size();
12572 
12573   // We need to detect various ways of spelling a rotation:
12574   //   [11, 12, 13, 14, 15,  0,  1,  2]
12575   //   [-1, 12, 13, 14, -1, -1,  1, -1]
12576   //   [-1, -1, -1, -1, -1, -1,  1,  2]
12577   //   [ 3,  4,  5,  6,  7,  8,  9, 10]
12578   //   [-1,  4,  5,  6, -1, -1,  9, -1]
12579   //   [-1,  4,  5,  6, -1, -1, -1, -1]
12580   int Rotation = 0;
12581   SDValue Lo, Hi;
12582   for (int i = 0; i < NumElts; ++i) {
12583     int M = Mask[i];
12584     assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
12585            "Unexpected mask index.");
12586     if (M < 0)
12587       continue;
12588 
12589     // Determine where a rotated vector would have started.
12590     int StartIdx = i - (M % NumElts);
12591     if (StartIdx == 0)
12592       // The identity rotation isn't interesting, stop.
12593       return -1;
12594 
12595     // If we found the tail of a vector the rotation must be the missing
12596     // front. If we found the head of a vector, it must be how much of the
12597     // head.
12598     int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
12599 
12600     if (Rotation == 0)
12601       Rotation = CandidateRotation;
12602     else if (Rotation != CandidateRotation)
12603       // The rotations don't match, so we can't match this mask.
12604       return -1;
12605 
12606     // Compute which value this mask is pointing at.
12607     SDValue MaskV = M < NumElts ? V1 : V2;
12608 
12609     // Compute which of the two target values this index should be assigned
12610     // to. This reflects whether the high elements are remaining or the low
12611     // elements are remaining.
12612     SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
12613 
12614     // Either set up this value if we've not encountered it before, or check
12615     // that it remains consistent.
12616     if (!TargetV)
12617       TargetV = MaskV;
12618     else if (TargetV != MaskV)
12619       // This may be a rotation, but it pulls from the inputs in some
12620       // unsupported interleaving.
12621       return -1;
12622   }
12623 
12624   // Check that we successfully analyzed the mask, and normalize the results.
12625   assert(Rotation != 0 && "Failed to locate a viable rotation!");
12626   assert((Lo || Hi) && "Failed to find a rotated input vector!");
12627   if (!Lo)
12628     Lo = Hi;
12629   else if (!Hi)
12630     Hi = Lo;
12631 
12632   V1 = Lo;
12633   V2 = Hi;
12634 
12635   return Rotation;
12636 }
12637 
12638 /// Try to lower a vector shuffle as a byte rotation.
12639 ///
12640 /// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
12641 /// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
12642 /// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
12643 /// try to generically lower a vector shuffle through such a pattern. It
12644 /// does not check for the profitability of lowering either as PALIGNR or
12645 /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
12646 /// This matches shuffle vectors that look like:
12647 ///
12648 ///   v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
12649 ///
12650 /// Essentially it concatenates V1 and V2, shifts right by some number of
12651 /// elements, and takes the low elements as the result. Note that while this is
12652 /// specified as a *right shift* because x86 is little-endian, it is a *left
12653 /// rotate* of the vector lanes.
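/// For the v8i16 example above the element rotation is 3, which is scaled by
/// the 2 bytes per element to a PALIGNR immediate of 6.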
12654 static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
12655                                     ArrayRef<int> Mask) {
12656   // Don't accept any shuffles with zero elements.
12657   if (isAnyZero(Mask))
12658     return -1;
12659 
12660   // PALIGNR works on 128-bit lanes.
12661   SmallVector<int, 16> RepeatedMask;
12662   if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
12663     return -1;
12664 
12665   int Rotation = matchShuffleAsElementRotate(V1, V2, RepeatedMask);
12666   if (Rotation <= 0)
12667     return -1;
12668 
12669   // PALIGNR rotates bytes, so we need to scale the
12670   // rotation based on how many bytes are in the vector lane.
12671   int NumElts = RepeatedMask.size();
12672   int Scale = 16 / NumElts;
12673   return Rotation * Scale;
12674 }
12675 
12676 static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
12677                                         SDValue V2, ArrayRef<int> Mask,
12678                                         const X86Subtarget &Subtarget,
12679                                         SelectionDAG &DAG) {
12680   assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
12681 
12682   SDValue Lo = V1, Hi = V2;
12683   int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
12684   if (ByteRotation <= 0)
12685     return SDValue();
12686 
12687   // Cast the inputs to i8 vector of correct length to match PALIGNR or
12688   // PSLLDQ/PSRLDQ.
12689   MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
12690   Lo = DAG.getBitcast(ByteVT, Lo);
12691   Hi = DAG.getBitcast(ByteVT, Hi);
12692 
12693   // SSSE3 targets can use the palignr instruction.
12694   if (Subtarget.hasSSSE3()) {
12695     assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
12696            "512-bit PALIGNR requires BWI instructions");
12697     return DAG.getBitcast(
12698         VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
12699                         DAG.getTargetConstant(ByteRotation, DL, MVT::i8)));
12700   }
12701 
12702   assert(VT.is128BitVector() &&
12703          "Rotate-based lowering only supports 128-bit lowering!");
12704   assert(Mask.size() <= 16 &&
12705          "Can shuffle at most 16 bytes in a 128-bit vector!");
12706   assert(ByteVT == MVT::v16i8 &&
12707          "SSE2 rotate lowering only needed for v16i8!");
12708 
12709   // Default SSE2 implementation
12710   int LoByteShift = 16 - ByteRotation;
12711   int HiByteShift = ByteRotation;
12712 
12713   SDValue LoShift =
12714       DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
12715                   DAG.getTargetConstant(LoByteShift, DL, MVT::i8));
12716   SDValue HiShift =
12717       DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
12718                   DAG.getTargetConstant(HiByteShift, DL, MVT::i8));
12719   return DAG.getBitcast(VT,
12720                         DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
12721 }
12722 
12723 /// Try to lower a vector shuffle as a dword/qword rotation.
12724 ///
12725 /// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
12726 /// rotation of the concatenation of two vectors; this routine will
12727 /// try to generically lower a vector shuffle through such a pattern.
12728 ///
12729 /// Essentially it concatenates V1 and V2, shifts right by some number of
12730 /// elements, and takes the low elements as the result. Note that while this is
12731 /// specified as a *right shift* because x86 is little-endian, it is a *left
12732 /// rotate* of the vector lanes.
12733 static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1,
12734                                     SDValue V2, ArrayRef<int> Mask,
12735                                     const X86Subtarget &Subtarget,
12736                                     SelectionDAG &DAG) {
12737   assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
12738          "Only 32-bit and 64-bit elements are supported!");
12739 
12740   // 128/256-bit vectors are only supported with VLX.
12741   assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
12742          && "VLX required for 128/256-bit vectors");
12743 
12744   SDValue Lo = V1, Hi = V2;
12745   int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask);
12746   if (Rotation <= 0)
12747     return SDValue();
12748 
12749   return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
12750                      DAG.getTargetConstant(Rotation, DL, MVT::i8));
12751 }
12752 
12753 /// Try to lower a vector shuffle as a byte shift sequence.
12754 static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1,
12755                                            SDValue V2, ArrayRef<int> Mask,
12756                                            const APInt &Zeroable,
12757                                            const X86Subtarget &Subtarget,
12758                                            SelectionDAG &DAG) {
12759   assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
12760   assert(VT.is128BitVector() && "Only 128-bit vectors supported");
12761 
12762   // We need a shuffle that has zeros at one/both ends and a sequential
12763   // shuffle from one source within.
12764   unsigned ZeroLo = Zeroable.countTrailingOnes();
12765   unsigned ZeroHi = Zeroable.countLeadingOnes();
12766   if (!ZeroLo && !ZeroHi)
12767     return SDValue();
12768 
12769   unsigned NumElts = Mask.size();
12770   unsigned Len = NumElts - (ZeroLo + ZeroHi);
12771   if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
12772     return SDValue();
12773 
12774   unsigned Scale = VT.getScalarSizeInBits() / 8;
12775   ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
12776   if (!isUndefOrInRange(StubMask, 0, NumElts) &&
12777       !isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
12778     return SDValue();
12779 
12780   SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
12781   Res = DAG.getBitcast(MVT::v16i8, Res);
12782 
12783   // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
12784   // inner sequential set of elements, possibly offset:
12785   // 01234567 --> zzzzzz01 --> 1zzzzzzz
12786   // 01234567 --> 4567zzzz --> zzzzz456
12787   // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
12788   if (ZeroLo == 0) {
12789     unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
12790     Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12791                       DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12792     Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12793                       DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8));
12794   } else if (ZeroHi == 0) {
12795     unsigned Shift = Mask[ZeroLo] % NumElts;
12796     Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12797                       DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12798     Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12799                       DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
12800   } else if (!Subtarget.hasSSSE3()) {
12801     // If we don't have PSHUFB then it's worth avoiding an AND constant mask
12802     // by performing 3 byte shifts. Shuffle combining can kick in above that.
12803     // TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
12804     unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
12805     Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12806                       DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12807     Shift += Mask[ZeroLo] % NumElts;
12808     Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12809                       DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12810     Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12811                       DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
12812   } else
12813     return SDValue();
12814 
12815   return DAG.getBitcast(VT, Res);
12816 }
12817 
12818 /// Try to lower a vector shuffle as a bit shift (shifts in zeros).
12819 ///
12820 /// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
12821 /// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
12822 /// matches elements from one of the input vectors shuffled to the left or
12823 /// right with zeroable elements 'shifted in'. It handles both the strictly
12824 /// bit-wise element shifts and the byte shift across an entire 128-bit double
12825 /// quad word lane.
12826 ///
12827 /// PSHL : (little-endian) left bit shift.
12828 /// [ zz, 0, zz,  2 ]
12829 /// [ -1, 4, zz, -1 ]
12830 /// PSRL : (little-endian) right bit shift.
12831 /// [  1, zz,  3, zz]
12832 /// [ -1, -1,  7, zz]
12833 /// PSLLDQ : (little-endian) left byte shift
12834 /// [ zz,  0,  1,  2,  3,  4,  5,  6]
12835 /// [ zz, zz, -1, -1,  2,  3,  4, -1]
12836 /// [ zz, zz, zz, zz, zz, zz, -1,  1]
12837 /// PSRLDQ : (little-endian) right byte shift
12838 /// [  5, 6,  7, zz, zz, zz, zz, zz]
12839 /// [ -1, 5,  6,  7, zz, zz, zz, zz]
12840 /// [  1, 2, -1, -1, -1, -1, zz, zz]
12841 static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
12842                                unsigned ScalarSizeInBits, ArrayRef<int> Mask,
12843                                int MaskOffset, const APInt &Zeroable,
12844                                const X86Subtarget &Subtarget) {
12845   int Size = Mask.size();
12846   unsigned SizeInBits = Size * ScalarSizeInBits;
12847 
12848   auto CheckZeros = [&](int Shift, int Scale, bool Left) {
12849     for (int i = 0; i < Size; i += Scale)
12850       for (int j = 0; j < Shift; ++j)
12851         if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
12852           return false;
12853 
12854     return true;
12855   };
12856 
12857   auto MatchShift = [&](int Shift, int Scale, bool Left) {
12858     for (int i = 0; i != Size; i += Scale) {
12859       unsigned Pos = Left ? i + Shift : i;
12860       unsigned Low = Left ? i : i + Shift;
12861       unsigned Len = Scale - Shift;
12862       if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
12863         return -1;
12864     }
12865 
12866     int ShiftEltBits = ScalarSizeInBits * Scale;
12867     bool ByteShift = ShiftEltBits > 64;
12868     Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
12869                   : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
12870     int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
12871 
12872     // Normalize the scale for byte shifts to still produce an i64 element
12873     // type.
12874     Scale = ByteShift ? Scale / 2 : Scale;
12875 
12876     // We need to round trip through the appropriate type for the shift.
12877     MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
12878     ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
12879                         : MVT::getVectorVT(ShiftSVT, Size / Scale);
12880     return (int)ShiftAmt;
12881   };
12882 
12883   // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
12884   // keep doubling the size of the integer elements up to that. We can
12885   // then shift the elements of the integer vector by whole multiples of
12886   // their width within the elements of the larger integer vector. Test each
12887   // multiple to see if we can find a match with the moved element indices
12888   // and that the shifted in elements are all zeroable.
12889   unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
12890   for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
12891     for (int Shift = 1; Shift != Scale; ++Shift)
12892       for (bool Left : {true, false})
12893         if (CheckZeros(Shift, Scale, Left)) {
12894           int ShiftAmt = MatchShift(Shift, Scale, Left);
12895           if (0 < ShiftAmt)
12896             return ShiftAmt;
12897         }
12898 
12899   // no match
12900   return -1;
12901 }
12902 
12903 static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
12904                                    SDValue V2, ArrayRef<int> Mask,
12905                                    const APInt &Zeroable,
12906                                    const X86Subtarget &Subtarget,
12907                                    SelectionDAG &DAG) {
12908   int Size = Mask.size();
12909   assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12910 
12911   MVT ShiftVT;
12912   SDValue V = V1;
12913   unsigned Opcode;
12914 
12915   // Try to match shuffle against V1 shift.
12916   int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
12917                                      Mask, 0, Zeroable, Subtarget);
12918 
12919   // If V1 failed, try to match shuffle against V2 shift.
12920   if (ShiftAmt < 0) {
12921     ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
12922                                    Mask, Size, Zeroable, Subtarget);
12923     V = V2;
12924   }
12925 
12926   if (ShiftAmt < 0)
12927     return SDValue();
12928 
12929   assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
12930          "Illegal integer vector type");
12931   V = DAG.getBitcast(ShiftVT, V);
12932   V = DAG.getNode(Opcode, DL, ShiftVT, V,
12933                   DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
12934   return DAG.getBitcast(VT, V);
12935 }
12936 
12937 // EXTRQ: Extract Len elements from lower half of source, starting at Idx.
12938 // Remainder of lower half result is zero and upper half is all undef.
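// For example, the v8i16 mask <1, 2, 3, zz, u, u, u, u> extracts 3 elements
// starting at index 1, giving BitLen = 48 and BitIdx = 16.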
12939 static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
12940                                 ArrayRef<int> Mask, uint64_t &BitLen,
12941                                 uint64_t &BitIdx, const APInt &Zeroable) {
12942   int Size = Mask.size();
12943   int HalfSize = Size / 2;
12944   assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12945   assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask");
12946 
12947   // Upper half must be undefined.
12948   if (!isUndefUpperHalf(Mask))
12949     return false;
12950 
12951   // Determine the extraction length from the part of the
12952   // lower half that isn't zeroable.
12953   int Len = HalfSize;
12954   for (; Len > 0; --Len)
12955     if (!Zeroable[Len - 1])
12956       break;
12957   assert(Len > 0 && "Zeroable shuffle mask");
12958 
12959   // Attempt to match first Len sequential elements from the lower half.
12960   SDValue Src;
12961   int Idx = -1;
12962   for (int i = 0; i != Len; ++i) {
12963     int M = Mask[i];
12964     if (M == SM_SentinelUndef)
12965       continue;
12966     SDValue &V = (M < Size ? V1 : V2);
12967     M = M % Size;
12968 
12969     // The extracted elements must start at a valid index and all mask
12970     // elements must be in the lower half.
12971     if (i > M || M >= HalfSize)
12972       return false;
12973 
12974     if (Idx < 0 || (Src == V && Idx == (M - i))) {
12975       Src = V;
12976       Idx = M - i;
12977       continue;
12978     }
12979     return false;
12980   }
12981 
12982   if (!Src || Idx < 0)
12983     return false;
12984 
12985   assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
12986   BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
12987   BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
12988   V1 = Src;
12989   return true;
12990 }
12991 
12992 // INSERTQ: Extract lowest Len elements from lower half of second source and
12993 // insert over first source, starting at Idx.
12994 // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
12995 static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
12996                                   ArrayRef<int> Mask, uint64_t &BitLen,
12997                                   uint64_t &BitIdx) {
12998   int Size = Mask.size();
12999   int HalfSize = Size / 2;
13000   assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
13001 
13002   // Upper half must be undefined.
13003   if (!isUndefUpperHalf(Mask))
13004     return false;
13005 
13006   for (int Idx = 0; Idx != HalfSize; ++Idx) {
13007     SDValue Base;
13008 
13009     // Attempt to match first source from mask before insertion point.
13010     if (isUndefInRange(Mask, 0, Idx)) {
13011       /* EMPTY */
13012     } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
13013       Base = V1;
13014     } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
13015       Base = V2;
13016     } else {
13017       continue;
13018     }
13019 
13020     // Extend the extraction length looking to match both the insertion of
13021     // the second source and the remaining elements of the first.
13022     for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
13023       SDValue Insert;
13024       int Len = Hi - Idx;
13025 
13026       // Match insertion.
13027       if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
13028         Insert = V1;
13029       } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
13030         Insert = V2;
13031       } else {
13032         continue;
13033       }
13034 
13035       // Match the remaining elements of the lower half.
13036       if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
13037         /* EMPTY */
13038       } else if ((!Base || (Base == V1)) &&
13039                  isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
13040         Base = V1;
13041       } else if ((!Base || (Base == V2)) &&
13042                  isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
13043                                             Size + Hi)) {
13044         Base = V2;
13045       } else {
13046         continue;
13047       }
13048 
13049       BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
13050       BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
13051       V1 = Base;
13052       V2 = Insert;
13053       return true;
13054     }
13055   }
13056 
13057   return false;
13058 }
13059 
13060 /// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
13061 static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
13062                                      SDValue V2, ArrayRef<int> Mask,
13063                                      const APInt &Zeroable, SelectionDAG &DAG) {
13064   uint64_t BitLen, BitIdx;
13065   if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
13066     return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
13067                        DAG.getTargetConstant(BitLen, DL, MVT::i8),
13068                        DAG.getTargetConstant(BitIdx, DL, MVT::i8));
13069 
13070   if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
13071     return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
13072                        V2 ? V2 : DAG.getUNDEF(VT),
13073                        DAG.getTargetConstant(BitLen, DL, MVT::i8),
13074                        DAG.getTargetConstant(BitIdx, DL, MVT::i8));
13075 
13076   return SDValue();
13077 }
13078 
13079 /// Lower a vector shuffle as a zero or any extension.
13080 ///
13081 /// Given a specific number of elements, element bit width, and extension
13082 /// stride, produce either a zero or any extension based on the available
13083 /// features of the subtarget. The extended elements are consecutive and
13084 /// can start from an offset element index in the input; to avoid excess
13085 /// shuffling the offset must either be in the bottom lane or at the start
13086 /// of a higher lane. All extended elements must be from
13087 /// the same lane.
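/// For example, with Scale == 2 and Offset == 0 a v16i8 input is zero
/// extended to v8i16 (and bitcast back); on SSE4.1 this maps to
/// ZERO_EXTEND_VECTOR_INREG (PMOVZXBW).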
13088 static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(
13089     const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
13090     ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13091   assert(Scale > 1 && "Need a scale to extend.");
13092   int EltBits = VT.getScalarSizeInBits();
13093   int NumElements = VT.getVectorNumElements();
13094   int NumEltsPerLane = 128 / EltBits;
13095   int OffsetLane = Offset / NumEltsPerLane;
13096   assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
13097          "Only 8, 16, and 32 bit elements can be extended.");
13098   assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
13099   assert(0 <= Offset && "Extension offset must be positive.");
13100   assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
13101          "Extension offset must be in the first lane or start an upper lane.");
13102 
13103   // Check that an index is in the same lane as the base offset.
13104   auto SafeOffset = [&](int Idx) {
13105     return OffsetLane == (Idx / NumEltsPerLane);
13106   };
13107 
13108   // Shift along an input so that the offset base moves to the first element.
13109   auto ShuffleOffset = [&](SDValue V) {
13110     if (!Offset)
13111       return V;
13112 
13113     SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
13114     for (int i = 0; i * Scale < NumElements; ++i) {
13115       int SrcIdx = i + Offset;
13116       ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
13117     }
13118     return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
13119   };
13120 
13121   // Found a valid a/zext mask! Try various lowering strategies based on the
13122   // input type and available ISA extensions.
13123   if (Subtarget.hasSSE41()) {
13124     // Not worth offsetting 128-bit vectors if scale == 2; a pattern using
13125     // PUNPCK will catch this in a later shuffle match.
13126     if (Offset && Scale == 2 && VT.is128BitVector())
13127       return SDValue();
13128     MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
13129                                  NumElements / Scale);
13130     InputV = ShuffleOffset(InputV);
13131     InputV = getEXTEND_VECTOR_INREG(AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND,
13132                                     DL, ExtVT, InputV, DAG);
13133     return DAG.getBitcast(VT, InputV);
13134   }
13135 
13136   assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
13137 
13138   // For any-extends we can cheat for larger element sizes and use shuffle
13139   // instructions that can fold with a load and/or copy.
13140   if (AnyExt && EltBits == 32) {
13141     int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
13142                          -1};
13143     return DAG.getBitcast(
13144         VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
13145                         DAG.getBitcast(MVT::v4i32, InputV),
13146                         getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
13147   }
13148   if (AnyExt && EltBits == 16 && Scale > 2) {
13149     int PSHUFDMask[4] = {Offset / 2, -1,
13150                          SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
13151     InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
13152                          DAG.getBitcast(MVT::v4i32, InputV),
13153                          getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
13154     int PSHUFWMask[4] = {1, -1, -1, -1};
13155     unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
13156     return DAG.getBitcast(
13157         VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
13158                         DAG.getBitcast(MVT::v8i16, InputV),
13159                         getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
13160   }
13161 
13162   // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
13163   // to 64-bits.
13164   if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
13165     assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
13166     assert(VT.is128BitVector() && "Unexpected vector width!");
13167 
13168     int LoIdx = Offset * EltBits;
13169     SDValue Lo = DAG.getBitcast(
13170         MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
13171                                 DAG.getTargetConstant(EltBits, DL, MVT::i8),
13172                                 DAG.getTargetConstant(LoIdx, DL, MVT::i8)));
13173 
13174     if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
13175       return DAG.getBitcast(VT, Lo);
13176 
13177     int HiIdx = (Offset + 1) * EltBits;
13178     SDValue Hi = DAG.getBitcast(
13179         MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
13180                                 DAG.getTargetConstant(EltBits, DL, MVT::i8),
13181                                 DAG.getTargetConstant(HiIdx, DL, MVT::i8)));
13182     return DAG.getBitcast(VT,
13183                           DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
13184   }
13185 
13186   // If this would require more than 2 unpack instructions to expand, use
13187   // pshufb when available. We can only use more than 2 unpack instructions
13188   // when zero extending i8 elements which also makes it easier to use pshufb.
13189   if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
13190     assert(NumElements == 16 && "Unexpected byte vector width!");
13191     SDValue PSHUFBMask[16];
13192     for (int i = 0; i < 16; ++i) {
13193       int Idx = Offset + (i / Scale);
13194       if ((i % Scale == 0 && SafeOffset(Idx))) {
13195         PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8);
13196         continue;
13197       }
13198       PSHUFBMask[i] =
13199           AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8);
13200     }
13201     InputV = DAG.getBitcast(MVT::v16i8, InputV);
13202     return DAG.getBitcast(
13203         VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
13204                         DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
13205   }
13206 
13207   // If we are extending from an offset, ensure we start on a boundary that
13208   // we can unpack from.
13209   int AlignToUnpack = Offset % (NumElements / Scale);
13210   if (AlignToUnpack) {
13211     SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
13212     for (int i = AlignToUnpack; i < NumElements; ++i)
13213       ShMask[i - AlignToUnpack] = i;
13214     InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
13215     Offset -= AlignToUnpack;
13216   }
13217 
13218   // Otherwise emit a sequence of unpacks.
13219   do {
13220     unsigned UnpackLoHi = X86ISD::UNPCKL;
13221     if (Offset >= (NumElements / 2)) {
13222       UnpackLoHi = X86ISD::UNPCKH;
13223       Offset -= (NumElements / 2);
13224     }
13225 
13226     MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
13227     SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
13228                          : getZeroVector(InputVT, Subtarget, DAG, DL);
13229     InputV = DAG.getBitcast(InputVT, InputV);
13230     InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
13231     Scale /= 2;
13232     EltBits *= 2;
13233     NumElements /= 2;
13234   } while (Scale > 1);
13235   return DAG.getBitcast(VT, InputV);
13236 }
13237 
13238 /// Try to lower a vector shuffle as a zero extension on any microarch.
13239 ///
13240 /// This routine will try to do everything in its power to cleverly lower
13241 /// a shuffle which happens to match the pattern of a zero extend. It doesn't
13242 /// check for the profitability of this lowering; it tries to aggressively
13243 /// match this pattern. It will use all of the micro-architectural details it
13244 /// can to emit an efficient lowering. It handles both blends with all-zero
13245 /// inputs (to explicitly zero-extend) and undef lanes (sometimes undef due
13246 /// to being masked out later).
13247 ///
13248 /// The reason we have dedicated lowering for zext-style shuffles is that they
13249 /// are both incredibly common and often quite performance sensitive.
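///
/// As an illustration, a v4i32 shuffle with mask <0, 4, 1, 4> where V2 is an
/// all-zeros vector is matched here with Scale == 2 and lowered to PMOVZXDQ on
/// SSE4.1, or to an unpack against zero otherwise.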
13250 static SDValue lowerShuffleAsZeroOrAnyExtend(
13251     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13252     const APInt &Zeroable, const X86Subtarget &Subtarget,
13253     SelectionDAG &DAG) {
13254   int Bits = VT.getSizeInBits();
13255   int NumLanes = Bits / 128;
13256   int NumElements = VT.getVectorNumElements();
13257   int NumEltsPerLane = NumElements / NumLanes;
13258   assert(VT.getScalarSizeInBits() <= 32 &&
13259          "Exceeds 32-bit integer zero extension limit");
13260   assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
13261 
13262   // Define a helper function to check a particular ext-scale and lower to it if
13263   // valid.
13264   auto Lower = [&](int Scale) -> SDValue {
13265     SDValue InputV;
13266     bool AnyExt = true;
13267     int Offset = 0;
13268     int Matches = 0;
13269     for (int i = 0; i < NumElements; ++i) {
13270       int M = Mask[i];
13271       if (M < 0)
13272         continue; // Valid anywhere but doesn't tell us anything.
13273       if (i % Scale != 0) {
13274         // Each of the extended elements needs to be zeroable.
13275         if (!Zeroable[i])
13276           return SDValue();
13277 
13278         // We no longer are in the anyext case.
13279         AnyExt = false;
13280         continue;
13281       }
13282 
13283       // The base elements need to be consecutive indices into the
13284       // same input vector.
13285       SDValue V = M < NumElements ? V1 : V2;
13286       M = M % NumElements;
13287       if (!InputV) {
13288         InputV = V;
13289         Offset = M - (i / Scale);
13290       } else if (InputV != V)
13291         return SDValue(); // Flip-flopping inputs.
13292 
13293       // Offset must start in the lowest 128-bit lane or at the start of an
13294       // upper lane.
13295       // FIXME: Is it ever worth allowing a negative base offset?
13296       if (!((0 <= Offset && Offset < NumEltsPerLane) ||
13297             (Offset % NumEltsPerLane) == 0))
13298         return SDValue();
13299 
13300       // If we are offsetting, all referenced entries must come from the same
13301       // lane.
13302       if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
13303         return SDValue();
13304 
13305       if ((M % NumElements) != (Offset + (i / Scale)))
13306         return SDValue(); // Non-consecutive strided elements.
13307       Matches++;
13308     }
13309 
13310     // If we fail to find an input, we have a zero-shuffle which should always
13311     // have already been handled.
13312     // FIXME: Maybe handle this here in case during blending we end up with one?
13313     if (!InputV)
13314       return SDValue();
13315 
13316     // If we are offsetting, don't extend if we only match a single input; we
13317     // can always do better by using a basic PSHUF or PUNPCK.
13318     if (Offset != 0 && Matches < 2)
13319       return SDValue();
13320 
13321     return lowerShuffleAsSpecificZeroOrAnyExtend(DL, VT, Scale, Offset, AnyExt,
13322                                                  InputV, Mask, Subtarget, DAG);
13323   };
13324 
13325   // The widest scale possible for extending is to a 64-bit integer.
13326   assert(Bits % 64 == 0 &&
13327          "The number of bits in a vector must be divisible by 64 on x86!");
13328   int NumExtElements = Bits / 64;
13329 
13330   // Each iteration, try extending the elements half as much, but into twice as
13331   // many elements.
13332   for (; NumExtElements < NumElements; NumExtElements *= 2) {
13333     assert(NumElements % NumExtElements == 0 &&
13334            "The input vector size must be divisible by the extended size.");
13335     if (SDValue V = Lower(NumElements / NumExtElements))
13336       return V;
13337   }
13338 
13339   // General extends failed, but 128-bit vectors may be able to use MOVQ.
13340   if (Bits != 128)
13341     return SDValue();
13342 
13343   // Returns one of the source operands if the shuffle can be reduced to a
13344   // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
13345   auto CanZExtLowHalf = [&]() {
13346     for (int i = NumElements / 2; i != NumElements; ++i)
13347       if (!Zeroable[i])
13348         return SDValue();
13349     if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
13350       return V1;
13351     if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
13352       return V2;
13353     return SDValue();
13354   };
13355 
13356   if (SDValue V = CanZExtLowHalf()) {
13357     V = DAG.getBitcast(MVT::v2i64, V);
13358     V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
13359     return DAG.getBitcast(VT, V);
13360   }
13361 
13362   // No viable ext lowering found.
13363   return SDValue();
13364 }
13365 
13366 /// Try to get a scalar value for a specific element of a vector.
13367 ///
13368 /// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
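///
/// For example, given V == (v4i32 (scalar_to_vector i32 %x)) and Idx == 0,
/// this returns %x; a mismatched element size after looking through bitcasts
/// yields no value.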
13369 static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
13370                                               SelectionDAG &DAG) {
13371   MVT VT = V.getSimpleValueType();
13372   MVT EltVT = VT.getVectorElementType();
13373   V = peekThroughBitcasts(V);
13374 
13375   // If the bitcasts change the element size, we can't extract an equivalent
13376   // element from it.
13377   MVT NewVT = V.getSimpleValueType();
13378   if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
13379     return SDValue();
13380 
13381   if (V.getOpcode() == ISD::BUILD_VECTOR ||
13382       (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
13383     // Ensure the scalar operand is the same size as the destination.
13384     // FIXME: Add support for scalar truncation where possible.
13385     SDValue S = V.getOperand(Idx);
13386     if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
13387       return DAG.getBitcast(EltVT, S);
13388   }
13389 
13390   return SDValue();
13391 }
13392 
13393 /// Helper to test for a load that can be folded with x86 shuffles.
13394 ///
13395 /// This is particularly important because the set of instructions varies
13396 /// significantly based on whether the operand is a load or not.
13397 static bool isShuffleFoldableLoad(SDValue V) {
13398   V = peekThroughBitcasts(V);
13399   return ISD::isNON_EXTLoad(V.getNode());
13400 }
13401 
13402 /// Try to lower insertion of a single element into a zero vector.
13403 ///
13404 /// This is a common pattern that we have especially efficient patterns to lower
13405 /// across all subtarget feature sets.
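///
/// For instance, a v4i32 shuffle whose mask is <4, Z, Z, Z> (with the Z lanes
/// zeroable) reduces to an X86ISD::VZEXT_MOVL of V2, i.e. "move the low
/// element and zero the rest".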
13406 static SDValue lowerShuffleAsElementInsertion(
13407     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13408     const APInt &Zeroable, const X86Subtarget &Subtarget,
13409     SelectionDAG &DAG) {
13410   MVT ExtVT = VT;
13411   MVT EltVT = VT.getVectorElementType();
13412 
13413   int V2Index =
13414       find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
13415       Mask.begin();
13416   bool IsV1Zeroable = true;
13417   for (int i = 0, Size = Mask.size(); i < Size; ++i)
13418     if (i != V2Index && !Zeroable[i]) {
13419       IsV1Zeroable = false;
13420       break;
13421     }
13422 
13423   // Check for a single input from a SCALAR_TO_VECTOR node.
13424   // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
13425   // all the smarts here sunk into that routine. However, the current
13426   // lowering of BUILD_VECTOR makes that nearly impossible until the old
13427   // vector shuffle lowering is dead.
13428   SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
13429                                                DAG);
13430   if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
13431     // We need to zext the scalar if it is smaller than an i32.
13432     V2S = DAG.getBitcast(EltVT, V2S);
13433     if (EltVT == MVT::i8 || EltVT == MVT::i16) {
13434       // Using zext to expand a narrow element won't work for non-zero
13435       // insertions.
13436       if (!IsV1Zeroable)
13437         return SDValue();
13438 
13439       // Zero-extend directly to i32.
13440       ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
13441       V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
13442     }
13443     V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
13444   } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
13445              EltVT == MVT::i16) {
13446     // Either not inserting from the low element of the input or the input
13447     // element size is too small to use VZEXT_MOVL to clear the high bits.
13448     return SDValue();
13449   }
13450 
13451   if (!IsV1Zeroable) {
13452     // If V1 can't be treated as a zero vector we have fewer options to lower
13453     // this. We can't support integer vectors or non-zero targets cheaply, and
13454     // the V1 elements can't be permuted in any way.
13455     assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
13456     if (!VT.isFloatingPoint() || V2Index != 0)
13457       return SDValue();
13458     SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
13459     V1Mask[V2Index] = -1;
13460     if (!isNoopShuffleMask(V1Mask))
13461       return SDValue();
13462     if (!VT.is128BitVector())
13463       return SDValue();
13464 
13465     // Otherwise, use MOVSD or MOVSS.
13466     assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
13467            "Only two types of floating point element types to handle!");
13468     return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
13469                        ExtVT, V1, V2);
13470   }
13471 
13472   // This lowering only works for the low element with floating point vectors.
13473   if (VT.isFloatingPoint() && V2Index != 0)
13474     return SDValue();
13475 
13476   V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
13477   if (ExtVT != VT)
13478     V2 = DAG.getBitcast(VT, V2);
13479 
13480   if (V2Index != 0) {
13481     // If we have 4 or fewer lanes we can cheaply shuffle the element into
13482     // the desired position. Otherwise it is more efficient to do a vector
13483     // shift left. We know that we can do a vector shift left because all
13484     // the inputs are zero.
13485     if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
13486       SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
13487       V2Shuffle[V2Index] = 0;
13488       V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
13489     } else {
13490       V2 = DAG.getBitcast(MVT::v16i8, V2);
13491       V2 = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
13492                        DAG.getTargetConstant(
13493                            V2Index * EltVT.getSizeInBits() / 8, DL, MVT::i8));
13494       V2 = DAG.getBitcast(VT, V2);
13495     }
13496   }
13497   return V2;
13498 }
13499 
13500 /// Try to lower broadcast of a single - truncated - integer element,
13501 /// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
13502 ///
13503 /// This assumes we have AVX2.
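///
/// For example, broadcasting 32-bit element 1 of a v8i32 view of a
/// BUILD_VECTOR of i64 scalars shifts the first i64 operand right by 32 bits,
/// truncates it to i32 and broadcasts the result with VPBROADCASTD.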
13504 static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0,
13505                                             int BroadcastIdx,
13506                                             const X86Subtarget &Subtarget,
13507                                             SelectionDAG &DAG) {
13508   assert(Subtarget.hasAVX2() &&
13509          "We can only lower integer broadcasts with AVX2!");
13510 
13511   MVT EltVT = VT.getVectorElementType();
13512   MVT V0VT = V0.getSimpleValueType();
13513 
13514   assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
13515   assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
13516 
13517   MVT V0EltVT = V0VT.getVectorElementType();
13518   if (!V0EltVT.isInteger())
13519     return SDValue();
13520 
13521   const unsigned EltSize = EltVT.getSizeInBits();
13522   const unsigned V0EltSize = V0EltVT.getSizeInBits();
13523 
13524   // This is only a truncation if the original element type is larger.
13525   if (V0EltSize <= EltSize)
13526     return SDValue();
13527 
13528   assert(((V0EltSize % EltSize) == 0) &&
13529          "Scalar type sizes must all be powers of 2 on x86!");
13530 
13531   const unsigned V0Opc = V0.getOpcode();
13532   const unsigned Scale = V0EltSize / EltSize;
13533   const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
13534 
13535   if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
13536       V0Opc != ISD::BUILD_VECTOR)
13537     return SDValue();
13538 
13539   SDValue Scalar = V0.getOperand(V0BroadcastIdx);
13540 
13541   // If we're extracting non-least-significant bits, shift so we can truncate.
13542   // Hopefully, we can fold away the trunc/srl/load into the broadcast.
13543   // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
13544   // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
13545   if (const int OffsetIdx = BroadcastIdx % Scale)
13546     Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
13547                          DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));
13548 
13549   return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
13550                      DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
13551 }
13552 
13553 /// Test whether this can be lowered with a single SHUFPS instruction.
13554 ///
13555 /// This is used to disable more specialized lowerings when the shufps lowering
13556 /// will happen to be efficient.
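///
/// For example, <0, 1, 4, 5> can be done with one SHUFPS (each half reads a
/// single input), whereas <0, 4, 1, 5> cannot.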
13557 static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
13558   // This routine only handles 128-bit shufps.
13559   assert(Mask.size() == 4 && "Unsupported mask size!");
13560   assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
13561   assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
13562   assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
13563   assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
13564 
13565   // To lower with a single SHUFPS we need to have the low half and high half
13566   // each requiring a single input.
13567   if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
13568     return false;
13569   if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
13570     return false;
13571 
13572   return true;
13573 }
13574 
13575 /// If we are extracting two 128-bit halves of a vector and shuffling the
13576 /// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
13577 /// multi-shuffle lowering.
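///
/// For example, shuffling (extract_subvector X, 0) with (extract_subvector
/// X, 4) of a v8f32 X using mask <0, 4, 2, 6> becomes a single wide shuffle of
/// X that can select to VPERMPS, instead of two extracts plus a blend.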
13578 static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
13579                                              SDValue N1, ArrayRef<int> Mask,
13580                                              SelectionDAG &DAG) {
13581   MVT VT = N0.getSimpleValueType();
13582   assert((VT.is128BitVector() &&
13583           (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
13584          "VPERM* family of shuffles requires 32-bit or 64-bit elements");
13585 
13586   // Check that both sources are extracts of the same source vector.
13587   if (!N0.hasOneUse() || !N1.hasOneUse() ||
13588       N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
13589       N1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
13590       N0.getOperand(0) != N1.getOperand(0))
13591     return SDValue();
13592 
13593   SDValue WideVec = N0.getOperand(0);
13594   MVT WideVT = WideVec.getSimpleValueType();
13595   if (!WideVT.is256BitVector())
13596     return SDValue();
13597 
13598   // Match extracts of each half of the wide source vector. Commute the shuffle
13599   // if the extract of the low half is N1.
13600   unsigned NumElts = VT.getVectorNumElements();
13601   SmallVector<int, 4> NewMask(Mask.begin(), Mask.end());
13602   const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);
13603   const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);
13604   if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
13605     ShuffleVectorSDNode::commuteMask(NewMask);
13606   else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
13607     return SDValue();
13608 
13609   // Final bailout: if the mask is simple, we are better off using an extract
13610   // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
13611   // because that avoids a constant load from memory.
13612   if (NumElts == 4 &&
13613       (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask)))
13614     return SDValue();
13615 
13616   // Extend the shuffle mask with undef elements.
13617   NewMask.append(NumElts, -1);
13618 
13619   // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
13620   SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
13621                                       NewMask);
13622   // This is free: ymm -> xmm.
13623   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
13624                      DAG.getIntPtrConstant(0, DL));
13625 }
13626 
13627 /// Try to lower broadcast of a single element.
13628 ///
13629 /// For convenience, this code also bundles all of the subtarget feature set
13630 /// filtering. While a little annoying to re-dispatch on type here, there isn't
13631 /// a convenient way to factor it out.
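///
/// For example, a v8f32 splat of element 0 becomes an X86ISD::VBROADCAST
/// (VBROADCASTSS); on AVX1 this only succeeds when the scalar can be folded
/// from a load, while AVX2 also permits broadcasting from a register.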
13632 static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
13633                                        SDValue V2, ArrayRef<int> Mask,
13634                                        const X86Subtarget &Subtarget,
13635                                        SelectionDAG &DAG) {
13636   if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
13637         (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
13638         (Subtarget.hasAVX2() && VT.isInteger())))
13639     return SDValue();
13640 
13641   // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
13642   // we can only broadcast from a register with AVX2.
13643   unsigned NumEltBits = VT.getScalarSizeInBits();
13644   unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
13645                         ? X86ISD::MOVDDUP
13646                         : X86ISD::VBROADCAST;
13647   bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
13648 
13649   // Check that the mask is a broadcast.
13650   int BroadcastIdx = getSplatIndex(Mask);
13651   if (BroadcastIdx < 0)
13652     return SDValue();
13653   assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
13654                                             "a sorted mask where the broadcast "
13655                                             "comes from V1.");
13656 
13657   // Go up the chain of (vector) values to find a scalar load that we can
13658   // combine with the broadcast.
13659   // TODO: Combine this logic with findEltLoadSrc() used by
13660   //       EltsFromConsecutiveLoads().
13661   int BitOffset = BroadcastIdx * NumEltBits;
13662   SDValue V = V1;
13663   for (;;) {
13664     switch (V.getOpcode()) {
13665     case ISD::BITCAST: {
13666       V = V.getOperand(0);
13667       continue;
13668     }
13669     case ISD::CONCAT_VECTORS: {
13670       int OpBitWidth = V.getOperand(0).getValueSizeInBits();
13671       int OpIdx = BitOffset / OpBitWidth;
13672       V = V.getOperand(OpIdx);
13673       BitOffset %= OpBitWidth;
13674       continue;
13675     }
13676     case ISD::EXTRACT_SUBVECTOR: {
13677       // The extraction index adds to the existing offset.
13678       unsigned EltBitWidth = V.getScalarValueSizeInBits();
13679       unsigned Idx = V.getConstantOperandVal(1);
13680       unsigned BeginOffset = Idx * EltBitWidth;
13681       BitOffset += BeginOffset;
13682       V = V.getOperand(0);
13683       continue;
13684     }
13685     case ISD::INSERT_SUBVECTOR: {
13686       SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
13687       int EltBitWidth = VOuter.getScalarValueSizeInBits();
13688       int Idx = (int)V.getConstantOperandVal(2);
13689       int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
13690       int BeginOffset = Idx * EltBitWidth;
13691       int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
13692       if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
13693         BitOffset -= BeginOffset;
13694         V = VInner;
13695       } else {
13696         V = VOuter;
13697       }
13698       continue;
13699     }
13700     }
13701     break;
13702   }
13703   assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
13704   BroadcastIdx = BitOffset / NumEltBits;
13705 
13706   // Do we need to bitcast the source to retrieve the original broadcast index?
13707   bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
13708 
13709   // Check if this is a broadcast of a scalar. We special case lowering
13710   // for scalars so that we can more effectively fold with loads.
13711   // If the original value has a larger element type than the shuffle, the
13712   // broadcast element is in essence truncated. Make that explicit to ease
13713   // folding.
13714   if (BitCastSrc && VT.isInteger())
13715     if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
13716             DL, VT, V, BroadcastIdx, Subtarget, DAG))
13717       return TruncBroadcast;
13718 
13719   // Also check the simpler case, where we can directly reuse the scalar.
13720   if (!BitCastSrc &&
13721       ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
13722        (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
13723     V = V.getOperand(BroadcastIdx);
13724 
13725     // If we can't broadcast from a register, check that the input is a load.
13726     if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
13727       return SDValue();
13728   } else if (ISD::isNormalLoad(V.getNode()) &&
13729              cast<LoadSDNode>(V)->isSimple()) {
13730     // We do not check for one-use of the vector load because a broadcast load
13731     // is expected to be a win for code size, register pressure, and possibly
13732     // uops even if the original vector load is not eliminated.
13733 
13734     // Reduce the vector load and shuffle to a broadcasted scalar load.
13735     LoadSDNode *Ld = cast<LoadSDNode>(V);
13736     SDValue BaseAddr = Ld->getOperand(1);
13737     MVT SVT = VT.getScalarType();
13738     unsigned Offset = BroadcastIdx * SVT.getStoreSize();
13739     assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
13740     SDValue NewAddr =
13741         DAG.getMemBasePlusOffset(BaseAddr, TypeSize::Fixed(Offset), DL);
13742 
13743     // Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather
13744     // than MOVDDUP.
13745     // FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX?
13746     if (Opcode == X86ISD::VBROADCAST) {
13747       SDVTList Tys = DAG.getVTList(VT, MVT::Other);
13748       SDValue Ops[] = {Ld->getChain(), NewAddr};
13749       V = DAG.getMemIntrinsicNode(
13750           X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SVT,
13751           DAG.getMachineFunction().getMachineMemOperand(
13752               Ld->getMemOperand(), Offset, SVT.getStoreSize()));
13753       DAG.makeEquivalentMemoryOrdering(Ld, V);
13754       return DAG.getBitcast(VT, V);
13755     }
13756     assert(SVT == MVT::f64 && "Unexpected VT!");
13757     V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
13758                     DAG.getMachineFunction().getMachineMemOperand(
13759                         Ld->getMemOperand(), Offset, SVT.getStoreSize()));
13760     DAG.makeEquivalentMemoryOrdering(Ld, V);
13761   } else if (!BroadcastFromReg) {
13762     // We can't broadcast from a vector register.
13763     return SDValue();
13764   } else if (BitOffset != 0) {
13765     // We can only broadcast from the zero-element of a vector register,
13766     // but it can be advantageous to broadcast from the zero-element of a
13767     // subvector.
13768     if (!VT.is256BitVector() && !VT.is512BitVector())
13769       return SDValue();
13770 
13771     // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
13772     if (VT == MVT::v4f64 || VT == MVT::v4i64)
13773       return SDValue();
13774 
13775     // Only broadcast the zero-element of a 128-bit subvector.
13776     if ((BitOffset % 128) != 0)
13777       return SDValue();
13778 
13779     assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
13780            "Unexpected bit-offset");
13781     assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
13782            "Unexpected vector size");
13783     unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
13784     V = extract128BitVector(V, ExtractIdx, DAG, DL);
13785   }
13786 
13787   // On AVX we can use VBROADCAST directly for scalar sources.
13788   if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) {
13789     V = DAG.getBitcast(MVT::f64, V);
13790     if (Subtarget.hasAVX()) {
13791       V = DAG.getNode(X86ISD::VBROADCAST, DL, MVT::v2f64, V);
13792       return DAG.getBitcast(VT, V);
13793     }
13794     V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V);
13795   }
13796 
13797   // If this is a scalar, do the broadcast on this type and bitcast.
13798   if (!V.getValueType().isVector()) {
13799     assert(V.getScalarValueSizeInBits() == NumEltBits &&
13800            "Unexpected scalar size");
13801     MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(),
13802                                        VT.getVectorNumElements());
13803     return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
13804   }
13805 
13806   // We only support broadcasting from 128-bit vectors to minimize the
13807   // number of patterns we need to deal with in isel. So extract down to
13808   // 128-bits, removing as many bitcasts as possible.
13809   if (V.getValueSizeInBits() > 128)
13810     V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
13811 
13812   // Otherwise cast V to a vector with the same element type as VT, but
13813   // possibly narrower than VT. Then perform the broadcast.
13814   unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
13815   MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), NumSrcElts);
13816   return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(CastVT, V));
13817 }
13818 
13819 // Check for whether we can use INSERTPS to perform the shuffle. We only use
13820 // INSERTPS when the V1 elements are already in the correct locations
13821 // because otherwise we can just always use two SHUFPS instructions which
13822 // are much smaller to encode than a SHUFPS and an INSERTPS. We can also
13823 // perform INSERTPS if a single V1 element is out of place and all V2
13824 // elements are zeroable.
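// For example, the v4f32 mask <0, 1, 6, 3> keeps elements 0, 1 and 3 of V1 in
// place and inserts V2's element 2 into lane 2, giving an INSERTPS immediate
// of (2 << 6) | (2 << 4) == 0xA0.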
13825 static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2,
13826                                    unsigned &InsertPSMask,
13827                                    const APInt &Zeroable,
13828                                    ArrayRef<int> Mask, SelectionDAG &DAG) {
13829   assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
13830   assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
13831   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13832 
13833   // Attempt to match INSERTPS with one element from VA or VB being
13834   // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
13835   // are updated.
13836   auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
13837                              ArrayRef<int> CandidateMask) {
13838     unsigned ZMask = 0;
13839     int VADstIndex = -1;
13840     int VBDstIndex = -1;
13841     bool VAUsedInPlace = false;
13842 
13843     for (int i = 0; i < 4; ++i) {
13844       // Synthesize a zero mask from the zeroable elements (includes undefs).
13845       if (Zeroable[i]) {
13846         ZMask |= 1 << i;
13847         continue;
13848       }
13849 
13850       // Flag if we use any VA inputs in place.
13851       if (i == CandidateMask[i]) {
13852         VAUsedInPlace = true;
13853         continue;
13854       }
13855 
13856       // We can only insert a single non-zeroable element.
13857       if (VADstIndex >= 0 || VBDstIndex >= 0)
13858         return false;
13859 
13860       if (CandidateMask[i] < 4) {
13861         // VA input out of place for insertion.
13862         VADstIndex = i;
13863       } else {
13864         // VB input for insertion.
13865         VBDstIndex = i;
13866       }
13867     }
13868 
13869     // Don't bother if we have no (non-zeroable) element for insertion.
13870     if (VADstIndex < 0 && VBDstIndex < 0)
13871       return false;
13872 
13873     // Determine element insertion src/dst indices. The src index is from the
13874     // start of the inserted vector, not the start of the concatenated vector.
13875     unsigned VBSrcIndex = 0;
13876     if (VADstIndex >= 0) {
13877       // If we have a VA input out of place, we use VA as the V2 element
13878       // insertion and don't use the original V2 at all.
13879       VBSrcIndex = CandidateMask[VADstIndex];
13880       VBDstIndex = VADstIndex;
13881       VB = VA;
13882     } else {
13883       VBSrcIndex = CandidateMask[VBDstIndex] - 4;
13884     }
13885 
13886     // If no V1 inputs are used in place, then the result is created only from
13887     // the zero mask and the V2 insertion - so remove V1 dependency.
13888     if (!VAUsedInPlace)
13889       VA = DAG.getUNDEF(MVT::v4f32);
13890 
13891     // Update V1, V2 and InsertPSMask accordingly.
13892     V1 = VA;
13893     V2 = VB;
13894 
13895     // Insert the V2 element into the desired position.
13896     InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
13897     assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
13898     return true;
13899   };
13900 
13901   if (matchAsInsertPS(V1, V2, Mask))
13902     return true;
13903 
13904   // Commute and try again.
13905   SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
13906   ShuffleVectorSDNode::commuteMask(CommutedMask);
13907   if (matchAsInsertPS(V2, V1, CommutedMask))
13908     return true;
13909 
13910   return false;
13911 }
13912 
13913 static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
13914                                       ArrayRef<int> Mask, const APInt &Zeroable,
13915                                       SelectionDAG &DAG) {
13916   assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13917   assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13918 
13919   // Attempt to match the insertps pattern.
13920   unsigned InsertPSMask = 0;
13921   if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
13922     return SDValue();
13923 
13924   // Insert the V2 element into the desired position.
13925   return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
13926                      DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
13927 }
13928 
13929 /// Try to lower a shuffle as a permute of the inputs followed by an
13930 /// UNPCK instruction.
13931 ///
13932 /// This specifically targets cases where we end up with alternating between
13933 /// the two inputs, and so can permute them into something that feeds a single
13934 /// UNPCK instruction. Note that this routine only targets integer vectors
13935 /// because for floating point vectors we have a generalized SHUFPS lowering
13936 /// strategy that handles everything that doesn't *exactly* match an unpack,
13937 /// making this clever lowering unnecessary.
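///
/// For example, the v8i16 mask <0, 8, 2, 10, 4, 12, 6, 14> is handled by
/// permuting each input's even elements into its low half and then emitting a
/// single PUNPCKLWD of the two permuted inputs.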
13938 static SDValue lowerShuffleAsPermuteAndUnpack(
13939     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13940     const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13941   assert(!VT.isFloatingPoint() &&
13942          "This routine only supports integer vectors.");
13943   assert(VT.is128BitVector() &&
13944          "This routine only works on 128-bit vectors.");
13945   assert(!V2.isUndef() &&
13946          "This routine should only be used when blending two inputs.");
13947   assert(Mask.size() >= 2 && "Single element masks are invalid.");
13948 
13949   int Size = Mask.size();
13950 
13951   int NumLoInputs =
13952       count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
13953   int NumHiInputs =
13954       count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
13955 
13956   bool UnpackLo = NumLoInputs >= NumHiInputs;
13957 
13958   auto TryUnpack = [&](int ScalarSize, int Scale) {
13959     SmallVector<int, 16> V1Mask((unsigned)Size, -1);
13960     SmallVector<int, 16> V2Mask((unsigned)Size, -1);
13961 
13962     for (int i = 0; i < Size; ++i) {
13963       if (Mask[i] < 0)
13964         continue;
13965 
13966       // Each element of the unpack contains Scale elements from this mask.
13967       int UnpackIdx = i / Scale;
13968 
13969       // We only handle the case where V1 feeds the first slots of the unpack.
13970       // We rely on canonicalization to ensure this is the case.
13971       if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
13972         return SDValue();
13973 
13974       // Setup the mask for this input. The indexing is tricky as we have to
13975       // handle the unpack stride.
13976       SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
13977       VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
13978           Mask[i] % Size;
13979     }
13980 
13981     // If we will have to shuffle both inputs to use the unpack, check whether
13982     // we can just unpack first and shuffle the result. If so, skip this unpack.
13983     if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
13984         !isNoopShuffleMask(V2Mask))
13985       return SDValue();
13986 
13987     // Shuffle the inputs into place.
13988     V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
13989     V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
13990 
13991     // Cast the inputs to the type we will use to unpack them.
13992     MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
13993     V1 = DAG.getBitcast(UnpackVT, V1);
13994     V2 = DAG.getBitcast(UnpackVT, V2);
13995 
13996     // Unpack the inputs and cast the result back to the desired type.
13997     return DAG.getBitcast(
13998         VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
13999                         UnpackVT, V1, V2));
14000   };
14001 
14002   // We try each unpack from the largest to the smallest to try and find one
14003   // that fits this mask.
14004   int OrigScalarSize = VT.getScalarSizeInBits();
14005   for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
14006     if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
14007       return Unpack;
14008 
14009   // If we're shuffling with a zero vector then we're better off not doing
14010   // VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.
14011   if (ISD::isBuildVectorAllZeros(V1.getNode()) ||
14012       ISD::isBuildVectorAllZeros(V2.getNode()))
14013     return SDValue();
14014 
14015   // If none of the unpack-rooted lowerings worked (or were profitable) try an
14016   // initial unpack.
14017   if (NumLoInputs == 0 || NumHiInputs == 0) {
14018     assert((NumLoInputs > 0 || NumHiInputs > 0) &&
14019            "We have to have *some* inputs!");
14020     int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
14021 
14022     // FIXME: We could consider the total complexity of the permute of each
14023     // possible unpacking. Or at the least we should consider how many
14024     // half-crossings are created.
14025     // FIXME: We could consider commuting the unpacks.
14026 
14027     SmallVector<int, 32> PermMask((unsigned)Size, -1);
14028     for (int i = 0; i < Size; ++i) {
14029       if (Mask[i] < 0)
14030         continue;
14031 
14032       assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
14033 
14034       PermMask[i] =
14035           2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
14036     }
14037     return DAG.getVectorShuffle(
14038         VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
14039                             DL, VT, V1, V2),
14040         DAG.getUNDEF(VT), PermMask);
14041   }
14042 
14043   return SDValue();
14044 }
14045 
14046 /// Handle lowering of 2-lane 64-bit floating point shuffles.
14047 ///
14048 /// This is the basis function for the 2-lane 64-bit shuffles as we have full
14049 /// support for floating point shuffles but not integer shuffles. These
14050 /// instructions will incur a domain crossing penalty on some chips though so
14051 /// it is better to avoid lowering through this for integer vectors where
14052 /// possible.
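///
/// For example, a single-input mask of <1, 1> (when not already matched as a
/// broadcast) is encoded as immediate 3 and emitted as VPERMILPD on AVX or
/// SHUFPD otherwise.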
14053 static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14054                                  const APInt &Zeroable, SDValue V1, SDValue V2,
14055                                  const X86Subtarget &Subtarget,
14056                                  SelectionDAG &DAG) {
14057   assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
14058   assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
14059   assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
14060 
14061   if (V2.isUndef()) {
14062     // Check for being able to broadcast a single element.
14063     if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,
14064                                                     Mask, Subtarget, DAG))
14065       return Broadcast;
14066 
14067     // Straight shuffle of a single input vector. Simulate this by using the
14068     // single input as both of the "inputs" to this instruction.
14069     unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
14070 
14071     if (Subtarget.hasAVX()) {
14072       // If we have AVX, we can use VPERMILPD which will allow folding a load
14073       // into the shuffle.
14074       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
14075                          DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
14076     }
14077 
14078     return DAG.getNode(
14079         X86ISD::SHUFP, DL, MVT::v2f64,
14080         Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
14081         Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
14082         DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
14083   }
14084   assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
14085   assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
14086   assert(Mask[0] < 2 && "We sort V1 to be the first input.");
14087   assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
14088 
14089   if (Subtarget.hasAVX2())
14090     if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
14091       return Extract;
14092 
14093   // When loading a scalar and then shuffling it into a vector we can often do
14094   // the insertion cheaply.
14095   if (SDValue Insertion = lowerShuffleAsElementInsertion(
14096           DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
14097     return Insertion;
14098   // Try inverting the insertion since for v2 masks it is easy to do and we
14099   // can't reliably sort the mask one way or the other.
14100   int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
14101                         Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
14102   if (SDValue Insertion = lowerShuffleAsElementInsertion(
14103           DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
14104     return Insertion;
14105 
14106   // Try to use one of the special instruction patterns to handle two common
14107   // blend patterns if a zero-blend above didn't work.
14108   if (isShuffleEquivalent(Mask, {0, 3}, V1, V2) ||
14109       isShuffleEquivalent(Mask, {1, 3}, V1, V2))
14110     if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
14111       // We can either use a special instruction to load over the low double or
14112       // to move just the low double.
14113       return DAG.getNode(
14114           X86ISD::MOVSD, DL, MVT::v2f64, V2,
14115           DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
14116 
14117   if (Subtarget.hasSSE41())
14118     if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
14119                                             Zeroable, Subtarget, DAG))
14120       return Blend;
14121 
14122   // Use dedicated unpack instructions for masks that match their pattern.
14123   if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
14124     return V;
14125 
14126   unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
14127   return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
14128                      DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
14129 }
14130 
14131 /// Handle lowering of 2-lane 64-bit integer shuffles.
14132 ///
14133 /// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
14134 /// the integer unit to minimize domain crossing penalties. However, for blends
14135 /// it falls back to the floating point shuffle operation with appropriate bit
14136 /// casting.
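///
/// For example, a single-input mask of <1, 0> is bitcast to v4i32 and lowered
/// as a PSHUFD with the widened mask <2, 3, 0, 1>.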
14137 static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14138                                  const APInt &Zeroable, SDValue V1, SDValue V2,
14139                                  const X86Subtarget &Subtarget,
14140                                  SelectionDAG &DAG) {
14141   assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
14142   assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
14143   assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
14144 
14145   if (V2.isUndef()) {
14146     // Check for being able to broadcast a single element.
14147     if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,
14148                                                     Mask, Subtarget, DAG))
14149       return Broadcast;
14150 
14151     // Straight shuffle of a single input vector. For everything from SSE2
14152     // onward this has a single fast instruction with no scary immediates.
14153     // We have to map the mask as it is actually a v4i32 shuffle instruction.
14154     V1 = DAG.getBitcast(MVT::v4i32, V1);
14155     int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
14156                           Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
14157                           Mask[1] < 0 ? -1 : (Mask[1] * 2),
14158                           Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
14159     return DAG.getBitcast(
14160         MVT::v2i64,
14161         DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
14162                     getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
14163   }
14164   assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
14165   assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
14166   assert(Mask[0] < 2 && "We sort V1 to be the first input.");
14167   assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
14168 
14169   if (Subtarget.hasAVX2())
14170     if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
14171       return Extract;
14172 
14173   // Try to use shift instructions.
14174   if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
14175                                           Zeroable, Subtarget, DAG))
14176     return Shift;
14177 
14178   // When loading a scalar and then shuffling it into a vector we can often do
14179   // the insertion cheaply.
14180   if (SDValue Insertion = lowerShuffleAsElementInsertion(
14181           DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
14182     return Insertion;
14183   // Try inverting the insertion since for v2 masks it is easy to do and we
14184   // can't reliably sort the mask one way or the other.
14185   int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
14186   if (SDValue Insertion = lowerShuffleAsElementInsertion(
14187           DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
14188     return Insertion;
14189 
14190   // We have different paths for blend lowering, but they all must use the
14191   // *exact* same predicate.
14192   bool IsBlendSupported = Subtarget.hasSSE41();
14193   if (IsBlendSupported)
14194     if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
14195                                             Zeroable, Subtarget, DAG))
14196       return Blend;
14197 
14198   // Use dedicated unpack instructions for masks that match their pattern.
14199   if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
14200     return V;
14201 
14202   // Try to use byte rotation instructions.
14203   // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
14204   if (Subtarget.hasSSSE3()) {
14205     if (Subtarget.hasVLX())
14206       if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask,
14207                                                 Subtarget, DAG))
14208         return Rotate;
14209 
14210     if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
14211                                                   Subtarget, DAG))
14212       return Rotate;
14213   }
14214 
14215   // If we have direct support for blends, we should lower by decomposing into
14216   // a permute. That will be faster than the domain cross.
14217   if (IsBlendSupported)
14218     return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask,
14219                                                 Subtarget, DAG);
14220 
14221   // We implement this with SHUFPD which is pretty lame because it will likely
14222   // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
14223   // However, all the alternatives are still more cycles and newer chips don't
14224   // have this problem. It would be really nice if x86 had better shuffles here.
14225   V1 = DAG.getBitcast(MVT::v2f64, V1);
14226   V2 = DAG.getBitcast(MVT::v2f64, V2);
14227   return DAG.getBitcast(MVT::v2i64,
14228                         DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
14229 }
14230 
14231 /// Lower a vector shuffle using the SHUFPS instruction.
14232 ///
14233 /// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
14234 /// It makes no assumptions about whether this is the *best* lowering, it simply
14235 /// uses it.
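///
/// For example, for the mask <0, 1, 2, 7> the lone V2 element is adjacent to
/// an in-place V1 element, so the routine first blends V2[3] and V1[2] into a
/// temporary with one SHUFPS and then combines that temporary with V1 using a
/// second SHUFPS.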
14236 static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
14237                                       ArrayRef<int> Mask, SDValue V1,
14238                                       SDValue V2, SelectionDAG &DAG) {
14239   SDValue LowV = V1, HighV = V2;
14240   SmallVector<int, 4> NewMask(Mask.begin(), Mask.end());
14241   int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
14242 
14243   if (NumV2Elements == 1) {
14244     int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
14245 
14246     // Compute the index adjacent to V2Index and in the same half by toggling
14247     // the low bit.
14248     int V2AdjIndex = V2Index ^ 1;
14249 
14250     if (Mask[V2AdjIndex] < 0) {
14251       // Handles all the cases where we have a single V2 element and an undef.
14252       // This will only ever happen in the high lanes because we commute the
14253       // vector otherwise.
14254       if (V2Index < 2)
14255         std::swap(LowV, HighV);
14256       NewMask[V2Index] -= 4;
14257     } else {
14258       // Handle the case where the V2 element ends up adjacent to a V1 element.
14259       // To make this work, blend them together as the first step.
14260       int V1Index = V2AdjIndex;
14261       int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
14262       V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
14263                        getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
14264 
14265       // Now proceed to reconstruct the final blend as we have the necessary
14266       // high or low half formed.
14267       if (V2Index < 2) {
14268         LowV = V2;
14269         HighV = V1;
14270       } else {
14271         HighV = V2;
14272       }
14273       NewMask[V1Index] = 2; // We put the V1 element in V2[2].
14274       NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
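            // Illustrative example: for Mask = <0, 1, 2, 7>, V2Index == 3 and
            // V1Index == 2, so BlendMask == {3, 0, 2, 0} forms the temporary
            // V2' = <v2[3], v2[0], v1[2], v1[0]>, and the final SHUFP of
            // (V1, V2') with NewMask = <0, 1, 2, 0> yields
            // <v1[0], v1[1], v1[2], v2[3]> as required.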
14275     }
14276   } else if (NumV2Elements == 2) {
14277     if (Mask[0] < 4 && Mask[1] < 4) {
14278       // Handle the easy case where we have V1 in the low lanes and V2 in the
14279       // high lanes.
14280       NewMask[2] -= 4;
14281       NewMask[3] -= 4;
14282     } else if (Mask[2] < 4 && Mask[3] < 4) {
14283       // We also handle the reversed case because this utility may get called
14284       // when we detect a SHUFPS pattern but can't easily commute the shuffle to
14285       // arrange things in the right direction.
14286       NewMask[0] -= 4;
14287       NewMask[1] -= 4;
14288       HighV = V1;
14289       LowV = V2;
14290     } else {
14291       // We have a mixture of V1 and V2 in both low and high lanes. Rather than
14292       // trying to place elements directly, just blend them and set up the final
14293       // shuffle to place them.
14294 
14295       // The first two blend mask elements are for V1, the second two are for
14296       // V2.
14297       int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
14298                           Mask[2] < 4 ? Mask[2] : Mask[3],
14299                           (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
14300                           (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
14301       V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
14302                        getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
14303 
14304       // Now we do a normal shuffle of V1 by giving V1 as both operands to
14305       // a blend.
14306       LowV = HighV = V1;
14307       NewMask[0] = Mask[0] < 4 ? 0 : 2;
14308       NewMask[1] = Mask[0] < 4 ? 2 : 0;
14309       NewMask[2] = Mask[2] < 4 ? 1 : 3;
14310       NewMask[3] = Mask[2] < 4 ? 3 : 1;
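            // Illustrative example: for Mask = <0, 4, 2, 6>, BlendMask ==
            // {0, 2, 0, 2} gives V1' = <v1[0], v1[2], v2[0], v2[2]>, and the
            // final SHUFP of (V1', V1') with NewMask = <0, 2, 1, 3> yields
            // <v1[0], v2[0], v1[2], v2[2]> as required.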
14311     }
14312   } else if (NumV2Elements == 3) {
14313     // Ideally canonicalizeShuffleMaskWithCommute should have caught this, but
14314     // we can get here via other paths (e.g. repeated mask matching) where we
14315     // don't want to do another full round of lowerVECTOR_SHUFFLE.
14316     ShuffleVectorSDNode::commuteMask(NewMask);
14317     return lowerShuffleWithSHUFPS(DL, VT, NewMask, V2, V1, DAG);
14318   }
14319   return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
14320                      getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
14321 }
14322 
14323 /// Lower 4-lane 32-bit floating point shuffles.
14324 ///
14325 /// Uses instructions exclusively from the floating point unit to minimize
14326 /// domain crossing penalties, as these are sufficient to implement all v4f32
14327 /// shuffles.
14328 static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14329                                  const APInt &Zeroable, SDValue V1, SDValue V2,
14330                                  const X86Subtarget &Subtarget,
14331                                  SelectionDAG &DAG) {
14332   assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
14333   assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
14334   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
14335 
14336   int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
14337 
14338   if (NumV2Elements == 0) {
14339     // Check for being able to broadcast a single element.
14340     if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2,
14341                                                     Mask, Subtarget, DAG))
14342       return Broadcast;
14343 
14344     // Use even/odd duplicate instructions for masks that match their pattern.
14345     if (Subtarget.hasSSE3()) {
14346       if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
14347         return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
14348       if (isShuffleEquivalent(Mask, {1, 1, 3, 3}, V1, V2))
14349         return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
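            // (MOVSLDUP duplicates the even elements: <a,b,c,d> -> <a,a,c,c>;
            //  MOVSHDUP duplicates the odd elements: <a,b,c,d> -> <b,b,d,d>.)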
14350     }
14351 
14352     if (Subtarget.hasAVX()) {
14353       // If we have AVX, we can use VPERMILPS which will allow folding a load
14354       // into the shuffle.
14355       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
14356                          getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
14357     }
14358 
14359     // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
14360     // in SSE1 because otherwise they are widened to v2f64 and never get here.
14361     if (!Subtarget.hasSSE2()) {
14362       if (isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2))
14363         return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
14364       if (isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1, V2))
14365         return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
14366     }
14367 
14368     // Otherwise, use a straight shuffle of a single input vector. We pass the
14369     // input vector to both operands to simulate this with a SHUFPS.
14370     return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
14371                        getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
14372   }
14373 
14374   if (Subtarget.hasAVX2())
14375     if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
14376       return Extract;
14377 
14378   // There are special ways we can lower some single-element blends. However,
14379   // there are also custom lowerings for more complex single-element blends
14380   // below that we defer to if both this and BLENDPS fail to match. So restrict
14381   // this path to the case where the V2 input targets element 0 of the mask --
14382   // that is the fast case here.
14383   if (NumV2Elements == 1 && Mask[0] >= 4)
14384     if (SDValue V = lowerShuffleAsElementInsertion(
14385             DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
14386       return V;
14387 
14388   if (Subtarget.hasSSE41()) {
14389     if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
14390                                             Zeroable, Subtarget, DAG))
14391       return Blend;
14392 
14393     // Use INSERTPS if we can complete the shuffle efficiently.
14394     if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
14395       return V;
14396 
14397     if (!isSingleSHUFPSMask(Mask))
14398       if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,
14399                                                             V2, Mask, DAG))
14400         return BlendPerm;
14401   }
14402 
14403   // Use low/high mov instructions. These are only valid in SSE1 because
14404   // otherwise they are widened to v2f64 and never get here.
14405   if (!Subtarget.hasSSE2()) {
14406     if (isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2))
14407       return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
14408     if (isShuffleEquivalent(Mask, {2, 3, 6, 7}, V1, V2))
14409       return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
14410   }
14411 
14412   // Use dedicated unpack instructions for masks that match their pattern.
14413   if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
14414     return V;
14415 
14416   // Otherwise fall back to a SHUFPS lowering strategy.
14417   return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
14418 }
14419 
14420 /// Lower 4-lane i32 vector shuffles.
14421 ///
14422 /// We try to handle these with integer-domain shuffles where we can, but for
14423 /// blends we use the floating point domain blend instructions.
14424 static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14425                                  const APInt &Zeroable, SDValue V1, SDValue V2,
14426                                  const X86Subtarget &Subtarget,
14427                                  SelectionDAG &DAG) {
14428   assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
14429   assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
14430   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
14431 
14432   // Whenever we can lower this as a zext, that instruction is strictly faster
14433   // than any alternative. It also allows us to fold memory operands into the
14434   // shuffle in many cases.
14435   if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask,
14436                                                    Zeroable, Subtarget, DAG))
14437     return ZExt;
14438 
14439   int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
14440 
14441   if (NumV2Elements == 0) {
14442     // Try to use broadcast unless the mask only has one non-undef element.
14443     if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
14444       if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
14445                                                       Mask, Subtarget, DAG))
14446         return Broadcast;
14447     }
14448 
14449     // Straight shuffle of a single input vector. For everything from SSE2
14450     // onward this has a single fast instruction with no scary immediates.
14451     // We coerce the shuffle pattern to be compatible with UNPCK instructions
14452     // but we aren't actually going to use the UNPCK instruction because doing
14453     // so prevents folding a load into this instruction or making a copy.
14454     const int UnpackLoMask[] = {0, 0, 1, 1};
14455     const int UnpackHiMask[] = {2, 2, 3, 3};
14456     if (isShuffleEquivalent(Mask, {0, 0, 1, 1}, V1, V2))
14457       Mask = UnpackLoMask;
14458     else if (isShuffleEquivalent(Mask, {2, 2, 3, 3}, V1, V2))
14459       Mask = UnpackHiMask;
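          // E.g. a mask such as <0, -1, 1, 1> is treated as <0, 0, 1, 1> here, so
          // the PSHUFD immediate matches the UNPCKLDQ interleave pattern exactly.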
14460 
14461     return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
14462                        getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
14463   }
14464 
14465   if (Subtarget.hasAVX2())
14466     if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
14467       return Extract;
14468 
14469   // Try to use shift instructions.
14470   if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
14471                                           Zeroable, Subtarget, DAG))
14472     return Shift;
14473 
14474   // There are special ways we can lower some single-element blends.
14475   if (NumV2Elements == 1)
14476     if (SDValue V = lowerShuffleAsElementInsertion(
14477             DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
14478       return V;
14479 
14480   // We have different paths for blend lowering, but they all must use the
14481   // *exact* same predicate.
14482   bool IsBlendSupported = Subtarget.hasSSE41();
14483   if (IsBlendSupported)
14484     if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
14485                                             Zeroable, Subtarget, DAG))
14486       return Blend;
14487 
14488   if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
14489                                              Zeroable, Subtarget, DAG))
14490     return Masked;
14491 
14492   // Use dedicated unpack instructions for masks that match their pattern.
14493   if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
14494     return V;
14495 
14496   // Try to use byte rotation instructions.
14497   // It's more profitable for pre-SSSE3 targets to use shuffles/unpacks.
14498   if (Subtarget.hasSSSE3()) {
14499     if (Subtarget.hasVLX())
14500       if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
14501                                                 Subtarget, DAG))
14502         return Rotate;
14503 
14504     if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
14505                                                   Subtarget, DAG))
14506       return Rotate;
14507   }
14508 
14509   // Assume that a single SHUFPS is faster than an alternative sequence of
14510   // multiple instructions (even if the CPU has a domain penalty).
14511   // If some CPU is harmed by the domain switch, we can fix it in a later pass.
14512   if (!isSingleSHUFPSMask(Mask)) {
14513     // If we have direct support for blends, we should lower by decomposing into
14514     // a permute. That will be faster than the domain cross.
14515     if (IsBlendSupported)
14516       return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask,
14517                                                   Subtarget, DAG);
14518 
14519     // Try to lower by permuting the inputs into an unpack instruction.
14520     if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
14521                                                         Mask, Subtarget, DAG))
14522       return Unpack;
14523   }
14524 
14525   // We implement this with SHUFPS because it can blend from two vectors.
14526   // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
14527   // up the inputs, bypassing domain shift penalties that we would incur if we
14528   // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
14529   // relevant.
14530   SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
14531   SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
14532   SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
14533   return DAG.getBitcast(MVT::v4i32, ShufPS);
14534 }
14535 
14536 /// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
14537 /// shuffle lowering, and the most complex part.
14538 ///
14539 /// The lowering strategy is to try to form pairs of input lanes which are
14540 /// targeted at the same half of the final vector, and then use a dword shuffle
14541 /// to place them onto the right half, and finally unpack the paired lanes into
14542 /// their final position.
14543 ///
14544 /// The exact breakdown of how to form these dword pairs and align them on the
14545 /// correct sides is really tricky. See the comments within the function for
14546 /// more of the details.
14547 ///
14548 /// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
14549 /// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
14550 /// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
14551 /// vector, form the analogous 128-bit 8-element Mask.
14552 static SDValue lowerV8I16GeneralSingleInputShuffle(
14553     const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
14554     const X86Subtarget &Subtarget, SelectionDAG &DAG) {
14555   assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
14556   MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
14557 
14558   assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
14559   MutableArrayRef<int> LoMask = Mask.slice(0, 4);
14560   MutableArrayRef<int> HiMask = Mask.slice(4, 4);
14561 
14562   // Attempt to directly match PSHUFLW or PSHUFHW.
14563   if (isUndefOrInRange(LoMask, 0, 4) &&
14564       isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
14565     return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
14566                        getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
14567   }
14568   if (isUndefOrInRange(HiMask, 4, 8) &&
14569       isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
14570     for (int i = 0; i != 4; ++i)
14571       HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
14572     return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
14573                        getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
14574   }
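        // For example (illustrative), Mask = <2, 0, 1, 3, 4, 5, 6, 7> matches the
        // PSHUFLW case above, and Mask = <0, 1, 2, 3, 6, 4, 5, 7> matches the
        // PSHUFHW case (after 4 is subtracted from the high-half entries).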
14575 
14576   SmallVector<int, 4> LoInputs;
14577   copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
14578   array_pod_sort(LoInputs.begin(), LoInputs.end());
14579   LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
14580   SmallVector<int, 4> HiInputs;
14581   copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
14582   array_pod_sort(HiInputs.begin(), HiInputs.end());
14583   HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
14584   int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
14585   int NumHToL = LoInputs.size() - NumLToL;
14586   int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
14587   int NumHToH = HiInputs.size() - NumLToH;
14588   MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
14589   MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
14590   MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
14591   MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
14592 
14593   // If we are shuffling values from one half - check how many different DWORD
14594   // pairs we need to create. If only 1 or 2 then we can perform this as a
14595   // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
14596   auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
14597                                ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
14598     V = DAG.getNode(ShufWOp, DL, VT, V,
14599                     getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
14600     V = DAG.getBitcast(PSHUFDVT, V);
14601     V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
14602                     getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
14603     return DAG.getBitcast(VT, V);
14604   };
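        // For example (illustrative), a low-half-only mask <0, 1, 0, 1, 2, 3, 2, 3>
        // collects just the two DWORD pairs (0,1) and (2,3) below, so it lowers as
        // a (no-op) PSHUFLW of <0, 1, 2, 3> followed by a PSHUFD of <0, 0, 1, 1>.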
14605 
14606   if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
14607     int PSHUFDMask[4] = { -1, -1, -1, -1 };
14608     SmallVector<std::pair<int, int>, 4> DWordPairs;
14609     int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
14610 
14611     // Collect the different DWORD pairs.
14612     for (int DWord = 0; DWord != 4; ++DWord) {
14613       int M0 = Mask[2 * DWord + 0];
14614       int M1 = Mask[2 * DWord + 1];
14615       M0 = (M0 >= 0 ? M0 % 4 : M0);
14616       M1 = (M1 >= 0 ? M1 % 4 : M1);
14617       if (M0 < 0 && M1 < 0)
14618         continue;
14619 
14620       bool Match = false;
14621       for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
14622         auto &DWordPair = DWordPairs[j];
14623         if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
14624             (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
14625           DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
14626           DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
14627           PSHUFDMask[DWord] = DOffset + j;
14628           Match = true;
14629           break;
14630         }
14631       }
14632       if (!Match) {
14633         PSHUFDMask[DWord] = DOffset + DWordPairs.size();
14634         DWordPairs.push_back(std::make_pair(M0, M1));
14635       }
14636     }
14637 
14638     if (DWordPairs.size() <= 2) {
14639       DWordPairs.resize(2, std::make_pair(-1, -1));
14640       int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
14641                               DWordPairs[1].first, DWordPairs[1].second};
14642       if ((NumHToL + NumHToH) == 0)
14643         return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
14644       if ((NumLToL + NumLToH) == 0)
14645         return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
14646     }
14647   }
14648 
14649   // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
14650   // such inputs we can swap two of the dwords across the half mark and end up
14651   // with <=2 inputs to each half in each half. Once there, we can fall through
14652   // to the generic code below. For example:
14653   //
14654   // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
14655   // Mask:  [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
14656   //
14657   // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
14658   // and an existing 2-into-2 on the other half. In this case we may have to
14659   // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
14660   // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
14661   // Fortunately, we don't have to handle anything but a 2-into-2 pattern
14662   // because any other situation (including a 3-into-1 or 1-into-3 in the other
14663   // half than the one we target for fixing) will be fixed when we re-enter this
14664   // path. We will also combine any resulting sequence of PSHUFD instructions
14665   // into a single instruction. Here is an example of the tricky case:
14666   //
14667   // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
14668   // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
14669   //
14670   // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
14671   //
14672   // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
14673   // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
14674   //
14675   // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
14676   // Mask:  [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
14677   //
14678   // The result is fine to be handled by the generic logic.
14679   auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
14680                           ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
14681                           int AOffset, int BOffset) {
14682     assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
14683            "Must call this with A having 3 or 1 inputs from the A half.");
14684     assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
14685            "Must call this with B having 1 or 3 inputs from the B half.");
14686     assert(AToAInputs.size() + BToAInputs.size() == 4 &&
14687            "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
14688 
14689     bool ThreeAInputs = AToAInputs.size() == 3;
14690 
14691     // Compute the dword that holds only one of the three inputs in a half by
14692     // taking the sum of all four word indices in that half and subtracting the
14693     // sum of the actual three inputs. The difference is the remaining
14694     // (non-input) slot.
14695     int ADWord = 0, BDWord = 0;
14696     int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
14697     int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
14698     int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
14699     ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
14700     int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
14701     int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
14702     int TripleNonInputIdx =
14703         TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
14704     TripleDWord = TripleNonInputIdx / 2;
14705 
14706     // We use xor with one to compute the adjacent DWord to whichever one the
14707     // OneInput is in.
14708     OneInputDWord = (OneInput / 2) ^ 1;
14709 
14710     // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
14711     // and BToA inputs. If there is also such a problem with the BToB and AToB
14712     // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
14713     // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
14714     // is essential that we don't *create* a 3<-1 as then we might oscillate.
14715     if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
14716       // Compute how many inputs will be flipped by swapping these DWords. We
14717       // need to balance this to ensure we don't form a 3-1 shuffle in the
14718       // other half.
14720       int NumFlippedAToBInputs =
14721           std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
14722           std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
14723       int NumFlippedBToBInputs =
14724           std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
14725           std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
14726       if ((NumFlippedAToBInputs == 1 &&
14727            (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
14728           (NumFlippedBToBInputs == 1 &&
14729            (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
14730         // We choose whether to fix the A half or B half based on whether that
14731         // half has zero flipped inputs. At zero, we may not be able to fix it
14732         // with that half. We also bias towards fixing the B half because that
14733         // will more commonly be the high half, and we have to bias one way.
14734         auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
14735                                                        ArrayRef<int> Inputs) {
14736           int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
14737           bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
14738           // Determine whether the free index is in the flipped dword or the
14739           // unflipped dword based on where the pinned index is. We use this bit
14740           // in an xor to conditionally select the adjacent dword.
14741           int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
14742           bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
14743           if (IsFixIdxInput == IsFixFreeIdxInput)
14744             FixFreeIdx += 1;
14745           IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
14746           assert(IsFixIdxInput != IsFixFreeIdxInput &&
14747                  "We need to be changing the number of flipped inputs!");
14748           int PSHUFHalfMask[] = {0, 1, 2, 3};
14749           std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
14750           V = DAG.getNode(
14751               FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
14752               MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
14753               getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
14754 
14755           for (int &M : Mask)
14756             if (M >= 0 && M == FixIdx)
14757               M = FixFreeIdx;
14758             else if (M >= 0 && M == FixFreeIdx)
14759               M = FixIdx;
14760         };
14761         if (NumFlippedBToBInputs != 0) {
14762           int BPinnedIdx =
14763               BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
14764           FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
14765         } else {
14766           assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
14767           int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
14768           FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
14769         }
14770       }
14771     }
14772 
14773     int PSHUFDMask[] = {0, 1, 2, 3};
14774     PSHUFDMask[ADWord] = BDWord;
14775     PSHUFDMask[BDWord] = ADWord;
14776     V = DAG.getBitcast(
14777         VT,
14778         DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
14779                     getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14780 
14781     // Adjust the mask to match the new locations of A and B.
14782     for (int &M : Mask)
14783       if (M >= 0 && M/2 == ADWord)
14784         M = 2 * BDWord + M % 2;
14785       else if (M >= 0 && M/2 == BDWord)
14786         M = 2 * ADWord + M % 2;
14787 
14788     // Recurse back into this routine to re-compute state now that this isn't
14789     // a 3 and 1 problem.
14790     return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);
14791   };
14792   if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
14793     return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
14794   if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
14795     return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
14796 
14797   // At this point there are at most two inputs to the low and high halves from
14798   // each half. That means the inputs can always be grouped into dwords and
14799   // those dwords can then be moved to the correct half with a dword shuffle.
14800   // We use at most one low and one high word shuffle to collect these paired
14801   // inputs into dwords, and finally a dword shuffle to place them.
14802   int PSHUFLMask[4] = {-1, -1, -1, -1};
14803   int PSHUFHMask[4] = {-1, -1, -1, -1};
14804   int PSHUFDMask[4] = {-1, -1, -1, -1};
14805 
14806   // First fix the masks for all the inputs that are staying in their
14807   // original halves. This will then dictate the targets of the cross-half
14808   // shuffles.
14809   auto fixInPlaceInputs =
14810       [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
14811                     MutableArrayRef<int> SourceHalfMask,
14812                     MutableArrayRef<int> HalfMask, int HalfOffset) {
14813     if (InPlaceInputs.empty())
14814       return;
14815     if (InPlaceInputs.size() == 1) {
14816       SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
14817           InPlaceInputs[0] - HalfOffset;
14818       PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
14819       return;
14820     }
14821     if (IncomingInputs.empty()) {
14822       // Just fix all of the in place inputs.
14823       for (int Input : InPlaceInputs) {
14824         SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
14825         PSHUFDMask[Input / 2] = Input / 2;
14826       }
14827       return;
14828     }
14829 
14830     assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
14831     SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
14832         InPlaceInputs[0] - HalfOffset;
14833     // Put the second input next to the first so that they are packed into
14834     // a dword. We find the adjacent index by toggling the low bit.
14835     int AdjIndex = InPlaceInputs[0] ^ 1;
14836     SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
14837     std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
14838     PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
14839   };
14840   fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
14841   fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
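        // Illustrative example: with LToLInputs == {1, 3} and incoming HToL
        // inputs, fixInPlaceInputs records in PSHUFLMask that word 3 moves into
        // slot 0 (the slot adjacent to word 1, found by toggling the low bit) and
        // pins dword 0 in PSHUFDMask, packing both in-place inputs into one dword.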
14842 
14843   // Now gather the cross-half inputs and place them into a free dword of
14844   // their target half.
14845   // FIXME: This operation could almost certainly be simplified dramatically to
14846   // look more like the 3-1 fixing operation.
14847   auto moveInputsToRightHalf = [&PSHUFDMask](
14848       MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
14849       MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
14850       MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
14851       int DestOffset) {
14852     auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
14853       return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
14854     };
14855     auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
14856                                                int Word) {
14857       int LowWord = Word & ~1;
14858       int HighWord = Word | 1;
14859       return isWordClobbered(SourceHalfMask, LowWord) ||
14860              isWordClobbered(SourceHalfMask, HighWord);
14861     };
14862 
14863     if (IncomingInputs.empty())
14864       return;
14865 
14866     if (ExistingInputs.empty()) {
14867       // Map any dwords with inputs from them into the right half.
14868       for (int Input : IncomingInputs) {
14869         // If the source half mask maps over the inputs, turn those into
14870         // swaps and use the swapped lane.
14871         if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
14872           if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
14873             SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
14874                 Input - SourceOffset;
14875             // We have to swap the uses in our half mask in one sweep.
14876             for (int &M : HalfMask)
14877               if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
14878                 M = Input;
14879               else if (M == Input)
14880                 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
14881           } else {
14882             assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
14883                        Input - SourceOffset &&
14884                    "Previous placement doesn't match!");
14885           }
14886           // Note that this correctly re-maps both when we do a swap and when
14887           // we observe the other side of the swap above. We rely on that to
14888           // avoid swapping the members of the input list directly.
14889           Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
14890         }
14891 
14892         // Map the input's dword into the correct half.
14893         if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
14894           PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
14895         else
14896           assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
14897                      Input / 2 &&
14898                  "Previous placement doesn't match!");
14899       }
14900 
14901       // And just directly shift any other-half mask elements to be same-half
14902       // as we will have mirrored the dword containing the element into the
14903       // same position within that half.
14904       for (int &M : HalfMask)
14905         if (M >= SourceOffset && M < SourceOffset + 4) {
14906           M = M - SourceOffset + DestOffset;
14907           assert(M >= 0 && "This should never wrap below zero!");
14908         }
14909       return;
14910     }
14911 
14912     // Ensure we have the input in a viable dword of its current half. This
14913     // is particularly tricky because the original position may be clobbered
14914     // by inputs being moved and *staying* in that half.
14915     if (IncomingInputs.size() == 1) {
14916       if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
14917         int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
14918                          SourceOffset;
14919         SourceHalfMask[InputFixed - SourceOffset] =
14920             IncomingInputs[0] - SourceOffset;
14921         std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
14922                      InputFixed);
14923         IncomingInputs[0] = InputFixed;
14924       }
14925     } else if (IncomingInputs.size() == 2) {
14926       if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
14927           isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
14928         // We have two non-adjacent or clobbered inputs we need to extract from
14929         // the source half. To do this, we need to map them into some adjacent
14930         // dword slot in the source mask.
14931         int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
14932                               IncomingInputs[1] - SourceOffset};
14933 
14934         // If there is a free slot in the source half mask adjacent to one of
14935         // the inputs, place the other input in it. We use (Index XOR 1) to
14936         // compute an adjacent index.
14937         if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
14938             SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
14939           SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
14940           SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
14941           InputsFixed[1] = InputsFixed[0] ^ 1;
14942         } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
14943                    SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
14944           SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
14945           SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
14946           InputsFixed[0] = InputsFixed[1] ^ 1;
14947         } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
14948                    SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
14949           // The two inputs are in the same DWord but it is clobbered and the
14950           // adjacent DWord isn't used at all. Move both inputs to the free
14951           // slot.
14952           SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
14953           SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
14954           InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
14955           InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
14956         } else {
14957           // The only way we hit this point is if there is no clobbering
14958           // (because there are no off-half inputs to this half) and there is no
14959           // free slot adjacent to one of the inputs. In this case, we have to
14960           // swap an input with a non-input.
14961           for (int i = 0; i < 4; ++i)
14962             assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
14963                    "We can't handle any clobbers here!");
14964           assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
14965                  "Cannot have adjacent inputs here!");
14966 
14967           SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
14968           SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
14969 
14970           // We also have to update the final source mask in this case because
14971           // it may need to undo the above swap.
14972           for (int &M : FinalSourceHalfMask)
14973             if (M == (InputsFixed[0] ^ 1) + SourceOffset)
14974               M = InputsFixed[1] + SourceOffset;
14975             else if (M == InputsFixed[1] + SourceOffset)
14976               M = (InputsFixed[0] ^ 1) + SourceOffset;
14977 
14978           InputsFixed[1] = InputsFixed[0] ^ 1;
14979         }
14980 
14981         // Point everything at the fixed inputs.
14982         for (int &M : HalfMask)
14983           if (M == IncomingInputs[0])
14984             M = InputsFixed[0] + SourceOffset;
14985           else if (M == IncomingInputs[1])
14986             M = InputsFixed[1] + SourceOffset;
14987 
14988         IncomingInputs[0] = InputsFixed[0] + SourceOffset;
14989         IncomingInputs[1] = InputsFixed[1] + SourceOffset;
14990       }
14991     } else {
14992       llvm_unreachable("Unhandled input size!");
14993     }
14994 
14995     // Now hoist the DWord down to the right half.
14996     int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
14997     assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
14998     PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
14999     for (int &M : HalfMask)
15000       for (int Input : IncomingInputs)
15001         if (M == Input)
15002           M = FreeDWord * 2 + Input % 2;
15003   };
15004   moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
15005                         /*SourceOffset*/ 4, /*DestOffset*/ 0);
15006   moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
15007                         /*SourceOffset*/ 0, /*DestOffset*/ 4);
15008 
15009   // Now enact all the shuffles we've computed to move the inputs into their
15010   // target half.
15011   if (!isNoopShuffleMask(PSHUFLMask))
15012     V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
15013                     getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
15014   if (!isNoopShuffleMask(PSHUFHMask))
15015     V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
15016                     getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
15017   if (!isNoopShuffleMask(PSHUFDMask))
15018     V = DAG.getBitcast(
15019         VT,
15020         DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
15021                     getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
15022 
15023   // At this point, each half should contain all its inputs, and we can then
15024   // just shuffle them into their final position.
15025   assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
15026          "Failed to lift all the high half inputs to the low mask!");
15027   assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
15028          "Failed to lift all the low half inputs to the high mask!");
15029 
15030   // Do a half shuffle for the low mask.
15031   if (!isNoopShuffleMask(LoMask))
15032     V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
15033                     getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
15034 
15035   // Do a half shuffle with the high mask after shifting its values down.
15036   for (int &M : HiMask)
15037     if (M >= 0)
15038       M -= 4;
15039   if (!isNoopShuffleMask(HiMask))
15040     V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
15041                     getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
15042 
15043   return V;
15044 }
15045 
15046 /// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
15047 /// blend if only one input is used.
15048 static SDValue lowerShuffleAsBlendOfPSHUFBs(
15049     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15050     const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
15051   assert(!is128BitLaneCrossingShuffleMask(VT, Mask) &&
15052          "Lane crossing shuffle masks not supported");
15053 
15054   int NumBytes = VT.getSizeInBits() / 8;
15055   int Size = Mask.size();
15056   int Scale = NumBytes / Size;
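        // E.g. for a v8i16 shuffle NumBytes == 16 and Scale == 2, so each mask
        // element expands to two adjacent byte indices in the PSHUFB control masks.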
15057 
15058   SmallVector<SDValue, 64> V1Mask(NumBytes, DAG.getUNDEF(MVT::i8));
15059   SmallVector<SDValue, 64> V2Mask(NumBytes, DAG.getUNDEF(MVT::i8));
15060   V1InUse = false;
15061   V2InUse = false;
15062 
15063   for (int i = 0; i < NumBytes; ++i) {
15064     int M = Mask[i / Scale];
15065     if (M < 0)
15066       continue;
15067 
15068     const int ZeroMask = 0x80;
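          // PSHUFB zeroes a destination byte when bit 7 of its control byte is
          // set, so 0x80 acts as a "write zero" selector here.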
15069     int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
15070     int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
15071     if (Zeroable[i / Scale])
15072       V1Idx = V2Idx = ZeroMask;
15073 
15074     V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
15075     V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
15076     V1InUse |= (ZeroMask != V1Idx);
15077     V2InUse |= (ZeroMask != V2Idx);
15078   }
15079 
15080   MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes);
15081   if (V1InUse)
15082     V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1),
15083                      DAG.getBuildVector(ShufVT, DL, V1Mask));
15084   if (V2InUse)
15085     V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2),
15086                      DAG.getBuildVector(ShufVT, DL, V2Mask));
15087 
15088   // If we need shuffled inputs from both, blend the two.
15089   SDValue V;
15090   if (V1InUse && V2InUse)
15091     V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2);
15092   else
15093     V = V1InUse ? V1 : V2;
15094 
15095   // Cast the result back to the correct type.
15096   return DAG.getBitcast(VT, V);
15097 }
15098 
15099 /// Generic lowering of 8-lane i16 shuffles.
15100 ///
15101 /// This handles both single-input shuffles and combined shuffle/blends with
15102 /// two inputs. The single input shuffles are immediately delegated to
15103 /// a dedicated lowering routine.
15104 ///
15105 /// The blends are lowered in one of three fundamental ways. If there are few
15106 /// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
15107 /// of the input is significantly cheaper when lowered as an interleaving of
15108 /// the two inputs, try to interleave them. Otherwise, blend the low and high
15109 /// halves of the inputs separately (making them have relatively few inputs)
15110 /// and then concatenate them.
15111 static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15112                                  const APInt &Zeroable, SDValue V1, SDValue V2,
15113                                  const X86Subtarget &Subtarget,
15114                                  SelectionDAG &DAG) {
15115   assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
15116   assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
15117   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
15118 
15119   // Whenever we can lower this as a zext, that instruction is strictly faster
15120   // than any alternative.
15121   if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask,
15122                                                    Zeroable, Subtarget, DAG))
15123     return ZExt;
15124 
15125   // Try to lower using a truncation.
15126   if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
15127                                         Subtarget, DAG))
15128     return V;
15129 
15130   int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
15131 
15132   if (NumV2Inputs == 0) {
15133     // Try to use shift instructions.
15134     if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
15135                                             Zeroable, Subtarget, DAG))
15136       return Shift;
15137 
15138     // Check for being able to broadcast a single element.
15139     if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
15140                                                     Mask, Subtarget, DAG))
15141       return Broadcast;
15142 
15143     // Try to use bit rotation instructions.
15144     if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,
15145                                                  Subtarget, DAG))
15146       return Rotate;
15147 
15148     // Use dedicated unpack instructions for masks that match their pattern.
15149     if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
15150       return V;
15151 
15152     // Use dedicated pack instructions for masks that match their pattern.
15153     if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
15154                                          Subtarget))
15155       return V;
15156 
15157     // Try to use byte rotation instructions.
15158     if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask,
15159                                                   Subtarget, DAG))
15160       return Rotate;
15161 
15162     // Make a copy of the mask so it can be modified.
15163     SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
15164     return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask,
15165                                                Subtarget, DAG);
15166   }
15167 
15168   assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
15169          "All single-input shuffles should be canonicalized to be V1-input "
15170          "shuffles.");
15171 
15172   // Try to use shift instructions.
15173   if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
15174                                           Zeroable, Subtarget, DAG))
15175     return Shift;
15176 
15177   // See if we can use SSE4A Extraction / Insertion.
15178   if (Subtarget.hasSSE4A())
15179     if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
15180                                           Zeroable, DAG))
15181       return V;
15182 
15183   // There are special ways we can lower some single-element blends.
15184   if (NumV2Inputs == 1)
15185     if (SDValue V = lowerShuffleAsElementInsertion(
15186             DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
15187       return V;
15188 
15189   // We have different paths for blend lowering, but they all must use the
15190   // *exact* same predicate.
15191   bool IsBlendSupported = Subtarget.hasSSE41();
15192   if (IsBlendSupported)
15193     if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
15194                                             Zeroable, Subtarget, DAG))
15195       return Blend;
15196 
15197   if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
15198                                              Zeroable, Subtarget, DAG))
15199     return Masked;
15200 
15201   // Use dedicated unpack instructions for masks that match their pattern.
15202   if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
15203     return V;
15204 
15205   // Use dedicated pack instructions for masks that match their pattern.
15206   if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
15207                                        Subtarget))
15208     return V;
15209 
15210   // Try to lower using a truncation.
15211   if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
15212                                        Subtarget, DAG))
15213     return V;
15214 
15215   // Try to use byte rotation instructions.
15216   if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
15217                                                 Subtarget, DAG))
15218     return Rotate;
15219 
15220   if (SDValue BitBlend =
15221           lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
15222     return BitBlend;
15223 
15224   // Try to use byte shift instructions to mask.
15225   if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask,
15226                                               Zeroable, Subtarget, DAG))
15227     return V;
15228 
15229   // Attempt to lower using compaction; SSE41 is necessary for PACKUSDW.
15230   // We could use SIGN_EXTEND_INREG+PACKSSDW for older targets but this seems to
15231   // be slower than a PSHUFLW+PSHUFHW+PSHUFD chain.
15232   int NumEvenDrops = canLowerByDroppingEvenElements(Mask, false);
15233   if ((NumEvenDrops == 1 || NumEvenDrops == 2) && Subtarget.hasSSE41() &&
15234       !Subtarget.hasVLX()) {
15235     SmallVector<SDValue, 8> DWordClearOps(4, DAG.getConstant(0, DL, MVT::i32));
15236     for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
15237       DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32);
15238     SDValue DWordClearMask = DAG.getBuildVector(MVT::v4i32, DL, DWordClearOps);
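          // For NumEvenDrops == 1 the mask keeps the low word of every dword, so
          // the PACKUS below collects the even-indexed words of V1 and V2. For
          // NumEvenDrops == 2 only words 0 and 4 of each input survive, and the
          // second PACKUS pass packs those down to every fourth element.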
15239     V1 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V1),
15240                      DWordClearMask);
15241     V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2),
15242                      DWordClearMask);
15243     // Now pack things back together.
15244     SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, V1, V2);
15245     if (NumEvenDrops == 2) {
15246       Result = DAG.getBitcast(MVT::v4i32, Result);
15247       Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, Result, Result);
15248     }
15249     return Result;
15250   }
15251 
15252   // Try to lower by permuting the inputs into an unpack instruction.
15253   if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
15254                                                       Mask, Subtarget, DAG))
15255     return Unpack;
15256 
15257   // If we can't directly blend but can use PSHUFB, that will be better as it
15258   // can both shuffle and set up the inefficient blend.
15259   if (!IsBlendSupported && Subtarget.hasSSSE3()) {
15260     bool V1InUse, V2InUse;
15261     return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
15262                                         Zeroable, DAG, V1InUse, V2InUse);
15263   }
15264 
15265   // We can always bit-blend if we have to so the fallback strategy is to
15266   // decompose into single-input permutes and blends/unpacks.
15267   return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2,
15268                                               Mask, Subtarget, DAG);
15269 }
15270 
15271 // Lowers unary/binary shuffle as VPERMV/VPERMV3. For non-VLX targets,
15272 // sub-512-bit shuffles are padded to 512-bits for the shuffle and then
15273 // the active subvector is extracted.
15274 static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
15275                                      ArrayRef<int> Mask, SDValue V1, SDValue V2,
15276                                      const X86Subtarget &Subtarget,
15277                                      SelectionDAG &DAG) {
15278   MVT MaskVT = VT.changeTypeToInteger();
15279   SDValue MaskNode;
15280   MVT ShuffleVT = VT;
15281   if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
15282     V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512);
15283     V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512);
15284     ShuffleVT = V1.getSimpleValueType();
15285 
15286     // Adjust mask to correct indices for the second input.
15287     int NumElts = VT.getVectorNumElements();
15288     unsigned Scale = 512 / VT.getSizeInBits();
15289     SmallVector<int, 32> AdjustedMask(Mask.begin(), Mask.end());
15290     for (int &M : AdjustedMask)
15291       if (NumElts <= M)
15292         M += (Scale - 1) * NumElts;
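          // E.g. for a 128-bit v8i16 shuffle widened to v32i16, Scale == 4, so a
          // mask entry 8 + k (element k of V2) becomes 32 + k, which selects
          // element k of the widened second operand in VPERMV3.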
15293     MaskNode = getConstVector(AdjustedMask, MaskVT, DAG, DL, true);
15294     MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512);
15295   } else {
15296     MaskNode = getConstVector(Mask, MaskVT, DAG, DL, true);
15297   }
15298 
15299   SDValue Result;
15300   if (V2.isUndef())
15301     Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1);
15302   else
15303     Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2);
15304 
15305   if (VT != ShuffleVT)
15306     Result = extractSubVector(Result, 0, DAG, DL, VT.getSizeInBits());
15307 
15308   return Result;
15309 }
15310 
15311 /// Generic lowering of v16i8 shuffles.
15312 ///
15313 /// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
15314 /// detect any complexity reducing interleaving. If that doesn't help, it uses
15315 /// UNPCK to spread the i8 elements across two i16-element vectors, and uses
15316 /// the existing lowering for v8i16 blends on each half, finally PACK-ing them
15317 /// back together.
15318 static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15319                                  const APInt &Zeroable, SDValue V1, SDValue V2,
15320                                  const X86Subtarget &Subtarget,
15321                                  SelectionDAG &DAG) {
15322   assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
15323   assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
15324   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
15325 
15326   // Try to use shift instructions.
15327   if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
15328                                           Zeroable, Subtarget, DAG))
15329     return Shift;
15330 
15331   // Try to use byte rotation instructions.
15332   if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
15333                                                 Subtarget, DAG))
15334     return Rotate;
15335 
15336   // Use dedicated pack instructions for masks that match their pattern.
15337   if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG,
15338                                        Subtarget))
15339     return V;
15340 
15341   // Try to use a zext lowering.
15342   if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
15343                                                    Zeroable, Subtarget, DAG))
15344     return ZExt;
15345 
15346   // Try to lower using a truncation.
15347   if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
15348                                         Subtarget, DAG))
15349     return V;
15350 
15351   if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
15352                                        Subtarget, DAG))
15353     return V;
15354 
15355   // See if we can use SSE4A Extraction / Insertion.
15356   if (Subtarget.hasSSE4A())
15357     if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
15358                                           Zeroable, DAG))
15359       return V;
15360 
15361   int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
15362 
15363   // For single-input shuffles, there are some nicer lowering tricks we can use.
15364   if (NumV2Elements == 0) {
15365     // Check for being able to broadcast a single element.
15366     if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
15367                                                     Mask, Subtarget, DAG))
15368       return Broadcast;
15369 
15370     // Try to use bit rotation instructions.
15371     if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,
15372                                                  Subtarget, DAG))
15373       return Rotate;
15374 
15375     if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
15376       return V;
15377 
15378     // Check whether we can widen this to an i16 shuffle by duplicating bytes.
15379     // Notably, this handles splat and partial-splat shuffles more efficiently.
15380     // However, it only makes sense if the pre-duplication shuffle simplifies
15381     // things significantly. Currently, this means we need to be able to
15382     // express the pre-duplication shuffle as an i16 shuffle.
15383     //
15384     // FIXME: We should check for other patterns which can be widened into an
15385     // i16 shuffle as well.
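    //
    // Illustrative example: the mask <0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7>
    // duplicates each of the low 8 bytes into a word; the pre-duplication i16
    // shuffle is the identity, the UNPCKL of V1 with itself produces exactly
    // this pattern, and the post-duplication i16 shuffle is the identity too.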
15386     auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
15387       for (int i = 0; i < 16; i += 2)
15388         if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
15389           return false;
15390 
15391       return true;
15392     };
15393     auto tryToWidenViaDuplication = [&]() -> SDValue {
15394       if (!canWidenViaDuplication(Mask))
15395         return SDValue();
15396       SmallVector<int, 4> LoInputs;
15397       copy_if(Mask, std::back_inserter(LoInputs),
15398               [](int M) { return M >= 0 && M < 8; });
15399       array_pod_sort(LoInputs.begin(), LoInputs.end());
15400       LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
15401                      LoInputs.end());
15402       SmallVector<int, 4> HiInputs;
15403       copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
15404       array_pod_sort(HiInputs.begin(), HiInputs.end());
15405       HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
15406                      HiInputs.end());
15407 
15408       bool TargetLo = LoInputs.size() >= HiInputs.size();
15409       ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
15410       ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
15411 
15412       int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
15413       SmallDenseMap<int, int, 8> LaneMap;
15414       for (int I : InPlaceInputs) {
15415         PreDupI16Shuffle[I / 2] = I / 2;
15416         LaneMap[I] = I;
15417       }
15418       int j = TargetLo ? 0 : 4, je = j + 4;
15419       for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
15420         // Check if j is already a shuffle of this input. This happens when
15421         // there are two adjacent bytes after we move the low one.
15422         if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
15423           // If we haven't yet mapped the input, search for a slot into which
15424           // we can map it.
15425           while (j < je && PreDupI16Shuffle[j] >= 0)
15426             ++j;
15427 
15428           if (j == je)
15429             // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
15430             return SDValue();
15431 
15432           // Map this input with the i16 shuffle.
15433           PreDupI16Shuffle[j] = MovingInputs[i] / 2;
15434         }
15435 
15436         // Update the lane map based on the mapping we ended up with.
15437         LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
15438       }
15439       V1 = DAG.getBitcast(
15440           MVT::v16i8,
15441           DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
15442                                DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
15443 
15444       // Unpack the bytes to form the i16s that will be shuffled into place.
15445       bool EvenInUse = false, OddInUse = false;
15446       for (int i = 0; i < 16; i += 2) {
15447         EvenInUse |= (Mask[i + 0] >= 0);
15448         OddInUse |= (Mask[i + 1] >= 0);
15449         if (EvenInUse && OddInUse)
15450           break;
15451       }
15452       V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
15453                        MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
15454                        OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));
15455 
15456       int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
15457       for (int i = 0; i < 16; ++i)
15458         if (Mask[i] >= 0) {
15459           int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
15460           assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
15461           if (PostDupI16Shuffle[i / 2] < 0)
15462             PostDupI16Shuffle[i / 2] = MappedMask;
15463           else
15464             assert(PostDupI16Shuffle[i / 2] == MappedMask &&
15465                    "Conflicting entries in the original shuffle!");
15466         }
15467       return DAG.getBitcast(
15468           MVT::v16i8,
15469           DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
15470                                DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
15471     };
15472     if (SDValue V = tryToWidenViaDuplication())
15473       return V;
15474   }
15475 
15476   if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
15477                                              Zeroable, Subtarget, DAG))
15478     return Masked;
15479 
15480   // Use dedicated unpack instructions for masks that match their pattern.
15481   if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
15482     return V;
15483 
15484   // Try to use byte shift instructions to mask.
15485   if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask,
15486                                               Zeroable, Subtarget, DAG))
15487     return V;
15488 
15489   // Check for compaction patterns.
15490   bool IsSingleInput = V2.isUndef();
15491   int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput);
15492 
15493   // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
15494   // with PSHUFB. It is important to do this before we attempt to generate any
15495   // blends but after all of the single-input lowerings. If the single input
15496   // lowerings can find an instruction sequence that is faster than a PSHUFB, we
15497   // want to preserve that and we can DAG combine any longer sequences into
15498   // a PSHUFB in the end. But once we start blending from multiple inputs,
15499   // the complexity of DAG combining bad patterns back into PSHUFB is too high,
15500   // and there are *very* few patterns that would actually be faster than the
15501   // PSHUFB approach because of its ability to zero lanes.
15502   //
15503   // If the mask is a binary compaction, we can more efficiently perform this
15504   // as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).
15505   //
15506   // FIXME: The only exceptions to the above are blends which are exact
15507   // interleavings with direct instructions supporting them. We currently don't
15508   // handle those well here.
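  //
  // For example (illustrative), the binary compaction <0,2,4,...,30> that
  // takes every other byte of both inputs is left to the NumEvenDrops path
  // below, which emits PACKUSWB(AND(V1,0x00FF),AND(V2,0x00FF)) instead of two
  // PSHUFBs feeding an unpack.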
15509   if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
15510     bool V1InUse = false;
15511     bool V2InUse = false;
15512 
15513     SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs(
15514         DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
15515 
15516     // If both V1 and V2 are in use and we can use a direct blend or an unpack,
15517     // do so. This avoids using them to handle blends-with-zero which is
15518     // important as a single pshufb is significantly faster for that.
15519     if (V1InUse && V2InUse) {
15520       if (Subtarget.hasSSE41())
15521         if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
15522                                                 Zeroable, Subtarget, DAG))
15523           return Blend;
15524 
15525       // We can use an unpack to do the blending rather than an or in some
15526       // cases. Even though the or may be marginally more efficient, we
15527       // prefer this lowering because there are common cases where part of
15528       // the complexity of the shuffles goes away when we do the final blend as
15529       // an unpack.
15530       // FIXME: It might be worth trying to detect if the unpack-feeding
15531       // shuffles will both be pshufb, in which case we shouldn't bother with
15532       // this.
15533       if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(
15534               DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
15535         return Unpack;
15536 
15537       // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
15538       if (Subtarget.hasVBMI())
15539         return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, Subtarget,
15540                                      DAG);
15541 
15542       // If we have XOP we can use one VPPERM instead of multiple PSHUFBs.
15543       if (Subtarget.hasXOP()) {
15544         SDValue MaskNode = getConstVector(Mask, MVT::v16i8, DAG, DL, true);
15545         return DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, V1, V2, MaskNode);
15546       }
15547 
15548       // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
15549       // PALIGNR will be cheaper than the second PSHUFB+OR.
15550       if (SDValue V = lowerShuffleAsByteRotateAndPermute(
15551               DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
15552         return V;
15553     }
15554 
15555     return PSHUFB;
15556   }
15557 
15558   // There are special ways we can lower some single-element blends.
15559   if (NumV2Elements == 1)
15560     if (SDValue V = lowerShuffleAsElementInsertion(
15561             DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
15562       return V;
15563 
15564   if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
15565     return Blend;
15566 
15567   // Check whether a compaction lowering can be done. This handles shuffles
15568   // which take every Nth element for some even N. See the helper function for
15569   // details.
15570   //
15571   // We special case these as they can be particularly efficiently handled with
15572   // the PACKUSWB instruction on x86 and they show up in common patterns of
15573   // rearranging bytes to truncate wide elements.
15574   if (NumEvenDrops) {
15575     // NumEvenDrops is the power of two stride of the elements. Another way of
15576     // thinking about it is that we need to drop the even elements this many
15577     // times to get the original input.
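    //
    // Concretely: NumEvenDrops == 1 keeps every 2nd byte and needs one PACKUS,
    // == 2 keeps every 4th byte and needs two, and == 3 keeps every 8th byte
    // and needs three (the initial pack plus NumEvenDrops - 1 in the loop
    // below).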
15578 
15579     // First we need to zero all the dropped bytes.
15580     assert(NumEvenDrops <= 3 &&
15581            "No support for dropping even elements more than 3 times.");
15582     SmallVector<SDValue, 8> WordClearOps(8, DAG.getConstant(0, DL, MVT::i16));
15583     for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
15584       WordClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i16);
15585     SDValue WordClearMask = DAG.getBuildVector(MVT::v8i16, DL, WordClearOps);
15586     V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V1),
15587                      WordClearMask);
15588     if (!IsSingleInput)
15589       V2 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V2),
15590                        WordClearMask);
15591 
15592     // Now pack things back together.
15593     SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
15594                                  IsSingleInput ? V1 : V2);
15595     for (int i = 1; i < NumEvenDrops; ++i) {
15596       Result = DAG.getBitcast(MVT::v8i16, Result);
15597       Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
15598     }
15599     return Result;
15600   }
15601 
15602   // Handle multi-input cases by blending/unpacking single-input shuffles.
15603   if (NumV2Elements > 0)
15604     return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask,
15605                                                 Subtarget, DAG);
15606 
15607   // The fallback path for single-input shuffles widens this into two v8i16
15608   // vectors with unpacks, shuffles those, and then pulls them back together
15609   // with a pack.
15610   SDValue V = V1;
15611 
15612   std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
15613   std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
15614   for (int i = 0; i < 16; ++i)
15615     if (Mask[i] >= 0)
15616       (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
15617 
15618   SDValue VLoHalf, VHiHalf;
15619   // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
15620   // them out and avoid using UNPCK{L,H} to extract the elements of V as
15621   // i16s.
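  //
  // For example, if the mask only pulls from even source bytes, byte 2*k is
  // the low byte of word k, so the AND with 0x00FF already yields the
  // zero-extended words and the blend mask indices simply halve.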
15622   if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
15623       none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
15624     // Use a mask to drop the high bytes.
15625     VLoHalf = DAG.getBitcast(MVT::v8i16, V);
15626     VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
15627                           DAG.getConstant(0x00FF, DL, MVT::v8i16));
15628 
15629     // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
15630     VHiHalf = DAG.getUNDEF(MVT::v8i16);
15631 
15632     // Squash the masks to point directly into VLoHalf.
15633     for (int &M : LoBlendMask)
15634       if (M >= 0)
15635         M /= 2;
15636     for (int &M : HiBlendMask)
15637       if (M >= 0)
15638         M /= 2;
15639   } else {
15640     // Otherwise just unpack the low half of V into VLoHalf and the high half into
15641     // VHiHalf so that we can blend them as i16s.
15642     SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
15643 
15644     VLoHalf = DAG.getBitcast(
15645         MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
15646     VHiHalf = DAG.getBitcast(
15647         MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
15648   }
15649 
15650   SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
15651   SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
15652 
15653   return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
15654 }
15655 
15656 /// Dispatching routine to lower various 128-bit x86 vector shuffles.
15657 ///
15658 /// This routine breaks down the specific type of 128-bit shuffle and
15659 /// dispatches to the lowering routines accordingly.
15660 static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
15661                                   MVT VT, SDValue V1, SDValue V2,
15662                                   const APInt &Zeroable,
15663                                   const X86Subtarget &Subtarget,
15664                                   SelectionDAG &DAG) {
15665   switch (VT.SimpleTy) {
15666   case MVT::v2i64:
15667     return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15668   case MVT::v2f64:
15669     return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15670   case MVT::v4i32:
15671     return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15672   case MVT::v4f32:
15673     return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15674   case MVT::v8i16:
15675     return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15676   case MVT::v16i8:
15677     return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15678 
15679   default:
15680     llvm_unreachable("Unimplemented!");
15681   }
15682 }
15683 
15684 /// Generic routine to split vector shuffle into half-sized shuffles.
15685 ///
15686 /// This routine just extracts two subvectors, shuffles them independently, and
15687 /// then concatenates them back together. This should work effectively with all
15688 /// AVX vector shuffle types.
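///
/// For example, a v8i32 shuffle is lowered as two v4i32 shuffles: each output
/// half is built as a blend of the extracted low/high halves of V1 and V2, and
/// the two results are rejoined with CONCAT_VECTORS.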
15689 static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
15690                                     SDValue V2, ArrayRef<int> Mask,
15691                                     SelectionDAG &DAG) {
15692   assert(VT.getSizeInBits() >= 256 &&
15693          "Only for 256-bit or wider vector shuffles!");
15694   assert(V1.getSimpleValueType() == VT && "Bad operand type!");
15695   assert(V2.getSimpleValueType() == VT && "Bad operand type!");
15696 
15697   ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
15698   ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
15699 
15700   int NumElements = VT.getVectorNumElements();
15701   int SplitNumElements = NumElements / 2;
15702   MVT ScalarVT = VT.getVectorElementType();
15703   MVT SplitVT = MVT::getVectorVT(ScalarVT, SplitNumElements);
15704 
15705   // Use splitVector/extractSubVector so that split build-vectors just build two
15706   // narrower build vectors. This helps shuffling with splats and zeros.
15707   auto SplitVector = [&](SDValue V) {
15708     SDValue LoV, HiV;
15709     std::tie(LoV, HiV) = splitVector(peekThroughBitcasts(V), DAG, DL);
15710     return std::make_pair(DAG.getBitcast(SplitVT, LoV),
15711                           DAG.getBitcast(SplitVT, HiV));
15712   };
15713 
15714   SDValue LoV1, HiV1, LoV2, HiV2;
15715   std::tie(LoV1, HiV1) = SplitVector(V1);
15716   std::tie(LoV2, HiV2) = SplitVector(V2);
15717 
15718   // Now create two 4-way blends of these half-width vectors.
15719   auto HalfBlend = [&](ArrayRef<int> HalfMask) {
15720     bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
15721     SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
15722     SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
15723     SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
15724     for (int i = 0; i < SplitNumElements; ++i) {
15725       int M = HalfMask[i];
15726       if (M >= NumElements) {
15727         if (M >= NumElements + SplitNumElements)
15728           UseHiV2 = true;
15729         else
15730           UseLoV2 = true;
15731         V2BlendMask[i] = M - NumElements;
15732         BlendMask[i] = SplitNumElements + i;
15733       } else if (M >= 0) {
15734         if (M >= SplitNumElements)
15735           UseHiV1 = true;
15736         else
15737           UseLoV1 = true;
15738         V1BlendMask[i] = M;
15739         BlendMask[i] = i;
15740       }
15741     }
15742 
15743     // Because the lowering happens after all combining takes place, we need to
15744     // manually combine these blend masks as much as possible so that we create
15745     // a minimal number of high-level vector shuffle nodes.
15746 
15747     // First try just blending the halves of V1 or V2.
15748     if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
15749       return DAG.getUNDEF(SplitVT);
15750     if (!UseLoV2 && !UseHiV2)
15751       return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
15752     if (!UseLoV1 && !UseHiV1)
15753       return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
15754 
15755     SDValue V1Blend, V2Blend;
15756     if (UseLoV1 && UseHiV1) {
15757       V1Blend =
15758         DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
15759     } else {
15760       // We only use half of V1 so map the usage down into the final blend mask.
15761       V1Blend = UseLoV1 ? LoV1 : HiV1;
15762       for (int i = 0; i < SplitNumElements; ++i)
15763         if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
15764           BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
15765     }
15766     if (UseLoV2 && UseHiV2) {
15767       V2Blend =
15768         DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
15769     } else {
15770       // We only use half of V2 so map the usage down into the final blend mask.
15771       V2Blend = UseLoV2 ? LoV2 : HiV2;
15772       for (int i = 0; i < SplitNumElements; ++i)
15773         if (BlendMask[i] >= SplitNumElements)
15774           BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
15775     }
15776     return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
15777   };
15778   SDValue Lo = HalfBlend(LoMask);
15779   SDValue Hi = HalfBlend(HiMask);
15780   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
15781 }
15782 
15783 /// Either split a vector in halves or decompose the shuffles and the
15784 /// blend/unpack.
15785 ///
15786 /// This is provided as a good fallback for many lowerings of non-single-input
15787 /// shuffles with more than one 128-bit lane. In those cases, we want to select
15788 /// between splitting the shuffle into 128-bit components and stitching those
15789 /// back together vs. extracting the single-input shuffles and blending those
15790 /// results.
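///
/// For example, if V1 only contributes elements from its low 128-bit lane and
/// V2 only from its high lane, splitting yields two cheap 128-bit shuffles,
/// whereas a mask that broadcasts one element from each input is better served
/// by the decompose-and-blend path.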
15791 static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
15792                                           SDValue V2, ArrayRef<int> Mask,
15793                                           const X86Subtarget &Subtarget,
15794                                           SelectionDAG &DAG) {
15795   assert(!V2.isUndef() && "This routine must not be used to lower single-input "
15796          "shuffles as it could then recurse on itself.");
15797   int Size = Mask.size();
15798 
15799   // If this can be modeled as a broadcast of two elements followed by a blend,
15800   // prefer that lowering. This is especially important because broadcasts can
15801   // often fold with memory operands.
15802   auto DoBothBroadcast = [&] {
15803     int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
15804     for (int M : Mask)
15805       if (M >= Size) {
15806         if (V2BroadcastIdx < 0)
15807           V2BroadcastIdx = M - Size;
15808         else if (M - Size != V2BroadcastIdx)
15809           return false;
15810       } else if (M >= 0) {
15811         if (V1BroadcastIdx < 0)
15812           V1BroadcastIdx = M;
15813         else if (M != V1BroadcastIdx)
15814           return false;
15815       }
15816     return true;
15817   };
15818   if (DoBothBroadcast())
15819     return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
15820                                                 DAG);
15821 
15822   // If the inputs all stem from a single 128-bit lane of each input, then we
15823   // split them rather than blending because the split will decompose to
15824   // unusually few instructions.
15825   int LaneCount = VT.getSizeInBits() / 128;
15826   int LaneSize = Size / LaneCount;
15827   SmallBitVector LaneInputs[2];
15828   LaneInputs[0].resize(LaneCount, false);
15829   LaneInputs[1].resize(LaneCount, false);
15830   for (int i = 0; i < Size; ++i)
15831     if (Mask[i] >= 0)
15832       LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
15833   if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
15834     return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
15835 
15836   // Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
15837   // requires that the decomposed single-input shuffles don't end up here.
15838   return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
15839                                               DAG);
15840 }
15841 
15842 // Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
15843 // TODO: Extend to support v8f32 (+ 512-bit shuffles).
15844 static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT,
15845                                                  SDValue V1, SDValue V2,
15846                                                  ArrayRef<int> Mask,
15847                                                  SelectionDAG &DAG) {
15848   assert(VT == MVT::v4f64 && "Only for v4f64 shuffles");
15849 
15850   int LHSMask[4] = {-1, -1, -1, -1};
15851   int RHSMask[4] = {-1, -1, -1, -1};
15852   unsigned SHUFPMask = 0;
15853 
15854   // As SHUFPD uses a single LHS/RHS element per lane, we can always
15855   // perform the shuffle once the lanes have been shuffled in place.
15856   for (int i = 0; i != 4; ++i) {
15857     int M = Mask[i];
15858     if (M < 0)
15859       continue;
15860     int LaneBase = i & ~1;
15861     auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
15862     LaneMask[LaneBase + (M & 1)] = M;
15863     SHUFPMask |= (M & 1) << i;
15864   }
15865 
15866   SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);
15867   SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);
15868   return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
15869                      DAG.getTargetConstant(SHUFPMask, DL, MVT::i8));
15870 }
15871 
15872 /// Lower a vector shuffle crossing multiple 128-bit lanes as
15873 /// a lane permutation followed by a per-lane permutation.
15874 ///
15875 /// This is mainly for cases where we can have non-repeating permutes
15876 /// in each lane.
15877 ///
15878 /// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask,
15879 /// we should investigate merging them.
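///
/// For example, reversing v8f32 (<7,6,5,4,3,2,1,0>) decomposes into a 128-bit
/// lane swap (<4,5,6,7,0,1,2,3>) followed by an in-lane reversal
/// (<3,2,1,0,7,6,5,4>).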
15880 static SDValue lowerShuffleAsLanePermuteAndPermute(
15881     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15882     SelectionDAG &DAG, const X86Subtarget &Subtarget) {
15883   int NumElts = VT.getVectorNumElements();
15884   int NumLanes = VT.getSizeInBits() / 128;
15885   int NumEltsPerLane = NumElts / NumLanes;
15886   bool CanUseSublanes = Subtarget.hasAVX2() && V2.isUndef();
15887 
15888   /// Attempts to find a sublane permute with the given size
15889   /// that gets all elements into their target lanes.
15890   ///
15891   /// If successful, returns the lowered shuffle (a cross-lane shuffle followed
15892   /// by an in-lane shuffle); if unsuccessful, returns an empty SDValue.
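  ///
  /// For example, v8i32 <0,1,6,7,2,3,4,5> cannot be solved with whole 128-bit
  /// lane moves, but a 64-bit sublane permute (vpermq-style <0,3,1,2>) places
  /// every element in its target lane, leaving an identity in-lane shuffle.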
15893   auto getSublanePermute = [&](int NumSublanes) -> SDValue {
15894     int NumSublanesPerLane = NumSublanes / NumLanes;
15895     int NumEltsPerSublane = NumElts / NumSublanes;
15896 
15897     SmallVector<int, 16> CrossLaneMask;
15898     SmallVector<int, 16> InLaneMask(NumElts, SM_SentinelUndef);
15899     // CrossLaneMask but one entry == one sublane.
15900     SmallVector<int, 16> CrossLaneMaskLarge(NumSublanes, SM_SentinelUndef);
15901 
15902     for (int i = 0; i != NumElts; ++i) {
15903       int M = Mask[i];
15904       if (M < 0)
15905         continue;
15906 
15907       int SrcSublane = M / NumEltsPerSublane;
15908       int DstLane = i / NumEltsPerLane;
15909 
15910       // We only need to get the elements into the right lane, not sublane.
15911       // So search all sublanes that make up the destination lane.
15912       bool Found = false;
15913       int DstSubStart = DstLane * NumSublanesPerLane;
15914       int DstSubEnd = DstSubStart + NumSublanesPerLane;
15915       for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) {
15916         if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane))
15917           continue;
15918 
15919         Found = true;
15920         CrossLaneMaskLarge[DstSublane] = SrcSublane;
15921         int DstSublaneOffset = DstSublane * NumEltsPerSublane;
15922         InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane;
15923         break;
15924       }
15925       if (!Found)
15926         return SDValue();
15927     }
15928 
15929     // Fill CrossLaneMask using CrossLaneMaskLarge.
15930     narrowShuffleMaskElts(NumEltsPerSublane, CrossLaneMaskLarge, CrossLaneMask);
15931 
15932     if (!CanUseSublanes) {
15933       // If we're only shuffling a single lowest lane and the rest are identity
15934       // then don't bother.
15935       // TODO - isShuffleMaskInputInPlace could be extended to something like
15936       // this.
15937       int NumIdentityLanes = 0;
15938       bool OnlyShuffleLowestLane = true;
15939       for (int i = 0; i != NumLanes; ++i) {
15940         int LaneOffset = i * NumEltsPerLane;
15941         if (isSequentialOrUndefInRange(InLaneMask, LaneOffset, NumEltsPerLane,
15942                                        i * NumEltsPerLane))
15943           NumIdentityLanes++;
15944         else if (CrossLaneMask[LaneOffset] != 0)
15945           OnlyShuffleLowestLane = false;
15946       }
15947       if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
15948         return SDValue();
15949     }
15950 
15951     SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);
15952     return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT),
15953                                 InLaneMask);
15954   };
15955 
15956   // First attempt a solution with full lanes.
15957   if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes))
15958     return V;
15959 
15960   // The rest of the solutions use sublanes.
15961   if (!CanUseSublanes)
15962     return SDValue();
15963 
15964   // Then attempt a solution with 64-bit sublanes (vpermq).
15965   if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes * 2))
15966     return V;
15967 
15968   // If that doesn't work and we have fast variable shuffle,
15969   // attempt 32-bit sublanes (vpermd).
15970   if (!Subtarget.hasFastVariableShuffle())
15971     return SDValue();
15972 
15973   return getSublanePermute(/*NumSublanes=*/NumLanes * 4);
15974 }
15975 
15976 /// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
15977 /// source with a lane permutation.
15978 ///
15979 /// This lowering strategy results in four instructions in the worst case for a
15980 /// single-input cross lane shuffle which is lower than any other fully general
15981 /// cross-lane shuffle strategy I'm aware of. Special cases for each particular
15982 /// shuffle pattern should be handled prior to trying this lowering.
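///
/// For example, a v8f32 shuffle that only needs a few elements from the
/// opposite lane first swaps the two 128-bit lanes of V1 once and then
/// performs a purely in-lane shuffle of V1 against the flipped copy.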
15983 static SDValue lowerShuffleAsLanePermuteAndShuffle(
15984     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15985     SelectionDAG &DAG, const X86Subtarget &Subtarget) {
15986   // FIXME: This should probably be generalized for 512-bit vectors as well.
15987   assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
15988   int Size = Mask.size();
15989   int LaneSize = Size / 2;
15990 
15991   // Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
15992   // Only do this if the elements aren't all from the lower lane,
15993   // otherwise we're (probably) better off doing a split.
15994   if (VT == MVT::v4f64 &&
15995       !all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
15996     if (SDValue V =
15997             lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG))
15998       return V;
15999 
16000   // If there are only inputs from one 128-bit lane, splitting will in fact be
16001   // less expensive. The flags track whether the given lane contains an element
16002   // that crosses to another lane.
16003   if (!Subtarget.hasAVX2()) {
16004     bool LaneCrossing[2] = {false, false};
16005     for (int i = 0; i < Size; ++i)
16006       if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
16007         LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
16008     if (!LaneCrossing[0] || !LaneCrossing[1])
16009       return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
16010   } else {
16011     bool LaneUsed[2] = {false, false};
16012     for (int i = 0; i < Size; ++i)
16013       if (Mask[i] >= 0)
16014         LaneUsed[(Mask[i] % Size) / LaneSize] = true;
16015     if (!LaneUsed[0] || !LaneUsed[1])
16016       return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
16017   }
16018 
16019   // TODO - we could support shuffling V2 in the Flipped input.
16020   assert(V2.isUndef() &&
16021          "This last part of this routine only works on single input shuffles");
16022 
16023   SmallVector<int, 32> InLaneMask(Mask.begin(), Mask.end());
16024   for (int i = 0; i < Size; ++i) {
16025     int &M = InLaneMask[i];
16026     if (M < 0)
16027       continue;
16028     if (((M % Size) / LaneSize) != (i / LaneSize))
16029       M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
16030   }
16031   assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
16032          "In-lane shuffle mask expected");
16033 
16034   // Flip the lanes, and shuffle the results which should now be in-lane.
16035   MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
16036   SDValue Flipped = DAG.getBitcast(PVT, V1);
16037   Flipped =
16038       DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1});
16039   Flipped = DAG.getBitcast(VT, Flipped);
16040   return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask);
16041 }
16042 
16043 /// Handle lowering 2-lane 128-bit shuffles.
16044 static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
16045                                   SDValue V2, ArrayRef<int> Mask,
16046                                   const APInt &Zeroable,
16047                                   const X86Subtarget &Subtarget,
16048                                   SelectionDAG &DAG) {
16049   // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
16050   if (Subtarget.hasAVX2() && V2.isUndef())
16051     return SDValue();
16052 
16053   bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
16054 
16055   SmallVector<int, 4> WidenedMask;
16056   if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
16057     return SDValue();
16058 
16059   bool IsLowZero = (Zeroable & 0x3) == 0x3;
16060   bool IsHighZero = (Zeroable & 0xc) == 0xc;
16061 
16062   // Try to use an insert into a zero vector.
16063   if (WidenedMask[0] == 0 && IsHighZero) {
16064     MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
16065     SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
16066                               DAG.getIntPtrConstant(0, DL));
16067     return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
16068                        getZeroVector(VT, Subtarget, DAG, DL), LoV,
16069                        DAG.getIntPtrConstant(0, DL));
16070   }
16071 
16072   // TODO: If minimizing size and one of the inputs is a zero vector and the
16073   // zero vector has only one use, we could use a VPERM2X128 to save the
16074   // instruction bytes needed to explicitly generate the zero vector.
16075 
16076   // Blends are faster and handle all the non-lane-crossing cases.
16077   if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
16078                                           Subtarget, DAG))
16079     return Blend;
16080 
16081   // If either input operand is a zero vector, use VPERM2X128 because its mask
16082   // allows us to replace the zero input with an implicit zero.
16083   if (!IsLowZero && !IsHighZero) {
16084     // Check for patterns which can be matched with a single insert of a 128-bit
16085     // subvector.
16086     bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2);
16087     if (OnlyUsesV1 || isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2)) {
16088 
16089       // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
16090       // this will likely become vinsertf128 which can't fold a 256-bit memop.
16091       if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
16092         MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
16093         SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
16094                                      OnlyUsesV1 ? V1 : V2,
16095                                      DAG.getIntPtrConstant(0, DL));
16096         return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
16097                            DAG.getIntPtrConstant(2, DL));
16098       }
16099     }
16100 
16101     // Try to use SHUF128 if possible.
16102     if (Subtarget.hasVLX()) {
16103       if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
16104         unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
16105                             ((WidenedMask[1] % 2) << 1);
16106         return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
16107                            DAG.getTargetConstant(PermMask, DL, MVT::i8));
16108       }
16109     }
16110   }
16111 
16112   // Otherwise form a 128-bit permutation. After accounting for undefs,
16113   // convert the 64-bit shuffle mask selection values into 128-bit
16114   // selection bits by dividing the indexes by 2 and shifting into positions
16115   // defined by a vperm2*128 instruction's immediate control byte.
16116 
16117   // The immediate permute control byte looks like this:
16118   //    [1:0] - select 128 bits from sources for low half of destination
16119   //    [2]   - ignore
16120   //    [3]   - zero low half of destination
16121   //    [5:4] - select 128 bits from sources for high half of destination
16122   //    [6]   - ignore
16123   //    [7]   - zero high half of destination
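  //
  // For example, a v4f64 mask <2,3,4,5> widens to <1,2>: the low half selects
  // the high 128 bits of V1 and the high half the low 128 bits of V2, giving
  // PermMask = (1 << 0) | (2 << 4) = 0x21.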
16124 
16125   assert((WidenedMask[0] >= 0 || IsLowZero) &&
16126          (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
16127 
16128   unsigned PermMask = 0;
16129   PermMask |= IsLowZero  ? 0x08 : (WidenedMask[0] << 0);
16130   PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
16131 
16132   // Check the immediate mask and replace unused sources with undef.
16133   if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
16134     V1 = DAG.getUNDEF(VT);
16135   if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
16136     V2 = DAG.getUNDEF(VT);
16137 
16138   return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
16139                      DAG.getTargetConstant(PermMask, DL, MVT::i8));
16140 }
16141 
16142 /// Lower a vector shuffle by first fixing the 128-bit lanes and then
16143 /// shuffling each lane.
16144 ///
16145 /// This attempts to create a repeated lane shuffle where each lane uses one
16146 /// or two of the lanes of the inputs. The lanes of the input vectors are
16147 /// shuffled in one or two independent shuffles to get the lanes into the
16148 /// position needed by the final shuffle.
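///
/// For example (illustrative), v8f32 <0,12,1,13,4,8,5,9> repeats the in-lane
/// pattern <0,8,1,9> in both lanes once V2's 128-bit lanes are swapped, so it
/// lowers as one lane permute of V2 plus a repeated per-lane unpack-style
/// shuffle.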
16149 static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
16150     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16151     const X86Subtarget &Subtarget, SelectionDAG &DAG) {
16152   assert(!V2.isUndef() && "This is only useful with multiple inputs.");
16153 
16154   if (is128BitLaneRepeatedShuffleMask(VT, Mask))
16155     return SDValue();
16156 
16157   int NumElts = Mask.size();
16158   int NumLanes = VT.getSizeInBits() / 128;
16159   int NumLaneElts = 128 / VT.getScalarSizeInBits();
16160   SmallVector<int, 16> RepeatMask(NumLaneElts, -1);
16161   SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});
16162 
16163   // First pass will try to fill in the RepeatMask from lanes that need two
16164   // sources.
16165   for (int Lane = 0; Lane != NumLanes; ++Lane) {
16166     int Srcs[2] = {-1, -1};
16167     SmallVector<int, 16> InLaneMask(NumLaneElts, -1);
16168     for (int i = 0; i != NumLaneElts; ++i) {
16169       int M = Mask[(Lane * NumLaneElts) + i];
16170       if (M < 0)
16171         continue;
16172       // Determine which of the possible input lanes (NumLanes from each source)
16173       // this element comes from. Assign that as one of the sources for this
16174       // lane. We can assign up to 2 sources for this lane. If we run out of
16175       // sources we can't do anything.
16176       int LaneSrc = M / NumLaneElts;
16177       int Src;
16178       if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
16179         Src = 0;
16180       else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
16181         Src = 1;
16182       else
16183         return SDValue();
16184 
16185       Srcs[Src] = LaneSrc;
16186       InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
16187     }
16188 
16189     // If this lane has two sources, see if it fits with the repeat mask so far.
16190     if (Srcs[1] < 0)
16191       continue;
16192 
16193     LaneSrcs[Lane][0] = Srcs[0];
16194     LaneSrcs[Lane][1] = Srcs[1];
16195 
16196     auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
16197       assert(M1.size() == M2.size() && "Unexpected mask size");
16198       for (int i = 0, e = M1.size(); i != e; ++i)
16199         if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
16200           return false;
16201       return true;
16202     };
16203 
16204     auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
16205       assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
16206       for (int i = 0, e = MergedMask.size(); i != e; ++i) {
16207         int M = Mask[i];
16208         if (M < 0)
16209           continue;
16210         assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
16211                "Unexpected mask element");
16212         MergedMask[i] = M;
16213       }
16214     };
16215 
16216     if (MatchMasks(InLaneMask, RepeatMask)) {
16217       // Merge this lane mask into the final repeat mask.
16218       MergeMasks(InLaneMask, RepeatMask);
16219       continue;
16220     }
16221 
16222     // Didn't find a match. Swap the operands and try again.
16223     std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
16224     ShuffleVectorSDNode::commuteMask(InLaneMask);
16225 
16226     if (MatchMasks(InLaneMask, RepeatMask)) {
16227       // Merge this lane mask into the final repeat mask.
16228       MergeMasks(InLaneMask, RepeatMask);
16229       continue;
16230     }
16231 
16232     // Couldn't find a match with the operands in either order.
16233     return SDValue();
16234   }
16235 
16236   // Now handle any lanes with only one source.
16237   for (int Lane = 0; Lane != NumLanes; ++Lane) {
16238     // If this lane has already been processed, skip it.
16239     if (LaneSrcs[Lane][0] >= 0)
16240       continue;
16241 
16242     for (int i = 0; i != NumLaneElts; ++i) {
16243       int M = Mask[(Lane * NumLaneElts) + i];
16244       if (M < 0)
16245         continue;
16246 
16247       // If RepeatMask isn't defined yet we can define it ourself.
16248       if (RepeatMask[i] < 0)
16249         RepeatMask[i] = M % NumLaneElts;
16250 
16251       if (RepeatMask[i] < NumElts) {
16252         if (RepeatMask[i] != M % NumLaneElts)
16253           return SDValue();
16254         LaneSrcs[Lane][0] = M / NumLaneElts;
16255       } else {
16256         if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
16257           return SDValue();
16258         LaneSrcs[Lane][1] = M / NumLaneElts;
16259       }
16260     }
16261 
16262     if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
16263       return SDValue();
16264   }
16265 
16266   SmallVector<int, 16> NewMask(NumElts, -1);
16267   for (int Lane = 0; Lane != NumLanes; ++Lane) {
16268     int Src = LaneSrcs[Lane][0];
16269     for (int i = 0; i != NumLaneElts; ++i) {
16270       int M = -1;
16271       if (Src >= 0)
16272         M = Src * NumLaneElts + i;
16273       NewMask[Lane * NumLaneElts + i] = M;
16274     }
16275   }
16276   SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
16277   // Ensure we didn't get back the shuffle we started with.
16278   // FIXME: This is a hack to make up for some splat handling code in
16279   // getVectorShuffle.
16280   if (isa<ShuffleVectorSDNode>(NewV1) &&
16281       cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
16282     return SDValue();
16283 
16284   for (int Lane = 0; Lane != NumLanes; ++Lane) {
16285     int Src = LaneSrcs[Lane][1];
16286     for (int i = 0; i != NumLaneElts; ++i) {
16287       int M = -1;
16288       if (Src >= 0)
16289         M = Src * NumLaneElts + i;
16290       NewMask[Lane * NumLaneElts + i] = M;
16291     }
16292   }
16293   SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
16294   // Ensure we didn't get back the shuffle we started with.
16295   // FIXME: This is a hack to make up for some splat handling code in
16296   // getVectorShuffle.
16297   if (isa<ShuffleVectorSDNode>(NewV2) &&
16298       cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
16299     return SDValue();
16300 
16301   for (int i = 0; i != NumElts; ++i) {
16302     NewMask[i] = RepeatMask[i % NumLaneElts];
16303     if (NewMask[i] < 0)
16304       continue;
16305 
16306     NewMask[i] += (i / NumLaneElts) * NumLaneElts;
16307   }
16308   return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
16309 }
16310 
16311 /// If the input shuffle mask results in a vector that is undefined in all upper
16312 /// or lower half elements and that mask accesses only 2 halves of the
16313 /// shuffle's operands, return true. A mask of half the width with mask indexes
16314 /// adjusted to access the extracted halves of the original shuffle operands is
16315 /// returned in HalfMask. HalfIdx1 and HalfIdx2 identify which of the four
16316 /// input halves (the lower or upper half of either operand) is accessed.
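///
/// For example, the v8f32 mask <0,12,1,13,u,u,u,u> has an undef upper half and
/// yields HalfIdx1 = 0 (lower half of V1), HalfIdx2 = 3 (upper half of V2) and
/// HalfMask = <0,4,1,5>.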
16317 static bool
16318 getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
16319                    int &HalfIdx1, int &HalfIdx2) {
16320   assert((Mask.size() == HalfMask.size() * 2) &&
16321          "Expected input mask to be twice as long as output");
16322 
16323   // Exactly one half of the result must be undef to allow narrowing.
16324   bool UndefLower = isUndefLowerHalf(Mask);
16325   bool UndefUpper = isUndefUpperHalf(Mask);
16326   if (UndefLower == UndefUpper)
16327     return false;
16328 
16329   unsigned HalfNumElts = HalfMask.size();
16330   unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
16331   HalfIdx1 = -1;
16332   HalfIdx2 = -1;
16333   for (unsigned i = 0; i != HalfNumElts; ++i) {
16334     int M = Mask[i + MaskIndexOffset];
16335     if (M < 0) {
16336       HalfMask[i] = M;
16337       continue;
16338     }
16339 
16340     // Determine which of the 4 half vectors this element is from.
16341     // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
16342     int HalfIdx = M / HalfNumElts;
16343 
16344     // Determine the element index into its half vector source.
16345     int HalfElt = M % HalfNumElts;
16346 
16347     // We can shuffle with up to 2 half vectors, set the new 'half'
16348     // shuffle mask accordingly.
16349     if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
16350       HalfMask[i] = HalfElt;
16351       HalfIdx1 = HalfIdx;
16352       continue;
16353     }
16354     if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
16355       HalfMask[i] = HalfElt + HalfNumElts;
16356       HalfIdx2 = HalfIdx;
16357       continue;
16358     }
16359 
16360     // Too many half vectors referenced.
16361     return false;
16362   }
16363 
16364   return true;
16365 }
16366 
16367 /// Given the output values from getHalfShuffleMask(), create a half width
16368 /// shuffle of extracted vectors followed by an insert back to full width.
16369 static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
16370                                      ArrayRef<int> HalfMask, int HalfIdx1,
16371                                      int HalfIdx2, bool UndefLower,
16372                                      SelectionDAG &DAG, bool UseConcat = false) {
16373   assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
16374   assert(V1.getValueType().isSimple() && "Expecting only simple types");
16375 
16376   MVT VT = V1.getSimpleValueType();
16377   MVT HalfVT = VT.getHalfNumVectorElementsVT();
16378   unsigned HalfNumElts = HalfVT.getVectorNumElements();
16379 
16380   auto getHalfVector = [&](int HalfIdx) {
16381     if (HalfIdx < 0)
16382       return DAG.getUNDEF(HalfVT);
16383     SDValue V = (HalfIdx < 2 ? V1 : V2);
16384     HalfIdx = (HalfIdx % 2) * HalfNumElts;
16385     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
16386                        DAG.getIntPtrConstant(HalfIdx, DL));
16387   };
16388 
16389   // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
16390   SDValue Half1 = getHalfVector(HalfIdx1);
16391   SDValue Half2 = getHalfVector(HalfIdx2);
16392   SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
16393   if (UseConcat) {
16394     SDValue Op0 = V;
16395     SDValue Op1 = DAG.getUNDEF(HalfVT);
16396     if (UndefLower)
16397       std::swap(Op0, Op1);
16398     return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1);
16399   }
16400 
16401   unsigned Offset = UndefLower ? HalfNumElts : 0;
16402   return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
16403                      DAG.getIntPtrConstant(Offset, DL));
16404 }
16405 
16406 /// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
16407 /// This allows for fast cases such as subvector extraction/insertion
16408 /// or shuffling smaller vector types which can lower more efficiently.
16409 static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
16410                                          SDValue V2, ArrayRef<int> Mask,
16411                                          const X86Subtarget &Subtarget,
16412                                          SelectionDAG &DAG) {
16413   assert((VT.is256BitVector() || VT.is512BitVector()) &&
16414          "Expected 256-bit or 512-bit vector");
16415 
16416   bool UndefLower = isUndefLowerHalf(Mask);
16417   if (!UndefLower && !isUndefUpperHalf(Mask))
16418     return SDValue();
16419 
16420   assert((!UndefLower || !isUndefUpperHalf(Mask)) &&
16421          "Completely undef shuffle mask should have been simplified already");
16422 
16423   // Upper half is undef and lower half is whole upper subvector.
16424   // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
16425   MVT HalfVT = VT.getHalfNumVectorElementsVT();
16426   unsigned HalfNumElts = HalfVT.getVectorNumElements();
16427   if (!UndefLower &&
16428       isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
16429     SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
16430                              DAG.getIntPtrConstant(HalfNumElts, DL));
16431     return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
16432                        DAG.getIntPtrConstant(0, DL));
16433   }
16434 
16435   // Lower half is undef and upper half is whole lower subvector.
16436   // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
16437   if (UndefLower &&
16438       isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
16439     SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
16440                              DAG.getIntPtrConstant(0, DL));
16441     return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
16442                        DAG.getIntPtrConstant(HalfNumElts, DL));
16443   }
16444 
16445   int HalfIdx1, HalfIdx2;
16446   SmallVector<int, 8> HalfMask(HalfNumElts);
16447   if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
16448     return SDValue();
16449 
16450   assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
16451 
16452   // Only shuffle the halves of the inputs when useful.
16453   unsigned NumLowerHalves =
16454       (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
16455   unsigned NumUpperHalves =
16456       (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
16457   assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");
16458 
16459   // Determine the larger pattern of undef/halves, then decide if it's worth
16460   // splitting the shuffle based on subtarget capabilities and types.
16461   unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
16462   if (!UndefLower) {
16463     // XXXXuuuu: no insert is needed.
16464     // Always extract lowers when setting lower - these are all free subreg ops.
16465     if (NumUpperHalves == 0)
16466       return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16467                                    UndefLower, DAG);
16468 
16469     if (NumUpperHalves == 1) {
16470       // AVX2 has efficient 32/64-bit element cross-lane shuffles.
16471       if (Subtarget.hasAVX2()) {
16472         // extract128 + vunpckhps/vshufps, is better than vblend + vpermps.
16473         if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
16474             !is128BitUnpackShuffleMask(HalfMask) &&
16475             (!isSingleSHUFPSMask(HalfMask) ||
16476              Subtarget.hasFastVariableShuffle()))
16477           return SDValue();
16478         // If this is a unary shuffle (assume that the 2nd operand is
16479         // canonicalized to undef), then we can use vpermpd. Otherwise, we
16480         // are better off extracting the upper half of 1 operand and using a
16481         // narrow shuffle.
16482         if (EltWidth == 64 && V2.isUndef())
16483           return SDValue();
16484       }
16485       // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
16486       if (Subtarget.hasAVX512() && VT.is512BitVector())
16487         return SDValue();
16488       // Extract + narrow shuffle is better than the wide alternative.
16489       return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16490                                    UndefLower, DAG);
16491     }
16492 
16493     // Don't extract both uppers; instead shuffle and then extract.
16494     assert(NumUpperHalves == 2 && "Half vector count went wrong");
16495     return SDValue();
16496   }
16497 
16498   // UndefLower - uuuuXXXX: an insert to high half is required if we split this.
16499   if (NumUpperHalves == 0) {
16500     // AVX2 has efficient 64-bit element cross-lane shuffles.
16501     // TODO: Refine to account for unary shuffle, splat, and other masks?
16502     if (Subtarget.hasAVX2() && EltWidth == 64)
16503       return SDValue();
16504     // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
16505     if (Subtarget.hasAVX512() && VT.is512BitVector())
16506       return SDValue();
16507     // Narrow shuffle + insert is better than the wide alternative.
16508     return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16509                                  UndefLower, DAG);
16510   }
16511 
16512   // NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.
16513   return SDValue();
16514 }
16515 
16516 /// Test whether the specified input (0 or 1) is in-place blended by the
16517 /// given mask.
16518 ///
16519 /// This returns true if the elements from a particular input are already in the
16520 /// slot required by the given mask and require no permutation.
16521 static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
16522   assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
16523   int Size = Mask.size();
16524   for (int i = 0; i < Size; ++i)
16525     if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
16526       return false;
16527 
16528   return true;
16529 }
16530 
16531 /// Handle case where shuffle sources are coming from the same 128-bit lane and
16532 /// every lane can be represented as the same repeating mask - allowing us to
16533 /// shuffle the sources with the repeating shuffle and then permute the result
16534 /// to the destination lanes.
16535 static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
16536     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16537     const X86Subtarget &Subtarget, SelectionDAG &DAG) {
16538   int NumElts = VT.getVectorNumElements();
16539   int NumLanes = VT.getSizeInBits() / 128;
16540   int NumLaneElts = NumElts / NumLanes;
16541 
16542   // On AVX2 we may be able to just shuffle the lowest elements and then
16543   // broadcast the result.
16544   if (Subtarget.hasAVX2()) {
16545     for (unsigned BroadcastSize : {16, 32, 64}) {
16546       if (BroadcastSize <= VT.getScalarSizeInBits())
16547         continue;
16548       int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
16549 
16550       // Attempt to match a repeating pattern every NumBroadcastElts
16551       // (accounting for UNDEFs) that only references the lowest 128-bit
16552       // lane of the inputs.
16553       auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
16554         for (int i = 0; i != NumElts; i += NumBroadcastElts)
16555           for (int j = 0; j != NumBroadcastElts; ++j) {
16556             int M = Mask[i + j];
16557             if (M < 0)
16558               continue;
16559             int &R = RepeatMask[j];
16560             if (0 != ((M % NumElts) / NumLaneElts))
16561               return false;
16562             if (0 <= R && R != M)
16563               return false;
16564             R = M;
16565           }
16566         return true;
16567       };
16568 
16569       SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
16570       if (!FindRepeatingBroadcastMask(RepeatMask))
16571         continue;
16572 
16573       // Shuffle the (lowest) repeated elements in place for broadcast.
16574       SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
16575 
16576       // Shuffle the actual broadcast.
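            // e.g. for v8i32 only BroadcastSize == 64 applies, so
            // NumBroadcastElts == 2 and the broadcast mask below becomes
            // <0,1,0,1,0,1,0,1>.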
16577       SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
16578       for (int i = 0; i != NumElts; i += NumBroadcastElts)
16579         for (int j = 0; j != NumBroadcastElts; ++j)
16580           BroadcastMask[i + j] = j;
16581       return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
16582                                   BroadcastMask);
16583     }
16584   }
16585 
16586   // Bail if the shuffle mask doesn't cross 128-bit lanes.
16587   if (!is128BitLaneCrossingShuffleMask(VT, Mask))
16588     return SDValue();
16589 
16590   // Bail if we already have a repeated lane shuffle mask.
16591   SmallVector<int, 8> RepeatedShuffleMask;
16592   if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
16593     return SDValue();
16594 
16595   // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
16596   // (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.
16597   int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
16598   int NumSubLanes = NumLanes * SubLaneScale;
16599   int NumSubLaneElts = NumLaneElts / SubLaneScale;
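        // e.g. for v8f32 on AVX2: NumLanes == 2, SubLaneScale == 2,
        // NumSubLanes == 4 and NumSubLaneElts == 2 (two floats per 64-bit
        // sub-lane).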
16600 
16601   // Check that all the sources are coming from the same lane and see if we can
16602   // form a repeating shuffle mask (local to each sub-lane). At the same time,
16603   // determine the source sub-lane for each destination sub-lane.
16604   int TopSrcSubLane = -1;
16605   SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
16606   SmallVector<int, 8> RepeatedSubLaneMasks[2] = {
16607       SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef),
16608       SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)};
16609 
16610   for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
16611     // Extract the sub-lane mask, check that it all comes from the same lane
16612     // and normalize the mask entries to come from the first lane.
16613     int SrcLane = -1;
16614     SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
16615     for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
16616       int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
16617       if (M < 0)
16618         continue;
16619       int Lane = (M % NumElts) / NumLaneElts;
16620       if ((0 <= SrcLane) && (SrcLane != Lane))
16621         return SDValue();
16622       SrcLane = Lane;
16623       int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
16624       SubLaneMask[Elt] = LocalM;
16625     }
16626 
16627     // Whole sub-lane is UNDEF.
16628     if (SrcLane < 0)
16629       continue;
16630 
16631     // Attempt to match against the candidate repeated sub-lane masks.
16632     for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
16633       auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
16634         for (int i = 0; i != NumSubLaneElts; ++i) {
16635           if (M1[i] < 0 || M2[i] < 0)
16636             continue;
16637           if (M1[i] != M2[i])
16638             return false;
16639         }
16640         return true;
16641       };
16642 
16643       auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
16644       if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
16645         continue;
16646 
16647       // Merge the sub-lane mask into the matching repeated sub-lane mask.
16648       for (int i = 0; i != NumSubLaneElts; ++i) {
16649         int M = SubLaneMask[i];
16650         if (M < 0)
16651           continue;
16652         assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
16653                "Unexpected mask element");
16654         RepeatedSubLaneMask[i] = M;
16655       }
16656 
16657       // Track the top most source sub-lane - by setting the remaining to UNDEF
16658       // we can greatly simplify shuffle matching.
16659       int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
16660       TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
16661       Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
16662       break;
16663     }
16664 
16665     // Bail if we failed to find a matching repeated sub-lane mask.
16666     if (Dst2SrcSubLanes[DstSubLane] < 0)
16667       return SDValue();
16668   }
16669   assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
16670          "Unexpected source lane");
16671 
16672   // Create a repeating shuffle mask for the entire vector.
16673   SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
16674   for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
16675     int Lane = SubLane / SubLaneScale;
16676     auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
16677     for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
16678       int M = RepeatedSubLaneMask[Elt];
16679       if (M < 0)
16680         continue;
16681       int Idx = (SubLane * NumSubLaneElts) + Elt;
16682       RepeatedMask[Idx] = M + (Lane * NumLaneElts);
16683     }
16684   }
16685   SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
16686 
16687   // Shuffle each source sub-lane to its destination.
16688   SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
16689   for (int i = 0; i != NumElts; i += NumSubLaneElts) {
16690     int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
16691     if (SrcSubLane < 0)
16692       continue;
16693     for (int j = 0; j != NumSubLaneElts; ++j)
16694       SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
16695   }
16696 
16697   return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
16698                               SubLaneMask);
16699 }
16700 
16701 static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
16702                                    bool &ForceV1Zero, bool &ForceV2Zero,
16703                                    unsigned &ShuffleImm, ArrayRef<int> Mask,
16704                                    const APInt &Zeroable) {
16705   int NumElts = VT.getVectorNumElements();
16706   assert(VT.getScalarSizeInBits() == 64 &&
16707          (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
16708          "Unexpected data type for VSHUFPD");
16709   assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&
16710          "Illegal shuffle mask");
16711 
16712   bool ZeroLane[2] = { true, true };
16713   for (int i = 0; i < NumElts; ++i)
16714     ZeroLane[i & 1] &= Zeroable[i];
16715 
16716   // Mask for V8F64: 0/1,  8/9,  2/3,  10/11, 4/5, ..
16717   // Mask for V4F64: 0/1,  4/5,  2/3,  6/7
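        // e.g. the v4f64 mask <1, 5, 2, 7> fits the pattern above and yields
        // ShuffleImm == 0b1011 (bit i is Mask[i] & 1).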
16718   ShuffleImm = 0;
16719   bool ShufpdMask = true;
16720   bool CommutableMask = true;
16721   for (int i = 0; i < NumElts; ++i) {
16722     if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])
16723       continue;
16724     if (Mask[i] < 0)
16725       return false;
16726     int Val = (i & 6) + NumElts * (i & 1);
16727     int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
16728     if (Mask[i] < Val || Mask[i] > Val + 1)
16729       ShufpdMask = false;
16730     if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
16731       CommutableMask = false;
16732     ShuffleImm |= (Mask[i] % 2) << i;
16733   }
16734 
16735   if (!ShufpdMask && !CommutableMask)
16736     return false;
16737 
16738   if (!ShufpdMask && CommutableMask)
16739     std::swap(V1, V2);
16740 
16741   ForceV1Zero = ZeroLane[0];
16742   ForceV2Zero = ZeroLane[1];
16743   return true;
16744 }
16745 
16746 static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1,
16747                                       SDValue V2, ArrayRef<int> Mask,
16748                                       const APInt &Zeroable,
16749                                       const X86Subtarget &Subtarget,
16750                                       SelectionDAG &DAG) {
16751   assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
16752          "Unexpected data type for VSHUFPD");
16753 
16754   unsigned Immediate = 0;
16755   bool ForceV1Zero = false, ForceV2Zero = false;
16756   if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate,
16757                               Mask, Zeroable))
16758     return SDValue();
16759 
16760   // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
16761   if (ForceV1Zero)
16762     V1 = getZeroVector(VT, Subtarget, DAG, DL);
16763   if (ForceV2Zero)
16764     V2 = getZeroVector(VT, Subtarget, DAG, DL);
16765 
16766   return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
16767                      DAG.getTargetConstant(Immediate, DL, MVT::i8));
16768 }
16769 
16770 // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
16771 // by zeroable elements in the remaining 24 elements. Turn this into two
16772 // vmovqb instructions shuffled together.
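      // Each VTRUNC keeps only the low byte of every 64-bit element, so
      // interleaving the two truncated vectors reproduces the requested bytes
      // followed by zeros.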
16773 static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
16774                                              SDValue V1, SDValue V2,
16775                                              ArrayRef<int> Mask,
16776                                              const APInt &Zeroable,
16777                                              SelectionDAG &DAG) {
16778   assert(VT == MVT::v32i8 && "Unexpected type!");
16779 
16780   // The first 8 indices should be every 8th element.
16781   if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8))
16782     return SDValue();
16783 
16784   // Remaining elements need to be zeroable.
16785   if (Zeroable.countLeadingOnes() < (Mask.size() - 8))
16786     return SDValue();
16787 
16788   V1 = DAG.getBitcast(MVT::v4i64, V1);
16789   V2 = DAG.getBitcast(MVT::v4i64, V2);
16790 
16791   V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
16792   V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);
16793 
16794   // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
16795   // the upper bits of the result using an unpckldq.
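        // The interleave below yields { low bytes of V1's qwords, low bytes of
        // V2's qwords, 8 zero bytes } in the lower 16 bytes of the result.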
16796   SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
16797                                         { 0, 1, 2, 3, 16, 17, 18, 19,
16798                                           4, 5, 6, 7, 20, 21, 22, 23 });
16799   // Insert the unpckldq into a zero vector to widen to v32i8.
16800   return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
16801                      DAG.getConstant(0, DL, MVT::v32i8), Unpack,
16802                      DAG.getIntPtrConstant(0, DL));
16803 }
16804 
16805 
16806 /// Handle lowering of 4-lane 64-bit floating point shuffles.
16807 ///
16808 /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
16809 /// isn't available.
16810 static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16811                                  const APInt &Zeroable, SDValue V1, SDValue V2,
16812                                  const X86Subtarget &Subtarget,
16813                                  SelectionDAG &DAG) {
16814   assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
16815   assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
16816   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
16817 
16818   if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
16819                                      Subtarget, DAG))
16820     return V;
16821 
16822   if (V2.isUndef()) {
16823     // Check for being able to broadcast a single element.
16824     if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
16825                                                     Mask, Subtarget, DAG))
16826       return Broadcast;
16827 
16828     // Use low duplicate instructions for masks that match their pattern.
16829     if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
16830       return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
16831 
16832     if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
16833       // Non-half-crossing single input shuffles can be lowered with an
16834       // interleaved permutation.
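            // e.g. <1, 0, 3, 2> (swap within each 128-bit lane) gives the
            // VPERMILPD immediate 0b0101.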
16835       unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
16836                               ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
16837       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
16838                          DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
16839     }
16840 
16841     // With AVX2 we have direct support for this permutation.
16842     if (Subtarget.hasAVX2())
16843       return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
16844                          getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
16845 
16846     // Try to create an in-lane repeating shuffle mask and then shuffle the
16847     // results into the target lanes.
16848     if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16849             DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16850       return V;
16851 
16852     // Try to permute the lanes and then use a per-lane permute.
16853     if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
16854                                                         Mask, DAG, Subtarget))
16855       return V;
16856 
16857     // Otherwise, fall back.
16858     return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,
16859                                                DAG, Subtarget);
16860   }
16861 
16862   // Use dedicated unpack instructions for masks that match their pattern.
16863   if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
16864     return V;
16865 
16866   if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
16867                                           Zeroable, Subtarget, DAG))
16868     return Blend;
16869 
16870   // Check if the blend happens to exactly fit that of SHUFPD.
16871   if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,
16872                                           Zeroable, Subtarget, DAG))
16873     return Op;
16874 
16875   // If we have lane crossing shuffles AND they don't all come from the lower
16876   // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
16877   // TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently
16878   // canonicalizes to a blend of a splat, which isn't necessary for this combine.
16879   if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
16880       !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
16881       (V1.getOpcode() != ISD::BUILD_VECTOR) &&
16882       (V2.getOpcode() != ISD::BUILD_VECTOR))
16883     if (SDValue Op = lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2,
16884                                                        Mask, DAG))
16885       return Op;
16886 
16887   // If we have one input in place, then we can permute the other input and
16888   // blend the result.
16889   if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask))
16890     return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
16891                                                 Subtarget, DAG);
16892 
16893   // Try to create an in-lane repeating shuffle mask and then shuffle the
16894   // results into the target lanes.
16895   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16896           DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16897     return V;
16898 
16899   // Try to simplify this by merging 128-bit lanes to enable a lane-based
16900   // shuffle. However, if we have AVX2 and either input is already in place,
16901   // we will be able to shuffle the other input across lanes in a single
16902   // instruction, so skip this pattern.
16903   if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
16904                                 isShuffleMaskInputInPlace(1, Mask))))
16905     if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask(
16906             DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16907       return V;
16908 
16909   // If we have VLX support, we can use VEXPAND.
16910   if (Subtarget.hasVLX())
16911     if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask, V1, V2,
16912                                          DAG, Subtarget))
16913       return V;
16914 
16915   // If we have AVX2 then we always want to lower with a blend because at v4 we
16916   // can fully permute the elements.
16917   if (Subtarget.hasAVX2())
16918     return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
16919                                                 Subtarget, DAG);
16920 
16921   // Otherwise fall back on generic lowering.
16922   return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask,
16923                                     Subtarget, DAG);
16924 }
16925 
16926 /// Handle lowering of 4-lane 64-bit integer shuffles.
16927 ///
16928 /// This routine is only called when we have AVX2 and thus a reasonable
16929 /// instruction set for v4i64 shuffling.
16930 static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16931                                  const APInt &Zeroable, SDValue V1, SDValue V2,
16932                                  const X86Subtarget &Subtarget,
16933                                  SelectionDAG &DAG) {
16934   assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
16935   assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
16936   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
16937   assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
16938 
16939   if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
16940                                      Subtarget, DAG))
16941     return V;
16942 
16943   if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
16944                                           Zeroable, Subtarget, DAG))
16945     return Blend;
16946 
16947   // Check for being able to broadcast a single element.
16948   if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
16949                                                   Subtarget, DAG))
16950     return Broadcast;
16951 
16952   if (V2.isUndef()) {
16953     // When the shuffle is mirrored between the 128-bit lanes, we can use
16954     // lower-latency instructions that will operate on both lanes.
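          // e.g. a v4i64 mask <1, 0, 3, 2> repeats as <1, 0> per lane, which
          // becomes the v8i32 PSHUFD mask <2, 3, 0, 1>.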
16955     SmallVector<int, 2> RepeatedMask;
16956     if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
16957       SmallVector<int, 4> PSHUFDMask;
16958       narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask);
16959       return DAG.getBitcast(
16960           MVT::v4i64,
16961           DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
16962                       DAG.getBitcast(MVT::v8i32, V1),
16963                       getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
16964     }
16965 
16966     // AVX2 provides a direct instruction for permuting a single input across
16967     // lanes.
16968     return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
16969                        getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
16970   }
16971 
16972   // Try to use shift instructions.
16973   if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
16974                                           Zeroable, Subtarget, DAG))
16975     return Shift;
16976 
16977   // If we have VLX support, we can use VALIGN or VEXPAND.
16978   if (Subtarget.hasVLX()) {
16979     if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask,
16980                                               Subtarget, DAG))
16981       return Rotate;
16982 
16983     if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask, V1, V2,
16984                                          DAG, Subtarget))
16985       return V;
16986   }
16987 
16988   // Try to use PALIGNR.
16989   if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
16990                                                 Subtarget, DAG))
16991     return Rotate;
16992 
16993   // Use dedicated unpack instructions for masks that match their pattern.
16994   if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
16995     return V;
16996 
16997   // If we have one input in place, then we can permute the other input and
16998   // blend the result.
16999   if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask))
17000     return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
17001                                                 Subtarget, DAG);
17002 
17003   // Try to create an in-lane repeating shuffle mask and then shuffle the
17004   // results into the target lanes.
17005   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17006           DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
17007     return V;
17008 
17009   // Try to simplify this by merging 128-bit lanes to enable a lane-based
17010   // shuffle. However, if we have AVX2 and either input is already in place,
17011   // we will be able to shuffle the other input across lanes in a single
17012   // instruction, so skip this pattern.
17013   if (!isShuffleMaskInputInPlace(0, Mask) &&
17014       !isShuffleMaskInputInPlace(1, Mask))
17015     if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17016             DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
17017       return Result;
17018 
17019   // Otherwise fall back on generic blend lowering.
17020   return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
17021                                               Subtarget, DAG);
17022 }
17023 
17024 /// Handle lowering of 8-lane 32-bit floating point shuffles.
17025 ///
17026 /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
17027 /// isn't available.
17028 static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17029                                  const APInt &Zeroable, SDValue V1, SDValue V2,
17030                                  const X86Subtarget &Subtarget,
17031                                  SelectionDAG &DAG) {
17032   assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
17033   assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
17034   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17035 
17036   if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
17037                                           Zeroable, Subtarget, DAG))
17038     return Blend;
17039 
17040   // Check for being able to broadcast a single element.
17041   if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
17042                                                   Subtarget, DAG))
17043     return Broadcast;
17044 
17045   // If the shuffle mask is repeated in each 128-bit lane, we have many more
17046   // options to efficiently lower the shuffle.
17047   SmallVector<int, 4> RepeatedMask;
17048   if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
17049     assert(RepeatedMask.size() == 4 &&
17050            "Repeated masks must be half the mask width!");
17051 
17052     // Use even/odd duplicate instructions for masks that match their pattern.
17053     if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
17054       return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
17055     if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
17056       return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
17057 
17058     if (V2.isUndef())
17059       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
17060                          getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17061 
17062     // Use dedicated unpack instructions for masks that match their pattern.
17063     if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
17064       return V;
17065 
17066     // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
17067     // have already handled any direct blends.
17068     return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
17069   }
17070 
17071   // Try to create an in-lane repeating shuffle mask and then shuffle the
17072   // results into the target lanes.
17073   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17074           DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
17075     return V;
17076 
17077   // If we have a single input shuffle with different shuffle patterns in the
17078   // two 128-bit lanes use the variable mask to VPERMILPS.
17079   if (V2.isUndef()) {
17080     if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) {
17081       SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
17082       return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
17083     }
17084     if (Subtarget.hasAVX2()) {
17085       SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
17086       return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
17087     }
17088     // Otherwise, fall back.
17089     return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
17090                                                DAG, Subtarget);
17091   }
17092 
17093   // Try to simplify this by merging 128-bit lanes to enable a lane-based
17094   // shuffle.
17095   if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17096           DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
17097     return Result;
17098 
17099   // If we have VLX support, we can use VEXPAND.
17100   if (Subtarget.hasVLX())
17101     if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask, V1, V2,
17102                                          DAG, Subtarget))
17103       return V;
17104 
17105   // For non-AVX512, if the mask is made of 16-bit elements within each lane,
17106   // try to split, since after splitting we get more efficient code using
17107   // vpunpcklwd and vpunpckhwd instead of vblend.
17108   if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))
17109     return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Subtarget,
17110                                       DAG);
17111 
17112   // If we have AVX2 then we always want to lower with a blend because at v8 we
17113   // can fully permute the elements.
17114   if (Subtarget.hasAVX2())
17115     return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8f32, V1, V2, Mask,
17116                                                 Subtarget, DAG);
17117 
17118   // Otherwise fall back on generic lowering.
17119   return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask,
17120                                     Subtarget, DAG);
17121 }
17122 
17123 /// Handle lowering of 8-lane 32-bit integer shuffles.
17124 ///
17125 /// This routine is only called when we have AVX2 and thus a reasonable
17126 /// instruction set for v8i32 shuffling.
17127 static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17128                                  const APInt &Zeroable, SDValue V1, SDValue V2,
17129                                  const X86Subtarget &Subtarget,
17130                                  SelectionDAG &DAG) {
17131   assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
17132   assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
17133   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17134   assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
17135 
17136   // Whenever we can lower this as a zext, that instruction is strictly faster
17137   // than any alternative. It also allows us to fold memory operands into the
17138   // shuffle in many cases.
17139   if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
17140                                                    Zeroable, Subtarget, DAG))
17141     return ZExt;
17142 
17143   // For non-AVX512, if the mask is made of 16-bit elements within each lane,
17144   // try to split, since after splitting we get more efficient code using
17145   // vpunpcklwd and vpunpckhwd instead of vblend.
17146   if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() &&
17147       !Subtarget.hasAVX512())
17148     return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Subtarget,
17149                                       DAG);
17150 
17151   if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
17152                                           Zeroable, Subtarget, DAG))
17153     return Blend;
17154 
17155   // Check for being able to broadcast a single element.
17156   if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
17157                                                   Subtarget, DAG))
17158     return Broadcast;
17159 
17160   // If the shuffle mask is repeated in each 128-bit lane we can use more
17161   // efficient instructions that mirror the shuffles across the two 128-bit
17162   // lanes.
17163   SmallVector<int, 4> RepeatedMask;
17164   bool Is128BitLaneRepeatedShuffle =
17165       is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
17166   if (Is128BitLaneRepeatedShuffle) {
17167     assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
17168     if (V2.isUndef())
17169       return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
17170                          getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17171 
17172     // Use dedicated unpack instructions for masks that match their pattern.
17173     if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
17174       return V;
17175   }
17176 
17177   // Try to use shift instructions.
17178   if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
17179                                           Zeroable, Subtarget, DAG))
17180     return Shift;
17181 
17182   // If we have VLX support, we can use VALIGN or EXPAND.
17183   if (Subtarget.hasVLX()) {
17184     if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask,
17185                                               Subtarget, DAG))
17186       return Rotate;
17187 
17188     if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask, V1, V2,
17189                                          DAG, Subtarget))
17190       return V;
17191   }
17192 
17193   // Try to use byte rotation instructions.
17194   if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,
17195                                                 Subtarget, DAG))
17196     return Rotate;
17197 
17198   // Try to create an in-lane repeating shuffle mask and then shuffle the
17199   // results into the target lanes.
17200   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17201           DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
17202     return V;
17203 
17204   if (V2.isUndef()) {
17205     // Try to produce a fixed cross-128-bit lane permute followed by unpack
17206     // because that should be faster than the variable permute alternatives.
17207     if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, Mask, V1, V2, DAG))
17208       return V;
17209 
17210     // If the shuffle patterns aren't repeated but it's a single input, directly
17211     // generate a cross-lane VPERMD instruction.
17212     SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
17213     return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
17214   }
17215 
17216   // Assume that a single SHUFPS is faster than an alternative sequence of
17217   // multiple instructions (even if the CPU has a domain penalty).
17218   // If some CPU is harmed by the domain switch, we can fix it in a later pass.
17219   if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
17220     SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
17221     SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
17222     SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
17223                                             CastV1, CastV2, DAG);
17224     return DAG.getBitcast(MVT::v8i32, ShufPS);
17225   }
17226 
17227   // Try to simplify this by merging 128-bit lanes to enable a lane-based
17228   // shuffle.
17229   if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17230           DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
17231     return Result;
17232 
17233   // Otherwise fall back on generic blend lowering.
17234   return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask,
17235                                               Subtarget, DAG);
17236 }
17237 
17238 /// Handle lowering of 16-lane 16-bit integer shuffles.
17239 ///
17240 /// This routine is only called when we have AVX2 and thus a reasonable
17241 /// instruction set for v16i16 shuffling.
17242 static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17243                                   const APInt &Zeroable, SDValue V1, SDValue V2,
17244                                   const X86Subtarget &Subtarget,
17245                                   SelectionDAG &DAG) {
17246   assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
17247   assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
17248   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
17249   assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
17250 
17251   // Whenever we can lower this as a zext, that instruction is strictly faster
17252   // than any alternative. It also allows us to fold memory operands into the
17253   // shuffle in many cases.
17254   if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17255           DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
17256     return ZExt;
17257 
17258   // Check for being able to broadcast a single element.
17259   if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,
17260                                                   Subtarget, DAG))
17261     return Broadcast;
17262 
17263   if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
17264                                           Zeroable, Subtarget, DAG))
17265     return Blend;
17266 
17267   // Use dedicated unpack instructions for masks that match their pattern.
17268   if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
17269     return V;
17270 
17271   // Use dedicated pack instructions for masks that match their pattern.
17272   if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG,
17273                                        Subtarget))
17274     return V;
17275 
17276   // Try to lower using a truncation.
17277   if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
17278                                        Subtarget, DAG))
17279     return V;
17280 
17281   // Try to use shift instructions.
17282   if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
17283                                           Zeroable, Subtarget, DAG))
17284     return Shift;
17285 
17286   // Try to use byte rotation instructions.
17287   if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
17288                                                 Subtarget, DAG))
17289     return Rotate;
17290 
17291   // Try to create an in-lane repeating shuffle mask and then shuffle the
17292   // results into the target lanes.
17293   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17294           DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
17295     return V;
17296 
17297   if (V2.isUndef()) {
17298     // Try to use bit rotation instructions.
17299     if (SDValue Rotate =
17300             lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask, Subtarget, DAG))
17301       return Rotate;
17302 
17303     // Try to produce a fixed cross-128-bit lane permute followed by unpack
17304     // because that should be faster than the variable permute alternatives.
17305     if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, Mask, V1, V2, DAG))
17306       return V;
17307 
17308     // There are no generalized cross-lane shuffle operations available on i16
17309     // element types.
17310     if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
17311       if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17312               DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
17313         return V;
17314 
17315       return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask,
17316                                                  DAG, Subtarget);
17317     }
17318 
17319     SmallVector<int, 8> RepeatedMask;
17320     if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
17321       // As this is a single-input shuffle, the repeated mask should be
17322       // a strictly valid v8i16 mask that we can pass through to the v8i16
17323       // lowering to handle even the v16 case.
17324       return lowerV8I16GeneralSingleInputShuffle(
17325           DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
17326     }
17327   }
17328 
17329   if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2,
17330                                               Zeroable, Subtarget, DAG))
17331     return PSHUFB;
17332 
17333   // AVX512BW can lower to VPERMW (non-VLX will pad to v32i16).
17334   if (Subtarget.hasBWI())
17335     return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, Subtarget, DAG);
17336 
17337   // Try to simplify this by merging 128-bit lanes to enable a lane-based
17338   // shuffle.
17339   if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17340           DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
17341     return Result;
17342 
17343   // Try to permute the lanes and then use a per-lane permute.
17344   if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17345           DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
17346     return V;
17347 
17348   // Otherwise fall back on generic lowering.
17349   return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask,
17350                                     Subtarget, DAG);
17351 }
17352 
17353 /// Handle lowering of 32-lane 8-bit integer shuffles.
17354 ///
17355 /// This routine is only called when we have AVX2 and thus a reasonable
17356 /// instruction set for v32i8 shuffling.
17357 static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17358                                  const APInt &Zeroable, SDValue V1, SDValue V2,
17359                                  const X86Subtarget &Subtarget,
17360                                  SelectionDAG &DAG) {
17361   assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
17362   assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
17363   assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
17364   assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
17365 
17366   // Whenever we can lower this as a zext, that instruction is strictly faster
17367   // than any alternative. It also allows us to fold memory operands into the
17368   // shuffle in many cases.
17369   if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask,
17370                                                    Zeroable, Subtarget, DAG))
17371     return ZExt;
17372 
17373   // Check for being able to broadcast a single element.
17374   if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
17375                                                   Subtarget, DAG))
17376     return Broadcast;
17377 
17378   if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
17379                                           Zeroable, Subtarget, DAG))
17380     return Blend;
17381 
17382   // Use dedicated unpack instructions for masks that match their pattern.
17383   if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
17384     return V;
17385 
17386   // Use dedicated pack instructions for masks that match their pattern.
17387   if (SDValue V = lowerShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG,
17388                                        Subtarget))
17389     return V;
17390 
17391   // Try to lower using a truncation.
17392   if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
17393                                        Subtarget, DAG))
17394     return V;
17395 
17396   // Try to use shift instructions.
17397   if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
17398                                           Zeroable, Subtarget, DAG))
17399     return Shift;
17400 
17401   // Try to use byte rotation instructions.
17402   if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
17403                                                 Subtarget, DAG))
17404     return Rotate;
17405 
17406   // Try to use bit rotation instructions.
17407   if (V2.isUndef())
17408     if (SDValue Rotate =
17409             lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask, Subtarget, DAG))
17410       return Rotate;
17411 
17412   // Try to create an in-lane repeating shuffle mask and then shuffle the
17413   // results into the target lanes.
17414   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17415           DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17416     return V;
17417 
17418   // There are no generalized cross-lane shuffle operations available on i8
17419   // element types.
17420   if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
17421     // Try to produce a fixed cross-128-bit lane permute followed by unpack
17422     // because that should be faster than the variable permute alternatives.
17423     if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, Mask, V1, V2, DAG))
17424       return V;
17425 
17426     if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17427             DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
17428       return V;
17429 
17430     return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
17431                                                DAG, Subtarget);
17432   }
17433 
17434   if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
17435                                               Zeroable, Subtarget, DAG))
17436     return PSHUFB;
17437 
17438   // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
17439   if (Subtarget.hasVBMI())
17440     return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, Subtarget, DAG);
17441 
17442   // Try to simplify this by merging 128-bit lanes to enable a lane-based
17443   // shuffle.
17444   if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17445           DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17446     return Result;
17447 
17448   // Try to permute the lanes and then use a per-lane permute.
17449   if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17450           DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
17451     return V;
17452 
17453   // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
17454   // by zeroable elements in the remaining 24 elements. Turn this into two
17455   // vmovqb instructions shuffled together.
17456   if (Subtarget.hasVLX())
17457     if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
17458                                                   Mask, Zeroable, DAG))
17459       return V;
17460 
17461   // Otherwise fall back on generic lowering.
17462   return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask,
17463                                     Subtarget, DAG);
17464 }
17465 
17466 /// High-level routine to lower various 256-bit x86 vector shuffles.
17467 ///
17468 /// This routine either breaks down the specific type of a 256-bit x86 vector
17469 /// shuffle or splits it into two 128-bit shuffles and fuses the results back
17470 /// together based on the available instructions.
17471 static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
17472                                   SDValue V1, SDValue V2, const APInt &Zeroable,
17473                                   const X86Subtarget &Subtarget,
17474                                   SelectionDAG &DAG) {
17475   // If we have a single input to the zero element, insert that into V1 if we
17476   // can do so cheaply.
17477   int NumElts = VT.getVectorNumElements();
17478   int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17479 
17480   if (NumV2Elements == 1 && Mask[0] >= NumElts)
17481     if (SDValue Insertion = lowerShuffleAsElementInsertion(
17482             DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
17483       return Insertion;
17484 
17485   // Handle special cases where the lower or upper half is UNDEF.
17486   if (SDValue V =
17487           lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
17488     return V;
17489 
17490   // There is a really nice hard cut-over between AVX1 and AVX2 that means we
17491   // can check for those subtargets here and avoid much of the subtarget
17492   // querying in the per-vector-type lowering routines. With AVX1 we have
17493   // essentially *zero* ability to manipulate a 256-bit vector with integer
17494   // types. Since we'll use floating point types there eventually, just
17495   // immediately cast everything to a float and operate entirely in that domain.
17496   if (VT.isInteger() && !Subtarget.hasAVX2()) {
17497     int ElementBits = VT.getScalarSizeInBits();
17498     if (ElementBits < 32) {
17499       // No floating point type available, if we can't use the bit operations
17500       // for masking/blending then decompose into 128-bit vectors.
17501       if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
17502                                             Subtarget, DAG))
17503         return V;
17504       if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
17505         return V;
17506       return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
17507     }
17508 
17509     MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
17510                                 VT.getVectorNumElements());
17511     V1 = DAG.getBitcast(FpVT, V1);
17512     V2 = DAG.getBitcast(FpVT, V2);
17513     return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
17514   }
17515 
17516   switch (VT.SimpleTy) {
17517   case MVT::v4f64:
17518     return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17519   case MVT::v4i64:
17520     return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17521   case MVT::v8f32:
17522     return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17523   case MVT::v8i32:
17524     return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17525   case MVT::v16i16:
17526     return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17527   case MVT::v32i8:
17528     return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17529 
17530   default:
17531     llvm_unreachable("Not a valid 256-bit x86 vector type!");
17532   }
17533 }
17534 
17535 /// Try to lower a vector shuffle as a shuffle of 128-bit subvectors.
17536 static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
17537                                   const APInt &Zeroable, SDValue V1, SDValue V2,
17538                                   const X86Subtarget &Subtarget,
17539                                   SelectionDAG &DAG) {
17540   assert(VT.getScalarSizeInBits() == 64 &&
17541          "Unexpected element type size for 128bit shuffle.");
17542 
17543   // Handling a 256-bit vector requires VLX, and lowerV2X128VectorShuffle() is
17544   // most probably the better solution for that case.
17545   assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
17546 
17547   // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
17548   SmallVector<int, 4> Widened128Mask;
17549   if (!canWidenShuffleElements(Mask, Widened128Mask))
17550     return SDValue();
17551   assert(Widened128Mask.size() == 4 && "Shuffle widening mismatch");
17552 
17553   // Try to use an insert into a zero vector.
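        // Zeroable has one bit per 64-bit element: 0xf0 means the upper 256
        // bits are zeroable, and 0x0c means elements 2-3 are as well, so only
        // the low 128 bits of V1 need to be kept.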
17554   if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
17555       (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
17556     unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
17557     MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
17558     SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
17559                               DAG.getIntPtrConstant(0, DL));
17560     return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
17561                        getZeroVector(VT, Subtarget, DAG, DL), LoV,
17562                        DAG.getIntPtrConstant(0, DL));
17563   }
17564 
17565   // Check for patterns which can be matched with a single insert of a 256-bit
17566   // subvector.
17567   bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}, V1, V2);
17568   if (OnlyUsesV1 ||
17569       isShuffleEquivalent(Mask, {0, 1, 2, 3, 8, 9, 10, 11}, V1, V2)) {
17570     MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
17571     SDValue SubVec =
17572         DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
17573                     DAG.getIntPtrConstant(0, DL));
17574     return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
17575                        DAG.getIntPtrConstant(4, DL));
17576   }
17577 
17578   // See if this is an insertion of the lower 128-bits of V2 into V1.
17579   bool IsInsert = true;
17580   int V2Index = -1;
17581   for (int i = 0; i < 4; ++i) {
17582     assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
17583     if (Widened128Mask[i] < 0)
17584       continue;
17585 
17586     // Make sure all V1 subvectors are in place.
17587     if (Widened128Mask[i] < 4) {
17588       if (Widened128Mask[i] != i) {
17589         IsInsert = false;
17590         break;
17591       }
17592     } else {
17593       // Make sure we only have a single V2 index and it's the lowest 128 bits.
17594       if (V2Index >= 0 || Widened128Mask[i] != 4) {
17595         IsInsert = false;
17596         break;
17597       }
17598       V2Index = i;
17599     }
17600   }
17601   if (IsInsert && V2Index >= 0) {
17602     MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
17603     SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
17604                                  DAG.getIntPtrConstant(0, DL));
17605     return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
17606   }
17607 
17608   // See if we can widen to a 256-bit lane shuffle; we're going to lose the
17609   // 128-bit lane UNDEF info by lowering to X86ISD::SHUF128 anyway, so by
17610   // widening where possible we at least ensure the lanes stay sequential to
17611   // help later combines.
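        // e.g. Widened128Mask <0, u, 2, u> widens to <0, 1> and is narrowed
        // back to the sequential <0, 1, 2, 3>.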
17612   SmallVector<int, 2> Widened256Mask;
17613   if (canWidenShuffleElements(Widened128Mask, Widened256Mask)) {
17614     Widened128Mask.clear();
17615     narrowShuffleMaskElts(2, Widened256Mask, Widened128Mask);
17616   }
17617 
17618   // Try to lower to vshuf64x2/vshuf32x4.
17619   SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
17620   unsigned PermMask = 0;
17621   // Ensure elements came from the same Op.
17622   for (int i = 0; i < 4; ++i) {
17623     assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
17624     if (Widened128Mask[i] < 0)
17625       continue;
17626 
17627     SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;
17628     unsigned OpIndex = i / 2;
17629     if (Ops[OpIndex].isUndef())
17630       Ops[OpIndex] = Op;
17631     else if (Ops[OpIndex] != Op)
17632       return SDValue();
17633 
17634     // Convert the 128-bit shuffle mask selection values into 128-bit selection
17635     // bits defined by a vshuf64x2 instruction's immediate control byte.
17636     PermMask |= (Widened128Mask[i] % 4) << (i * 2);
17637   }
17638 
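  // For example (illustrative): Widened128Mask <2,3,4,5> (the upper 256 bits
  // of V1 followed by the lower 256 bits of V2) gives Ops = {V1, V2} and
  // PermMask = 0x4E.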
17639   return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
17640                      DAG.getTargetConstant(PermMask, DL, MVT::i8));
17641 }
17642 
17643 /// Handle lowering of 8-lane 64-bit floating point shuffles.
17644 static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17645                                  const APInt &Zeroable, SDValue V1, SDValue V2,
17646                                  const X86Subtarget &Subtarget,
17647                                  SelectionDAG &DAG) {
17648   assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
17649   assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
17650   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17651 
17652   if (V2.isUndef()) {
17653     // Use low duplicate instructions for masks that match their pattern.
17654     if (isShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1, V2))
17655       return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
17656 
17657     if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
17658       // Non-half-crossing single input shuffles can be lowered with an
17659       // interleaved permutation.
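      // For example (illustrative): Mask <1,0,3,2,5,4,7,6> sets bits 0, 2, 4
      // and 6, giving VPERMILPMask = 0x55, i.e. a VPERMILPD that swaps each
      // adjacent pair of f64 elements.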
17660       unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
17661                               ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
17662                               ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
17663                               ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
17664       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
17665                          DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
17666     }
17667 
17668     SmallVector<int, 4> RepeatedMask;
17669     if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
17670       return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
17671                          getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17672   }
17673 
17674   if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
17675                                            V2, Subtarget, DAG))
17676     return Shuf128;
17677 
17678   if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
17679     return Unpck;
17680 
17681   // Check if the blend happens to exactly fit the pattern of SHUFPD.
17682   if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask,
17683                                           Zeroable, Subtarget, DAG))
17684     return Op;
17685 
17686   if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, V2,
17687                                        DAG, Subtarget))
17688     return V;
17689 
17690   if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
17691                                           Zeroable, Subtarget, DAG))
17692     return Blend;
17693 
17694   return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, Subtarget, DAG);
17695 }
17696 
17697 /// Handle lowering of 16-lane 32-bit floating point shuffles.
17698 static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17699                                   const APInt &Zeroable, SDValue V1, SDValue V2,
17700                                   const X86Subtarget &Subtarget,
17701                                   SelectionDAG &DAG) {
17702   assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
17703   assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
17704   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
17705 
17706   // If the shuffle mask is repeated in each 128-bit lane, we have many more
17707   // options to efficiently lower the shuffle.
17708   SmallVector<int, 4> RepeatedMask;
17709   if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
17710     assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
17711 
17712     // Use even/odd duplicate instructions for masks that match their pattern.
17713     if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
17714       return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
17715     if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
17716       return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
17717 
17718     if (V2.isUndef())
17719       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
17720                          getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17721 
17722     // Use dedicated unpack instructions for masks that match their pattern.
17723     if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
17724       return V;
17725 
17726     if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
17727                                             Zeroable, Subtarget, DAG))
17728       return Blend;
17729 
17730     // Otherwise, fall back to a SHUFPS sequence.
17731     return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
17732   }
17733 
17734   // Try to create an in-lane repeating shuffle mask and then shuffle the
17735   // results into the target lanes.
17736   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17737           DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG))
17738     return V;
17739 
17740   // If we have a single-input shuffle that doesn't cross 128-bit lanes but
17741   // uses different patterns in each lane, use a variable-mask VPERMILPS.
17742   if (V2.isUndef() &&
17743       !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
17744     SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
17745     return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
17746   }
17747 
17748   // If we have AVX512F support, we can use VEXPAND.
17749   if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
17750                                              V1, V2, DAG, Subtarget))
17751     return V;
17752 
17753   return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG);
17754 }
17755 
17756 /// Handle lowering of 8-lane 64-bit integer shuffles.
17757 static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17758                                  const APInt &Zeroable, SDValue V1, SDValue V2,
17759                                  const X86Subtarget &Subtarget,
17760                                  SelectionDAG &DAG) {
17761   assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
17762   assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
17763   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17764 
17765   if (V2.isUndef()) {
17766     // When the shuffle is mirrored across the 128-bit lanes, we can use
17767     // lower-latency instructions that will operate on all four 128-bit
17768     // lanes.
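    // For example (illustrative): Mask <1,0,3,2,5,4,7,6> repeats as <1,0> per
    // 128-bit lane, which narrows to the v16i32 PSHUFD mask <2,3,0,1>
    // (immediate 0x4E).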
17769     SmallVector<int, 2> Repeated128Mask;
17770     if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
17771       SmallVector<int, 4> PSHUFDMask;
17772       narrowShuffleMaskElts(2, Repeated128Mask, PSHUFDMask);
17773       return DAG.getBitcast(
17774           MVT::v8i64,
17775           DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
17776                       DAG.getBitcast(MVT::v16i32, V1),
17777                       getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
17778     }
17779 
17780     SmallVector<int, 4> Repeated256Mask;
17781     if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
17782       return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
17783                          getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
17784   }
17785 
17786   if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,
17787                                            V2, Subtarget, DAG))
17788     return Shuf128;
17789 
17790   // Try to use shift instructions.
17791   if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
17792                                           Zeroable, Subtarget, DAG))
17793     return Shift;
17794 
17795   // Try to use VALIGN.
17796   if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask,
17797                                             Subtarget, DAG))
17798     return Rotate;
17799 
17800   // Try to use PALIGNR.
17801   if (Subtarget.hasBWI())
17802     if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
17803                                                   Subtarget, DAG))
17804       return Rotate;
17805 
17806   if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
17807     return Unpck;
17808 
17809   // If we have AVX512F support, we can use VEXPAND.
17810   if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1, V2,
17811                                        DAG, Subtarget))
17812     return V;
17813 
17814   if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
17815                                           Zeroable, Subtarget, DAG))
17816     return Blend;
17817 
17818   return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, Subtarget, DAG);
17819 }
17820 
17821 /// Handle lowering of 16-lane 32-bit integer shuffles.
17822 static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17823                                   const APInt &Zeroable, SDValue V1, SDValue V2,
17824                                   const X86Subtarget &Subtarget,
17825                                   SelectionDAG &DAG) {
17826   assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
17827   assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
17828   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
17829 
17830   // Whenever we can lower this as a zext, that instruction is strictly faster
17831   // than any alternative. It also allows us to fold memory operands into the
17832   // shuffle in many cases.
17833   if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17834           DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
17835     return ZExt;
17836 
17837   // If the shuffle mask is repeated in each 128-bit lane we can use more
17838   // efficient instructions that mirror the shuffles across the four 128-bit
17839   // lanes.
17840   SmallVector<int, 4> RepeatedMask;
17841   bool Is128BitLaneRepeatedShuffle =
17842       is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
17843   if (Is128BitLaneRepeatedShuffle) {
17844     assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
17845     if (V2.isUndef())
17846       return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
17847                          getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17848 
17849     // Use dedicated unpack instructions for masks that match their pattern.
17850     if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
17851       return V;
17852   }
17853 
17854   // Try to use shift instructions.
17855   if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
17856                                           Zeroable, Subtarget, DAG))
17857     return Shift;
17858 
17859   // Try to use VALIGN.
17860   if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask,
17861                                             Subtarget, DAG))
17862     return Rotate;
17863 
17864   // Try to use byte rotation instructions.
17865   if (Subtarget.hasBWI())
17866     if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask,
17867                                                   Subtarget, DAG))
17868       return Rotate;
17869 
17870   // Assume that a single SHUFPS is faster than using a permv shuffle.
17871   // If some CPU is harmed by the domain switch, we can fix it in a later pass.
17872   if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
17873     SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
17874     SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
17875     SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
17876                                             CastV1, CastV2, DAG);
17877     return DAG.getBitcast(MVT::v16i32, ShufPS);
17878   }
17879 
17880   // Try to create an in-lane repeating shuffle mask and then shuffle the
17881   // results into the target lanes.
17882   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17883           DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
17884     return V;
17885 
17886   // If we have AVX512F support, we can use VEXPAND.
17887   if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, V1, V2,
17888                                        DAG, Subtarget))
17889     return V;
17890 
17891   if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
17892                                           Zeroable, Subtarget, DAG))
17893     return Blend;
17894 
17895   return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, Subtarget, DAG);
17896 }
17897 
17898 /// Handle lowering of 32-lane 16-bit integer shuffles.
17899 static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17900                                   const APInt &Zeroable, SDValue V1, SDValue V2,
17901                                   const X86Subtarget &Subtarget,
17902                                   SelectionDAG &DAG) {
17903   assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
17904   assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
17905   assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
17906   assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
17907 
17908   // Whenever we can lower this as a zext, that instruction is strictly faster
17909   // than any alternative. It also allows us to fold memory operands into the
17910   // shuffle in many cases.
17911   if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17912           DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
17913     return ZExt;
17914 
17915   // Use dedicated unpack instructions for masks that match their pattern.
17916   if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
17917     return V;
17918 
17919   // Use dedicated pack instructions for masks that match their pattern.
17920   if (SDValue V =
17921           lowerShuffleWithPACK(DL, MVT::v32i16, Mask, V1, V2, DAG, Subtarget))
17922     return V;
17923 
17924   // Try to use shift instructions.
17925   if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
17926                                           Zeroable, Subtarget, DAG))
17927     return Shift;
17928 
17929   // Try to use byte rotation instructions.
17930   if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask,
17931                                                 Subtarget, DAG))
17932     return Rotate;
17933 
17934   if (V2.isUndef()) {
17935     // Try to use bit rotation instructions.
17936     if (SDValue Rotate =
17937             lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG))
17938       return Rotate;
17939 
17940     SmallVector<int, 8> RepeatedMask;
17941     if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
17942       // As this is a single-input shuffle, the repeated mask should be
17943       // a strictly valid v8i16 mask that we can pass through to the v8i16
17944       // lowering to handle even the v32 case.
17945       return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1,
17946                                                  RepeatedMask, Subtarget, DAG);
17947     }
17948   }
17949 
17950   if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
17951                                           Zeroable, Subtarget, DAG))
17952     return Blend;
17953 
17954   if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
17955                                               Zeroable, Subtarget, DAG))
17956     return PSHUFB;
17957 
17958   return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG);
17959 }
17960 
17961 /// Handle lowering of 64-lane 8-bit integer shuffles.
17962 static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17963                                  const APInt &Zeroable, SDValue V1, SDValue V2,
17964                                  const X86Subtarget &Subtarget,
17965                                  SelectionDAG &DAG) {
17966   assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
17967   assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
17968   assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
17969   assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
17970 
17971   // Whenever we can lower this as a zext, that instruction is strictly faster
17972   // than any alternative. It also allows us to fold memory operands into the
17973   // shuffle in many cases.
17974   if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17975           DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
17976     return ZExt;
17977 
17978   // Use dedicated unpack instructions for masks that match their pattern.
17979   if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
17980     return V;
17981 
17982   // Use dedicated pack instructions for masks that match their pattern.
17983   if (SDValue V = lowerShuffleWithPACK(DL, MVT::v64i8, Mask, V1, V2, DAG,
17984                                        Subtarget))
17985     return V;
17986 
17987   // Try to use shift instructions.
17988   if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
17989                                           Zeroable, Subtarget, DAG))
17990     return Shift;
17991 
17992   // Try to use byte rotation instructions.
17993   if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask,
17994                                                 Subtarget, DAG))
17995     return Rotate;
17996 
17997   // Try to use bit rotation instructions.
17998   if (V2.isUndef())
17999     if (SDValue Rotate =
18000             lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG))
18001       return Rotate;
18002 
18003   // Lower as AND if possible.
18004   if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v64i8, V1, V2, Mask,
18005                                              Zeroable, Subtarget, DAG))
18006     return Masked;
18007 
18008   if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
18009                                               Zeroable, Subtarget, DAG))
18010     return PSHUFB;
18011 
18012   // VBMI can use VPERMV/VPERMV3 byte shuffles.
18013   if (Subtarget.hasVBMI())
18014     return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
18015 
18016   // Try to create an in-lane repeating shuffle mask and then shuffle the
18017   // results into the target lanes.
18018   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18019           DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
18020     return V;
18021 
18022   if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
18023                                           Zeroable, Subtarget, DAG))
18024     return Blend;
18025 
18026   // Try to simplify this by merging 128-bit lanes to enable a lane-based
18027   // shuffle.
18028   if (!V2.isUndef())
18029     if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
18030             DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
18031       return Result;
18032 
18033   // FIXME: Implement direct support for this type!
18034   return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
18035 }
18036 
18037 /// High-level routine to lower various 512-bit x86 vector shuffles.
18038 ///
18039 /// This routine either breaks down the specific type of a 512-bit x86 vector
18040 /// shuffle or splits it into two 256-bit shuffles and fuses the results back
18041 /// together based on the available instructions.
18042 static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
18043                                   MVT VT, SDValue V1, SDValue V2,
18044                                   const APInt &Zeroable,
18045                                   const X86Subtarget &Subtarget,
18046                                   SelectionDAG &DAG) {
18047   assert(Subtarget.hasAVX512() &&
18048          "Cannot lower 512-bit vectors w/ basic ISA!");
18049 
18050   // If we have a single input to the zero element, insert that into V1 if we
18051   // can do so cheaply.
18052   int NumElts = Mask.size();
18053   int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
18054 
18055   if (NumV2Elements == 1 && Mask[0] >= NumElts)
18056     if (SDValue Insertion = lowerShuffleAsElementInsertion(
18057             DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
18058       return Insertion;
18059 
18060   // Handle special cases where the lower or upper half is UNDEF.
18061   if (SDValue V =
18062           lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
18063     return V;
18064 
18065   // Check for being able to broadcast a single element.
18066   if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
18067                                                   Subtarget, DAG))
18068     return Broadcast;
18069 
18070   if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
18071     // Try using bit ops for masking and blending before falling back to
18072     // splitting.
18073     if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
18074                                           Subtarget, DAG))
18075       return V;
18076     if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
18077       return V;
18078 
18079     return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
18080   }
18081 
18082   // Dispatch to each element type for lowering. If we don't have support for
18083   // specific element type shuffles at 512 bits, immediately split them and
18084   // lower them. Each lowering routine of a given type is allowed to assume that
18085   // the requisite ISA extensions for that element type are available.
18086   switch (VT.SimpleTy) {
18087   case MVT::v8f64:
18088     return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18089   case MVT::v16f32:
18090     return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18091   case MVT::v8i64:
18092     return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18093   case MVT::v16i32:
18094     return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18095   case MVT::v32i16:
18096     return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18097   case MVT::v64i8:
18098     return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18099 
18100   default:
18101     llvm_unreachable("Not a valid 512-bit x86 vector type!");
18102   }
18103 }
18104 
18105 static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef<int> Mask,
18106                                          MVT VT, SDValue V1, SDValue V2,
18107                                          const X86Subtarget &Subtarget,
18108                                          SelectionDAG &DAG) {
18109   // Shuffle should be unary.
18110   if (!V2.isUndef())
18111     return SDValue();
18112 
18113   int ShiftAmt = -1;
18114   int NumElts = Mask.size();
18115   for (int i = 0; i != NumElts; ++i) {
18116     int M = Mask[i];
18117     assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) &&
18118            "Unexpected mask index.");
18119     if (M < 0)
18120       continue;
18121 
18122     // The first non-undef element determines our shift amount.
18123     if (ShiftAmt < 0) {
18124       ShiftAmt = M - i;
18125       // Need to be shifting right.
18126       if (ShiftAmt <= 0)
18127         return SDValue();
18128     }
18129     // All non-undef elements must shift by the same amount.
18130     if (ShiftAmt != M - i)
18131       return SDValue();
18132   }
18133   assert(ShiftAmt >= 0 && "All undef?");
18134 
18135   // Great, we found a right shift.
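  // For example (illustrative): a v8i1 mask <3,4,5,6,7,u,u,u> gives
  // ShiftAmt == 3 and lowers to a KSHIFTR by 3 (widened to v16i1 first when
  // DQI is unavailable).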
18136   MVT WideVT = VT;
18137   if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
18138     WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
18139   SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
18140                             DAG.getUNDEF(WideVT), V1,
18141                             DAG.getIntPtrConstant(0, DL));
18142   Res = DAG.getNode(X86ISD::KSHIFTR, DL, WideVT, Res,
18143                     DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
18144   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
18145                      DAG.getIntPtrConstant(0, DL));
18146 }
18147 
18148 // Determine if this shuffle can be implemented with a KSHIFT instruction.
18149 // Returns the shift amount if possible or -1 if not. This is a simplified
18150 // version of matchShuffleAsShift.
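// For example (illustrative): with 8 elements and MaskOffset == 0, the mask
// <z,z,0,1,2,3,4,5> (low two elements zeroable) matches KSHIFTL with Shift 2,
// and <2,3,4,5,6,7,z,z> matches KSHIFTR with Shift 2.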
18151 static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
18152                                     int MaskOffset, const APInt &Zeroable) {
18153   int Size = Mask.size();
18154 
18155   auto CheckZeros = [&](int Shift, bool Left) {
18156     for (int j = 0; j < Shift; ++j)
18157       if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
18158         return false;
18159 
18160     return true;
18161   };
18162 
18163   auto MatchShift = [&](int Shift, bool Left) {
18164     unsigned Pos = Left ? Shift : 0;
18165     unsigned Low = Left ? 0 : Shift;
18166     unsigned Len = Size - Shift;
18167     return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset);
18168   };
18169 
18170   for (int Shift = 1; Shift != Size; ++Shift)
18171     for (bool Left : {true, false})
18172       if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {
18173         Opcode = Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR;
18174         return Shift;
18175       }
18176 
18177   return -1;
18178 }
18179 
18180 
18181 // Lower vXi1 vector shuffles.
18182 // There is no dedicated instruction on AVX-512 that shuffles the mask
18183 // registers. The only way to shuffle the bits is to sign-extend the mask
18184 // vector to a SIMD vector, shuffle it, and then truncate it back.
18185 static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
18186                                 MVT VT, SDValue V1, SDValue V2,
18187                                 const APInt &Zeroable,
18188                                 const X86Subtarget &Subtarget,
18189                                 SelectionDAG &DAG) {
18190   assert(Subtarget.hasAVX512() &&
18191          "Cannot lower 512-bit vectors w/o basic ISA!");
18192 
18193   int NumElts = Mask.size();
18194 
18195   // Try to recognize shuffles that are just padding a subvector with zeros.
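  // For example (illustrative): a v16i1 mask <0..7,16..23> where V2 is all
  // zeros has an identity prefix of 8 elements and a zeroable tail, so it is
  // lowered as the low v8i1 of V1 inserted into a zero v16i1.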
18196   int SubvecElts = 0;
18197   int Src = -1;
18198   for (int i = 0; i != NumElts; ++i) {
18199     if (Mask[i] >= 0) {
18200       // Grab the source from the first valid mask. All subsequent elements need
18201       // to use this same source.
18202       if (Src < 0)
18203         Src = Mask[i] / NumElts;
18204       if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
18205         break;
18206     }
18207 
18208     ++SubvecElts;
18209   }
18210   assert(SubvecElts != NumElts && "Identity shuffle?");
18211 
18212   // Clip to a power of 2.
18213   SubvecElts = PowerOf2Floor(SubvecElts);
18214 
18215   // Make sure the number of zeroable bits in the top at least covers the bits
18216   // not covered by the subvector.
18217   if ((int)Zeroable.countLeadingOnes() >= (NumElts - SubvecElts)) {
18218     assert(Src >= 0 && "Expected a source!");
18219     MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
18220     SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT,
18221                                   Src == 0 ? V1 : V2,
18222                                   DAG.getIntPtrConstant(0, DL));
18223     return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
18224                        DAG.getConstant(0, DL, VT),
18225                        Extract, DAG.getIntPtrConstant(0, DL));
18226   }
18227 
18228   // Try a simple shift right with undef elements. Later we'll try with zeros.
18229   if (SDValue Shift = lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget,
18230                                                 DAG))
18231     return Shift;
18232 
18233   // Try to match KSHIFTs.
18234   unsigned Offset = 0;
18235   for (SDValue V : { V1, V2 }) {
18236     unsigned Opcode;
18237     int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
18238     if (ShiftAmt >= 0) {
18239       MVT WideVT = VT;
18240       if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
18241         WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
18242       SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
18243                                 DAG.getUNDEF(WideVT), V,
18244                                 DAG.getIntPtrConstant(0, DL));
18245       // Widened right shifts need two shifts to ensure we shift in zeroes.
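      // For example (illustrative): a v8i1 KSHIFTR by 2 widened to v16i1
      // becomes a KSHIFTL by 8 followed by a KSHIFTR by 10.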
18246       if (Opcode == X86ISD::KSHIFTR && WideVT != VT) {
18247         int WideElts = WideVT.getVectorNumElements();
18248         // Shift left to put the original vector in the MSBs of the new size.
18249         Res = DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res,
18250                           DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8));
18251         // Increase the shift amount to account for the left shift.
18252         ShiftAmt += WideElts - NumElts;
18253       }
18254 
18255       Res = DAG.getNode(Opcode, DL, WideVT, Res,
18256                         DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
18257       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
18258                          DAG.getIntPtrConstant(0, DL));
18259     }
18260     Offset += NumElts; // Increment for next iteration.
18261   }
18262 
18263 
18264 
18265   MVT ExtVT;
18266   switch (VT.SimpleTy) {
18267   default:
18268     llvm_unreachable("Expected a vector of i1 elements");
18269   case MVT::v2i1:
18270     ExtVT = MVT::v2i64;
18271     break;
18272   case MVT::v4i1:
18273     ExtVT = MVT::v4i32;
18274     break;
18275   case MVT::v8i1:
18276     // Take a 512-bit type to get more shuffle options on KNL. If we have
18277     // VLX, use a 256-bit shuffle.
18278     ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
18279     break;
18280   case MVT::v16i1:
18281     // Take 512-bit type, unless we are avoiding 512-bit types and have the
18282     // 256-bit operation available.
18283     ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
18284     break;
18285   case MVT::v32i1:
18286     // Take 512-bit type, unless we are avoiding 512-bit types and have the
18287     // 256-bit operation available.
18288     assert(Subtarget.hasBWI() && "Expected AVX512BW support");
18289     ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
18290     break;
18291   case MVT::v64i1:
18292     // Fall back to scalarization. FIXME: We can do better if the shuffle
18293     // can be partitioned cleanly.
18294     if (!Subtarget.useBWIRegs())
18295       return SDValue();
18296     ExtVT = MVT::v64i8;
18297     break;
18298   }
18299 
18300   V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
18301   V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
18302 
18303   SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
18304   // i1 was sign-extended, so compare against zero to convert back to a mask.
18305   int NumElems = VT.getVectorNumElements();
18306   if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
18307       (Subtarget.hasDQI() && (NumElems < 32)))
18308     return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
18309                        Shuffle, ISD::SETGT);
18310 
18311   return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
18312 }
18313 
18314 /// Helper function that returns true if the shuffle mask should be
18315 /// commuted to improve canonicalization.
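/// For example (illustrative): a v4i32 mask <0,5,6,7> uses one element from V1
/// and three from V2, so the operands are swapped and the mask becomes
/// <4,1,2,3>.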
18316 static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
18317   int NumElements = Mask.size();
18318 
18319   int NumV1Elements = 0, NumV2Elements = 0;
18320   for (int M : Mask)
18321     if (M < 0)
18322       continue;
18323     else if (M < NumElements)
18324       ++NumV1Elements;
18325     else
18326       ++NumV2Elements;
18327 
18328   // Commute the shuffle as needed such that more elements come from V1 than
18329   // V2. This allows us to match the shuffle pattern strictly on how many
18330   // elements come from V1 without handling the symmetric cases.
18331   if (NumV2Elements > NumV1Elements)
18332     return true;
18333 
18334   assert(NumV1Elements > 0 && "No V1 indices");
18335 
18336   if (NumV2Elements == 0)
18337     return false;
18338 
18339   // When the number of V1 and V2 elements are the same, try to minimize the
18340   // number of uses of V2 in the low half of the vector. When that is tied,
18341   // ensure that the sum of indices for V1 is equal to or lower than the sum of
18342   // indices for V2. When those are equal, try to ensure that the number of odd
18343   // indices for V1 is lower than the number of odd indices for V2.
18344   if (NumV1Elements == NumV2Elements) {
18345     int LowV1Elements = 0, LowV2Elements = 0;
18346     for (int M : Mask.slice(0, NumElements / 2))
18347       if (M >= NumElements)
18348         ++LowV2Elements;
18349       else if (M >= 0)
18350         ++LowV1Elements;
18351     if (LowV2Elements > LowV1Elements)
18352       return true;
18353     if (LowV2Elements == LowV1Elements) {
18354       int SumV1Indices = 0, SumV2Indices = 0;
18355       for (int i = 0, Size = Mask.size(); i < Size; ++i)
18356         if (Mask[i] >= NumElements)
18357           SumV2Indices += i;
18358         else if (Mask[i] >= 0)
18359           SumV1Indices += i;
18360       if (SumV2Indices < SumV1Indices)
18361         return true;
18362       if (SumV2Indices == SumV1Indices) {
18363         int NumV1OddIndices = 0, NumV2OddIndices = 0;
18364         for (int i = 0, Size = Mask.size(); i < Size; ++i)
18365           if (Mask[i] >= NumElements)
18366             NumV2OddIndices += i % 2;
18367           else if (Mask[i] >= 0)
18368             NumV1OddIndices += i % 2;
18369         if (NumV2OddIndices < NumV1OddIndices)
18370           return true;
18371       }
18372     }
18373   }
18374 
18375   return false;
18376 }
18377 
18378 /// Top-level lowering for x86 vector shuffles.
18379 ///
18380 /// This handles decomposition, canonicalization, and lowering of all x86
18381 /// vector shuffles. Most of the specific lowering strategies are encapsulated
18382 /// above in helper routines. The canonicalization attempts to widen shuffles
18383 /// to involve fewer lanes of wider elements, consolidate symmetric patterns
18384 /// s.t. only one of the two inputs needs to be tested, etc.
18385 static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
18386                                    SelectionDAG &DAG) {
18387   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
18388   ArrayRef<int> OrigMask = SVOp->getMask();
18389   SDValue V1 = Op.getOperand(0);
18390   SDValue V2 = Op.getOperand(1);
18391   MVT VT = Op.getSimpleValueType();
18392   int NumElements = VT.getVectorNumElements();
18393   SDLoc DL(Op);
18394   bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
18395 
18396   assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
18397          "Can't lower MMX shuffles");
18398 
18399   bool V1IsUndef = V1.isUndef();
18400   bool V2IsUndef = V2.isUndef();
18401   if (V1IsUndef && V2IsUndef)
18402     return DAG.getUNDEF(VT);
18403 
18404   // When we create a shuffle node we put the UNDEF node in the second operand,
18405   // but in some cases the first operand may be transformed to UNDEF.
18406   // In this case we should just commute the node.
18407   if (V1IsUndef)
18408     return DAG.getCommutedVectorShuffle(*SVOp);
18409 
18410   // Check for non-undef masks pointing at an undef vector and make the masks
18411   // undef as well. This makes it easier to match the shuffle based solely on
18412   // the mask.
18413   if (V2IsUndef &&
18414       any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
18415     SmallVector<int, 8> NewMask(OrigMask.begin(), OrigMask.end());
18416     for (int &M : NewMask)
18417       if (M >= NumElements)
18418         M = -1;
18419     return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
18420   }
18421 
18422   // Check for illegal shuffle mask element index values.
18423   int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
18424   (void)MaskUpperLimit;
18425   assert(llvm::all_of(OrigMask,
18426                       [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
18427          "Out of bounds shuffle index");
18428 
18429   // We actually see shuffles that are entirely re-arrangements of a set of
18430   // zero inputs. This mostly happens while decomposing complex shuffles into
18431   // simple ones. Directly lower these as a buildvector of zeros.
18432   APInt KnownUndef, KnownZero;
18433   computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);
18434 
18435   APInt Zeroable = KnownUndef | KnownZero;
18436   if (Zeroable.isAllOnesValue())
18437     return getZeroVector(VT, Subtarget, DAG, DL);
18438 
18439   bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
18440 
18441   // Try to collapse shuffles into using a vector type with fewer elements but
18442   // wider element types. We cap this to not form integers or floating point
18443   // elements wider than 64 bits, but it might be interesting to form i128
18444   // integers to handle flipping the low and high halves of AVX 256-bit vectors.
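  // For example (illustrative): a v8i16 mask <0,1,2,3,8,9,10,11> widens to the
  // v4i32 mask <0,1,4,5>.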
18445   SmallVector<int, 16> WidenedMask;
18446   if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
18447       canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
18448     // Shuffle mask widening should not interfere with a broadcast opportunity
18449     // by obfuscating the operands with bitcasts.
18450     // TODO: Avoid lowering directly from this top-level function: make this
18451     // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
18452     if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
18453                                                     Subtarget, DAG))
18454       return Broadcast;
18455 
18456     MVT NewEltVT = VT.isFloatingPoint()
18457                        ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
18458                        : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
18459     int NewNumElts = NumElements / 2;
18460     MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
18461     // Make sure that the new vector type is legal. For example, v2f64 isn't
18462     // legal on SSE1.
18463     if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
18464       if (V2IsZero) {
18465         // Modify the new Mask to take all zeros from the all-zero vector.
18466         // Choose indices that are blend-friendly.
18467         bool UsedZeroVector = false;
18468         assert(is_contained(WidenedMask, SM_SentinelZero) &&
18469                "V2's non-undef elements are used?!");
18470         for (int i = 0; i != NewNumElts; ++i)
18471           if (WidenedMask[i] == SM_SentinelZero) {
18472             WidenedMask[i] = i + NewNumElts;
18473             UsedZeroVector = true;
18474           }
18475         // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
18476         // some elements to be undef.
18477         if (UsedZeroVector)
18478           V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
18479       }
18480       V1 = DAG.getBitcast(NewVT, V1);
18481       V2 = DAG.getBitcast(NewVT, V2);
18482       return DAG.getBitcast(
18483           VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
18484     }
18485   }
18486 
18487   // Commute the shuffle if it will improve canonicalization.
18488   SmallVector<int, 64> Mask(OrigMask.begin(), OrigMask.end());
18489   if (canonicalizeShuffleMaskWithCommute(Mask)) {
18490     ShuffleVectorSDNode::commuteMask(Mask);
18491     std::swap(V1, V2);
18492   }
18493 
18494   // For each vector width, delegate to a specialized lowering routine.
18495   if (VT.is128BitVector())
18496     return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18497 
18498   if (VT.is256BitVector())
18499     return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18500 
18501   if (VT.is512BitVector())
18502     return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18503 
18504   if (Is1BitVector)
18505     return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18506 
18507   llvm_unreachable("Unimplemented!");
18508 }
18509 
18510 /// Try to lower a VSELECT instruction to a vector shuffle.
18511 static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
18512                                            const X86Subtarget &Subtarget,
18513                                            SelectionDAG &DAG) {
18514   SDValue Cond = Op.getOperand(0);
18515   SDValue LHS = Op.getOperand(1);
18516   SDValue RHS = Op.getOperand(2);
18517   MVT VT = Op.getSimpleValueType();
18518 
18519   // Only non-legal VSELECTs reach this lowering; convert those into generic
18520   // shuffles and reuse the shuffle lowering path for blends.
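  // For example (illustrative): a v4i32 VSELECT with the constant condition
  // <-1,0,0,-1> becomes the shuffle mask <0,5,6,3>, where true lanes take LHS
  // elements and false/undef lanes take RHS elements.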
18521   if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
18522     SmallVector<int, 32> Mask;
18523     if (createShuffleMaskFromVSELECT(Mask, Cond))
18524       return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask);
18525   }
18526 
18527   return SDValue();
18528 }
18529 
18530 SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
18531   SDValue Cond = Op.getOperand(0);
18532   SDValue LHS = Op.getOperand(1);
18533   SDValue RHS = Op.getOperand(2);
18534 
18535   // A vselect where all conditions and data are constants can be optimized into
18536   // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
18537   if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
18538       ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
18539       ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
18540     return SDValue();
18541 
18542   // Try to lower this to a blend-style vector shuffle. This can handle all
18543   // constant condition cases.
18544   if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
18545     return BlendOp;
18546 
18547   // If this VSELECT has a vector of i1 as a mask, it will be directly matched
18548   // with patterns on the mask registers on AVX-512.
18549   MVT CondVT = Cond.getSimpleValueType();
18550   unsigned CondEltSize = Cond.getScalarValueSizeInBits();
18551   if (CondEltSize == 1)
18552     return Op;
18553 
18554   // Variable blends are only legal from SSE4.1 onward.
18555   if (!Subtarget.hasSSE41())
18556     return SDValue();
18557 
18558   SDLoc dl(Op);
18559   MVT VT = Op.getSimpleValueType();
18560   unsigned EltSize = VT.getScalarSizeInBits();
18561   unsigned NumElts = VT.getVectorNumElements();
18562 
18563   // Expand v32i16/v64i8 without BWI.
18564   if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
18565     return SDValue();
18566 
18567   // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
18568   // into an i1 condition so that we can use the mask-based 512-bit blend
18569   // instructions.
18570   if (VT.getSizeInBits() == 512) {
18571     // Build a mask by testing the condition against zero.
18572     MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
18573     SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
18574                                 DAG.getConstant(0, dl, CondVT),
18575                                 ISD::SETNE);
18576     // Now return a new VSELECT using the mask.
18577     return DAG.getSelect(dl, VT, Mask, LHS, RHS);
18578   }
18579 
18580   // SEXT/TRUNC cases where the mask doesn't match the destination size.
18581   if (CondEltSize != EltSize) {
18582     // If we don't have a sign splat, rely on the expansion.
18583     if (CondEltSize != DAG.ComputeNumSignBits(Cond))
18584       return SDValue();
18585 
18586     MVT NewCondSVT = MVT::getIntegerVT(EltSize);
18587     MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts);
18588     Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
18589     return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
18590   }
18591 
18592   // Only some types will be legal on some subtargets. If we can emit a legal
18593   // VSELECT-matching blend, return Op; but if we need to expand, return
18594   // a null value.
18595   switch (VT.SimpleTy) {
18596   default:
18597     // Most of the vector types have blends past SSE4.1.
18598     return Op;
18599 
18600   case MVT::v32i8:
18601     // The byte blends for AVX vectors were introduced only in AVX2.
18602     if (Subtarget.hasAVX2())
18603       return Op;
18604 
18605     return SDValue();
18606 
18607   case MVT::v8i16:
18608   case MVT::v16i16: {
18609     // Bitcast everything to the vXi8 type and use a vXi8 vselect.
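    // (Illustrative note) The condition elements are all-zeros or all-ones, so
    // each i16 condition element expands to two identical bytes and a per-byte
    // blend still selects whole i16 elements.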
18610     MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
18611     Cond = DAG.getBitcast(CastVT, Cond);
18612     LHS = DAG.getBitcast(CastVT, LHS);
18613     RHS = DAG.getBitcast(CastVT, RHS);
18614     SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
18615     return DAG.getBitcast(VT, Select);
18616   }
18617   }
18618 }
18619 
18620 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
18621   MVT VT = Op.getSimpleValueType();
18622   SDValue Vec = Op.getOperand(0);
18623   SDValue Idx = Op.getOperand(1);
18624   assert(isa<ConstantSDNode>(Idx) && "Constant index expected");
18625   SDLoc dl(Op);
18626 
18627   if (!Vec.getSimpleValueType().is128BitVector())
18628     return SDValue();
18629 
18630   if (VT.getSizeInBits() == 8) {
18631     // If IdxVal is 0, it's cheaper to do a move instead of a pextrb, unless
18632     // we're going to zero extend the register or fold the store.
18633     if (llvm::isNullConstant(Idx) && !MayFoldIntoZeroExtend(Op) &&
18634         !MayFoldIntoStore(Op))
18635       return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
18636                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18637                                      DAG.getBitcast(MVT::v4i32, Vec), Idx));
18638 
18639     unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
18640     SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec,
18641                                   DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18642     return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
18643   }
18644 
18645   if (VT == MVT::f32) {
18646     // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
18647     // the result back to FR32 register. It's only worth matching if the
18648     // result has a single use which is a store or a bitcast to i32.  And in
18649     // the case of a store, it's not worth it if the index is a constant 0,
18650     // because a MOVSSmr can be used instead, which is smaller and faster.
18651     if (!Op.hasOneUse())
18652       return SDValue();
18653     SDNode *User = *Op.getNode()->use_begin();
18654     if ((User->getOpcode() != ISD::STORE || isNullConstant(Idx)) &&
18655         (User->getOpcode() != ISD::BITCAST ||
18656          User->getValueType(0) != MVT::i32))
18657       return SDValue();
18658     SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18659                                   DAG.getBitcast(MVT::v4i32, Vec), Idx);
18660     return DAG.getBitcast(MVT::f32, Extract);
18661   }
18662 
18663   if (VT == MVT::i32 || VT == MVT::i64)
18664       return Op;
18665 
18666   return SDValue();
18667 }
18668 
18669 /// Extract one bit from a mask vector, such as v16i1 or v8i1.
18670 /// AVX-512 feature.
18671 static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
18672                                         const X86Subtarget &Subtarget) {
18673   SDValue Vec = Op.getOperand(0);
18674   SDLoc dl(Vec);
18675   MVT VecVT = Vec.getSimpleValueType();
18676   SDValue Idx = Op.getOperand(1);
18677   auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
18678   MVT EltVT = Op.getSimpleValueType();
18679 
18680   assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
18681          "Unexpected vector type in ExtractBitFromMaskVector");
18682 
18683   // A variable index can't be handled in mask registers;
18684   // extend the vector to VR512/VR128.
18685   if (!IdxC) {
18686     unsigned NumElts = VecVT.getVectorNumElements();
18687     // Extending v8i1/v16i1 to 512 bits gets better performance on KNL
18688     // than extending to 128/256 bits.
18689     MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
18690     MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
18691     SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
18692     SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
18693     return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
18694   }
18695 
18696   unsigned IdxVal = IdxC->getZExtValue();
18697   if (IdxVal == 0) // the operation is legal
18698     return Op;
18699 
18700   // Extend to natively supported kshift.
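  // For example (illustrative): a v4i1 source is widened here (to v8i1 with
  // DQI, otherwise v16i1); extracting element 5 of a v16i1 then becomes a
  // KSHIFTR by 5 followed by an extract of element 0 below.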
18701   unsigned NumElems = VecVT.getVectorNumElements();
18702   MVT WideVecVT = VecVT;
18703   if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
18704     WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
18705     Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
18706                       DAG.getUNDEF(WideVecVT), Vec,
18707                       DAG.getIntPtrConstant(0, dl));
18708   }
18709 
18710   // Use kshiftr instruction to move to the lower element.
18711   Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
18712                     DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18713 
18714   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
18715                      DAG.getIntPtrConstant(0, dl));
18716 }
18717 
18718 SDValue
18719 X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
18720                                            SelectionDAG &DAG) const {
18721   SDLoc dl(Op);
18722   SDValue Vec = Op.getOperand(0);
18723   MVT VecVT = Vec.getSimpleValueType();
18724   SDValue Idx = Op.getOperand(1);
18725   auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
18726 
18727   if (VecVT.getVectorElementType() == MVT::i1)
18728     return ExtractBitFromMaskVector(Op, DAG, Subtarget);
18729 
18730   if (!IdxC) {
18731     // It's more profitable to go through memory (1 cycle throughput)
18732     // than to use a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
18733     // The IACA tool was used to get the performance estimate
18734     // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
18735     //
18736     // example : extractelement <16 x i8> %a, i32 %i
18737     //
18738     // Block Throughput: 3.00 Cycles
18739     // Throughput Bottleneck: Port5
18740     //
18741     // | Num Of |   Ports pressure in cycles  |    |
18742     // |  Uops  |  0  - DV  |  5  |  6  |  7  |    |
18743     // ---------------------------------------------
18744     // |   1    |           | 1.0 |     |     | CP | vmovd xmm1, edi
18745     // |   1    |           | 1.0 |     |     | CP | vpshufb xmm0, xmm0, xmm1
18746     // |   2    | 1.0       | 1.0 |     |     | CP | vpextrb eax, xmm0, 0x0
18747     // Total Num Of Uops: 4
18748     //
18749     //
18750     // Block Throughput: 1.00 Cycles
18751     // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
18752     //
18753     // |    |  Ports pressure in cycles   |  |
18754     // |Uops| 1 | 2 - D  |3 -  D  | 4 | 5 |  |
18755     // ---------------------------------------------------------
18756     // |2^  |   | 0.5    | 0.5    |1.0|   |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
18757     // |1   |0.5|        |        |   |0.5|  | lea rax, ptr [rsp-0x18]
18758     // |1   |   |0.5, 0.5|0.5, 0.5|   |   |CP| mov al, byte ptr [rdi+rax*1]
18759     // Total Num Of Uops: 4
18760 
18761     return SDValue();
18762   }
18763 
18764   unsigned IdxVal = IdxC->getZExtValue();
18765 
18766   // If this is a 256-bit vector result, first extract the 128-bit vector and
18767   // then extract the element from the 128-bit vector.
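  // For example (illustrative): extracting element 11 of a v16i32 extracts the
  // v4i32 subvector holding elements 8-11 and then extracts element 3
  // (11 & (ElemsPerChunk - 1)) from it.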
18768   if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
18769     // Get the 128-bit vector.
18770     Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
18771     MVT EltVT = VecVT.getVectorElementType();
18772 
18773     unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
18774     assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
18775 
18776     // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
18777     // this can be done with a mask.
18778     IdxVal &= ElemsPerChunk - 1;
18779     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
18780                        DAG.getIntPtrConstant(IdxVal, dl));
18781   }
18782 
18783   assert(VecVT.is128BitVector() && "Unexpected vector length");
18784 
18785   MVT VT = Op.getSimpleValueType();
18786 
18787   if (VT.getSizeInBits() == 16) {
18788     // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
18789     // we're going to zero extend the register or fold the store (SSE41 only).
18790     if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) &&
18791         !(Subtarget.hasSSE41() && MayFoldIntoStore(Op)))
18792       return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
18793                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18794                                      DAG.getBitcast(MVT::v4i32, Vec), Idx));
18795 
18796     SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec,
18797                                   DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18798     return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
18799   }
18800 
18801   if (Subtarget.hasSSE41())
18802     if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
18803       return Res;
18804 
18805   // TODO: We only extract a single element from v16i8, we can probably afford
18806   // to be more aggressive here before using the default approach of spilling to
18807   // stack.
18808   if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
18809     // Extract either the lowest i32 or any i16, and extract the sub-byte.
18810     int DWordIdx = IdxVal / 4;
18811     if (DWordIdx == 0) {
18812       SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18813                                 DAG.getBitcast(MVT::v4i32, Vec),
18814                                 DAG.getIntPtrConstant(DWordIdx, dl));
18815       int ShiftVal = (IdxVal % 4) * 8;
18816       if (ShiftVal != 0)
18817         Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
18818                           DAG.getConstant(ShiftVal, dl, MVT::i8));
18819       return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
18820     }
18821 
18822     int WordIdx = IdxVal / 2;
18823     SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
18824                               DAG.getBitcast(MVT::v8i16, Vec),
18825                               DAG.getIntPtrConstant(WordIdx, dl));
18826     int ShiftVal = (IdxVal % 2) * 8;
18827     if (ShiftVal != 0)
18828       Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
18829                         DAG.getConstant(ShiftVal, dl, MVT::i8));
18830     return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
18831   }
18832 
18833   if (VT.getSizeInBits() == 32) {
18834     if (IdxVal == 0)
18835       return Op;
18836 
18837     // SHUFPS the element to the lowest double word, then movss.
18838     int Mask[4] = { static_cast<int>(IdxVal), -1, -1, -1 };
18839     Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
18840     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
18841                        DAG.getIntPtrConstant(0, dl));
18842   }
18843 
18844   if (VT.getSizeInBits() == 64) {
18845     // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
18846     // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
18847     //        to match extract_elt for f64.
18848     if (IdxVal == 0)
18849       return Op;
18850 
18851     // UNPCKHPD the element to the lowest double word, then movsd.
18852     // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
18853     // to a f64mem, the whole operation is folded into a single MOVHPDmr.
18854     int Mask[2] = { 1, -1 };
18855     Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
18856     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
18857                        DAG.getIntPtrConstant(0, dl));
18858   }
18859 
18860   return SDValue();
18861 }
18862 
18863 /// Insert one bit to mask vector, like v16i1 or v8i1.
18864 /// AVX-512 feature.
InsertBitToMaskVector(SDValue Op,SelectionDAG & DAG,const X86Subtarget & Subtarget)18865 static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
18866                                      const X86Subtarget &Subtarget) {
18867   SDLoc dl(Op);
18868   SDValue Vec = Op.getOperand(0);
18869   SDValue Elt = Op.getOperand(1);
18870   SDValue Idx = Op.getOperand(2);
18871   MVT VecVT = Vec.getSimpleValueType();
18872 
18873   if (!isa<ConstantSDNode>(Idx)) {
18874     // Non constant index. Extend source and destination,
18875     // insert element and then truncate the result.
18876     unsigned NumElts = VecVT.getVectorNumElements();
18877     MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
18878     MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
18879     SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
18880       DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
18881       DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
18882     return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
18883   }
18884 
18885   // Copy into a k-register, extract to v1i1 and insert_subvector.
18886   SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
18887   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, Idx);
18888 }
18889 
LowerINSERT_VECTOR_ELT(SDValue Op,SelectionDAG & DAG) const18890 SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
18891                                                   SelectionDAG &DAG) const {
18892   MVT VT = Op.getSimpleValueType();
18893   MVT EltVT = VT.getVectorElementType();
18894   unsigned NumElts = VT.getVectorNumElements();
18895   unsigned EltSizeInBits = EltVT.getScalarSizeInBits();
18896 
18897   if (EltVT == MVT::i1)
18898     return InsertBitToMaskVector(Op, DAG, Subtarget);
18899 
18900   SDLoc dl(Op);
18901   SDValue N0 = Op.getOperand(0);
18902   SDValue N1 = Op.getOperand(1);
18903   SDValue N2 = Op.getOperand(2);
18904   auto *N2C = dyn_cast<ConstantSDNode>(N2);
18905 
18906   if (!N2C) {
18907     // Variable insertion indices, usually we're better off spilling to stack,
18908     // but AVX512 can use a variable compare+select by comparing against all
18909     // possible vector indices, and FP insertion has less gpr->simd traffic.
18910     if (!(Subtarget.hasBWI() ||
18911           (Subtarget.hasAVX512() && EltSizeInBits >= 32) ||
18912           (Subtarget.hasSSE41() && VT.isFloatingPoint())))
18913       return SDValue();
18914 
18915     MVT IdxSVT = MVT::getIntegerVT(EltSizeInBits);
18916     MVT IdxVT = MVT::getVectorVT(IdxSVT, NumElts);
18917     SDValue IdxExt = DAG.getZExtOrTrunc(N2, dl, IdxSVT);
18918     SDValue IdxSplat = DAG.getSplatBuildVector(IdxVT, dl, IdxExt);
18919     SDValue EltSplat = DAG.getSplatBuildVector(VT, dl, N1);
18920 
18921     SmallVector<SDValue, 16> RawIndices;
18922     for (unsigned I = 0; I != NumElts; ++I)
18923       RawIndices.push_back(DAG.getConstant(I, dl, IdxSVT));
18924     SDValue Indices = DAG.getBuildVector(IdxVT, dl, RawIndices);
18925 
18926     // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
18927     return DAG.getSelectCC(dl, IdxSplat, Indices, EltSplat, N0,
18928                            ISD::CondCode::SETEQ);
18929   }
18930 
18931   if (N2C->getAPIntValue().uge(NumElts))
18932     return SDValue();
18933   uint64_t IdxVal = N2C->getZExtValue();
18934 
18935   bool IsZeroElt = X86::isZeroNode(N1);
18936   bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
18937 
18938   // If we are inserting a element, see if we can do this more efficiently with
18939   // a blend shuffle with a rematerializable vector than a costly integer
18940   // insertion.
18941   if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() &&
18942       (16 <= EltSizeInBits || (IsZeroElt && !VT.is128BitVector()))) {
18943     SmallVector<int, 8> BlendMask;
18944     for (unsigned i = 0; i != NumElts; ++i)
18945       BlendMask.push_back(i == IdxVal ? i + NumElts : i);
18946     SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
18947                                   : getOnesVector(VT, DAG, dl);
18948     return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
18949   }
18950 
18951   // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
18952   // into that, and then insert the subvector back into the result.
18953   if (VT.is256BitVector() || VT.is512BitVector()) {
18954     // With a 256-bit vector, we can insert into the zero element efficiently
18955     // using a blend if we have AVX or AVX2 and the right data type.
18956     if (VT.is256BitVector() && IdxVal == 0) {
18957       // TODO: It is worthwhile to cast integer to floating point and back
18958       // and incur a domain crossing penalty if that's what we'll end up
18959       // doing anyway after extracting to a 128-bit vector.
18960       if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
18961           (Subtarget.hasAVX2() && EltVT == MVT::i32)) {
18962         SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
18963         return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
18964                            DAG.getTargetConstant(1, dl, MVT::i8));
18965       }
18966     }
18967 
18968     // Get the desired 128-bit vector chunk.
18969     SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
18970 
18971     // Insert the element into the desired chunk.
18972     unsigned NumEltsIn128 = 128 / EltSizeInBits;
18973     assert(isPowerOf2_32(NumEltsIn128));
18974     // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
18975     unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
18976 
18977     V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
18978                     DAG.getIntPtrConstant(IdxIn128, dl));
18979 
18980     // Insert the changed part back into the bigger vector
18981     return insert128BitVector(N0, V, IdxVal, DAG, dl);
18982   }
18983   assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
18984 
18985   // This will be just movd/movq/movss/movsd.
18986   if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) {
18987     if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
18988         EltVT == MVT::i64) {
18989       N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
18990       return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
18991     }
18992 
18993     // We can't directly insert an i8 or i16 into a vector, so zero extend
18994     // it to i32 first.
18995     if (EltVT == MVT::i16 || EltVT == MVT::i8) {
18996       N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, N1);
18997       MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
18998       N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, N1);
18999       N1 = getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
19000       return DAG.getBitcast(VT, N1);
19001     }
19002   }
19003 
19004   // Transform it so it match pinsr{b,w} which expects a GR32 as its second
19005   // argument. SSE41 required for pinsrb.
19006   if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
19007     unsigned Opc;
19008     if (VT == MVT::v8i16) {
19009       assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
19010       Opc = X86ISD::PINSRW;
19011     } else {
19012       assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
19013       assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
19014       Opc = X86ISD::PINSRB;
19015     }
19016 
19017     assert(N1.getValueType() != MVT::i32 && "Unexpected VT");
19018     N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
19019     N2 = DAG.getTargetConstant(IdxVal, dl, MVT::i8);
19020     return DAG.getNode(Opc, dl, VT, N0, N1, N2);
19021   }
19022 
19023   if (Subtarget.hasSSE41()) {
19024     if (EltVT == MVT::f32) {
19025       // Bits [7:6] of the constant are the source select. This will always be
19026       //   zero here. The DAG Combiner may combine an extract_elt index into
19027       //   these bits. For example (insert (extract, 3), 2) could be matched by
19028       //   putting the '3' into bits [7:6] of X86ISD::INSERTPS.
19029       // Bits [5:4] of the constant are the destination select. This is the
19030       //   value of the incoming immediate.
19031       // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
19032       //   combine either bitwise AND or insert of float 0.0 to set these bits.
19033 
19034       bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
19035       if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
19036         // If this is an insertion of 32-bits into the low 32-bits of
19037         // a vector, we prefer to generate a blend with immediate rather
19038         // than an insertps. Blends are simpler operations in hardware and so
19039         // will always have equal or better performance than insertps.
19040         // But if optimizing for size and there's a load folding opportunity,
19041         // generate insertps because blendps does not have a 32-bit memory
19042         // operand form.
19043         N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
19044         return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1,
19045                            DAG.getTargetConstant(1, dl, MVT::i8));
19046       }
19047       // Create this as a scalar to vector..
19048       N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
19049       return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
19050                          DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
19051     }
19052 
19053     // PINSR* works with constant index.
19054     if (EltVT == MVT::i32 || EltVT == MVT::i64)
19055       return Op;
19056   }
19057 
19058   return SDValue();
19059 }
19060 
LowerSCALAR_TO_VECTOR(SDValue Op,const X86Subtarget & Subtarget,SelectionDAG & DAG)19061 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
19062                                      SelectionDAG &DAG) {
19063   SDLoc dl(Op);
19064   MVT OpVT = Op.getSimpleValueType();
19065 
19066   // It's always cheaper to replace a xor+movd with xorps and simplifies further
19067   // combines.
19068   if (X86::isZeroNode(Op.getOperand(0)))
19069     return getZeroVector(OpVT, Subtarget, DAG, dl);
19070 
19071   // If this is a 256-bit vector result, first insert into a 128-bit
19072   // vector and then insert into the 256-bit vector.
19073   if (!OpVT.is128BitVector()) {
19074     // Insert into a 128-bit vector.
19075     unsigned SizeFactor = OpVT.getSizeInBits() / 128;
19076     MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
19077                                  OpVT.getVectorNumElements() / SizeFactor);
19078 
19079     Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
19080 
19081     // Insert the 128-bit vector.
19082     return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
19083   }
19084   assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&
19085          "Expected an SSE type!");
19086 
19087   // Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen.
19088   if (OpVT == MVT::v4i32)
19089     return Op;
19090 
19091   SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
19092   return DAG.getBitcast(
19093       OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
19094 }
19095 
19096 // Lower a node with an INSERT_SUBVECTOR opcode.  This may result in a
19097 // simple superregister reference or explicit instructions to insert
19098 // the upper bits of a vector.
LowerINSERT_SUBVECTOR(SDValue Op,const X86Subtarget & Subtarget,SelectionDAG & DAG)19099 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
19100                                      SelectionDAG &DAG) {
19101   assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
19102 
19103   return insert1BitVector(Op, DAG, Subtarget);
19104 }
19105 
LowerEXTRACT_SUBVECTOR(SDValue Op,const X86Subtarget & Subtarget,SelectionDAG & DAG)19106 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
19107                                       SelectionDAG &DAG) {
19108   assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
19109          "Only vXi1 extract_subvectors need custom lowering");
19110 
19111   SDLoc dl(Op);
19112   SDValue Vec = Op.getOperand(0);
19113   uint64_t IdxVal = Op.getConstantOperandVal(1);
19114 
19115   if (IdxVal == 0) // the operation is legal
19116     return Op;
19117 
19118   MVT VecVT = Vec.getSimpleValueType();
19119   unsigned NumElems = VecVT.getVectorNumElements();
19120 
19121   // Extend to natively supported kshift.
19122   MVT WideVecVT = VecVT;
19123   if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
19124     WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
19125     Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
19126                       DAG.getUNDEF(WideVecVT), Vec,
19127                       DAG.getIntPtrConstant(0, dl));
19128   }
19129 
19130   // Shift to the LSB.
19131   Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
19132                     DAG.getTargetConstant(IdxVal, dl, MVT::i8));
19133 
19134   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
19135                      DAG.getIntPtrConstant(0, dl));
19136 }
19137 
19138 // Returns the appropriate wrapper opcode for a global reference.
getGlobalWrapperKind(const GlobalValue * GV,const unsigned char OpFlags) const19139 unsigned X86TargetLowering::getGlobalWrapperKind(
19140     const GlobalValue *GV, const unsigned char OpFlags) const {
19141   // References to absolute symbols are never PC-relative.
19142   if (GV && GV->isAbsoluteSymbolRef())
19143     return X86ISD::Wrapper;
19144 
19145   CodeModel::Model M = getTargetMachine().getCodeModel();
19146   if (Subtarget.isPICStyleRIPRel() &&
19147       (M == CodeModel::Small || M == CodeModel::Kernel))
19148     return X86ISD::WrapperRIP;
19149 
19150   // GOTPCREL references must always use RIP.
19151   if (OpFlags == X86II::MO_GOTPCREL)
19152     return X86ISD::WrapperRIP;
19153 
19154   return X86ISD::Wrapper;
19155 }
19156 
19157 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
19158 // their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
19159 // one of the above mentioned nodes. It has to be wrapped because otherwise
19160 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
19161 // be used to form addressing mode. These wrapped nodes will be selected
19162 // into MOV32ri.
19163 SDValue
LowerConstantPool(SDValue Op,SelectionDAG & DAG) const19164 X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
19165   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
19166 
19167   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19168   // global base reg.
19169   unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
19170 
19171   auto PtrVT = getPointerTy(DAG.getDataLayout());
19172   SDValue Result = DAG.getTargetConstantPool(
19173       CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag);
19174   SDLoc DL(CP);
19175   Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
19176   // With PIC, the address is actually $g + Offset.
19177   if (OpFlag) {
19178     Result =
19179         DAG.getNode(ISD::ADD, DL, PtrVT,
19180                     DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
19181   }
19182 
19183   return Result;
19184 }
19185 
LowerJumpTable(SDValue Op,SelectionDAG & DAG) const19186 SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
19187   JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
19188 
19189   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19190   // global base reg.
19191   unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
19192 
19193   auto PtrVT = getPointerTy(DAG.getDataLayout());
19194   SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
19195   SDLoc DL(JT);
19196   Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
19197 
19198   // With PIC, the address is actually $g + Offset.
19199   if (OpFlag)
19200     Result =
19201         DAG.getNode(ISD::ADD, DL, PtrVT,
19202                     DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
19203 
19204   return Result;
19205 }
19206 
LowerExternalSymbol(SDValue Op,SelectionDAG & DAG) const19207 SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
19208                                                SelectionDAG &DAG) const {
19209   return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
19210 }
19211 
19212 SDValue
LowerBlockAddress(SDValue Op,SelectionDAG & DAG) const19213 X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
19214   // Create the TargetBlockAddressAddress node.
19215   unsigned char OpFlags =
19216     Subtarget.classifyBlockAddressReference();
19217   const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
19218   int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
19219   SDLoc dl(Op);
19220   auto PtrVT = getPointerTy(DAG.getDataLayout());
19221   SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
19222   Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result);
19223 
19224   // With PIC, the address is actually $g + Offset.
19225   if (isGlobalRelativeToPICBase(OpFlags)) {
19226     Result = DAG.getNode(ISD::ADD, dl, PtrVT,
19227                          DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
19228   }
19229 
19230   return Result;
19231 }
19232 
19233 /// Creates target global address or external symbol nodes for calls or
19234 /// other uses.
LowerGlobalOrExternal(SDValue Op,SelectionDAG & DAG,bool ForCall) const19235 SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
19236                                                  bool ForCall) const {
19237   // Unpack the global address or external symbol.
19238   const SDLoc &dl = SDLoc(Op);
19239   const GlobalValue *GV = nullptr;
19240   int64_t Offset = 0;
19241   const char *ExternalSym = nullptr;
19242   if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
19243     GV = G->getGlobal();
19244     Offset = G->getOffset();
19245   } else {
19246     const auto *ES = cast<ExternalSymbolSDNode>(Op);
19247     ExternalSym = ES->getSymbol();
19248   }
19249 
19250   // Calculate some flags for address lowering.
19251   const Module &Mod = *DAG.getMachineFunction().getFunction().getParent();
19252   unsigned char OpFlags;
19253   if (ForCall)
19254     OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
19255   else
19256     OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
19257   bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
19258   bool NeedsLoad = isGlobalStubReference(OpFlags);
19259 
19260   CodeModel::Model M = DAG.getTarget().getCodeModel();
19261   auto PtrVT = getPointerTy(DAG.getDataLayout());
19262   SDValue Result;
19263 
19264   if (GV) {
19265     // Create a target global address if this is a global. If possible, fold the
19266     // offset into the global address reference. Otherwise, ADD it on later.
19267     // Suppress the folding if Offset is negative: movl foo-1, %eax is not
19268     // allowed because if the address of foo is 0, the ELF R_X86_64_32
19269     // relocation will compute to a negative value, which is invalid.
19270     int64_t GlobalOffset = 0;
19271     if (OpFlags == X86II::MO_NO_FLAG && Offset >= 0 &&
19272         X86::isOffsetSuitableForCodeModel(Offset, M, true)) {
19273       std::swap(GlobalOffset, Offset);
19274     }
19275     Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
19276   } else {
19277     // If this is not a global address, this must be an external symbol.
19278     Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags);
19279   }
19280 
19281   // If this is a direct call, avoid the wrapper if we don't need to do any
19282   // loads or adds. This allows SDAG ISel to match direct calls.
19283   if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
19284     return Result;
19285 
19286   Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
19287 
19288   // With PIC, the address is actually $g + Offset.
19289   if (HasPICReg) {
19290     Result = DAG.getNode(ISD::ADD, dl, PtrVT,
19291                          DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
19292   }
19293 
19294   // For globals that require a load from a stub to get the address, emit the
19295   // load.
19296   if (NeedsLoad)
19297     Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
19298                          MachinePointerInfo::getGOT(DAG.getMachineFunction()));
19299 
19300   // If there was a non-zero offset that we didn't fold, create an explicit
19301   // addition for it.
19302   if (Offset != 0)
19303     Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
19304                          DAG.getConstant(Offset, dl, PtrVT));
19305 
19306   return Result;
19307 }
19308 
19309 SDValue
LowerGlobalAddress(SDValue Op,SelectionDAG & DAG) const19310 X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
19311   return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
19312 }
19313 
19314 static SDValue
GetTLSADDR(SelectionDAG & DAG,SDValue Chain,GlobalAddressSDNode * GA,SDValue * InFlag,const EVT PtrVT,unsigned ReturnReg,unsigned char OperandFlags,bool LocalDynamic=false)19315 GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
19316            SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
19317            unsigned char OperandFlags, bool LocalDynamic = false) {
19318   MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
19319   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
19320   SDLoc dl(GA);
19321   SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
19322                                            GA->getValueType(0),
19323                                            GA->getOffset(),
19324                                            OperandFlags);
19325 
19326   X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
19327                                            : X86ISD::TLSADDR;
19328 
19329   if (InFlag) {
19330     SDValue Ops[] = { Chain,  TGA, *InFlag };
19331     Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
19332   } else {
19333     SDValue Ops[]  = { Chain, TGA };
19334     Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
19335   }
19336 
19337   // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
19338   MFI.setAdjustsStack(true);
19339   MFI.setHasCalls(true);
19340 
19341   SDValue Flag = Chain.getValue(1);
19342   return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
19343 }
19344 
19345 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
19346 static SDValue
LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode * GA,SelectionDAG & DAG,const EVT PtrVT)19347 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19348                                 const EVT PtrVT) {
19349   SDValue InFlag;
19350   SDLoc dl(GA);  // ? function entry point might be better
19351   SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
19352                                    DAG.getNode(X86ISD::GlobalBaseReg,
19353                                                SDLoc(), PtrVT), InFlag);
19354   InFlag = Chain.getValue(1);
19355 
19356   return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
19357 }
19358 
19359 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit LP64
19360 static SDValue
LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode * GA,SelectionDAG & DAG,const EVT PtrVT)19361 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19362                                 const EVT PtrVT) {
19363   return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
19364                     X86::RAX, X86II::MO_TLSGD);
19365 }
19366 
19367 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit ILP32
19368 static SDValue
LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode * GA,SelectionDAG & DAG,const EVT PtrVT)19369 LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19370                                  const EVT PtrVT) {
19371   return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
19372                     X86::EAX, X86II::MO_TLSGD);
19373 }
19374 
LowerToTLSLocalDynamicModel(GlobalAddressSDNode * GA,SelectionDAG & DAG,const EVT PtrVT,bool Is64Bit,bool Is64BitLP64)19375 static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
19376                                            SelectionDAG &DAG, const EVT PtrVT,
19377                                            bool Is64Bit, bool Is64BitLP64) {
19378   SDLoc dl(GA);
19379 
19380   // Get the start address of the TLS block for this module.
19381   X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
19382       .getInfo<X86MachineFunctionInfo>();
19383   MFI->incNumLocalDynamicTLSAccesses();
19384 
19385   SDValue Base;
19386   if (Is64Bit) {
19387     unsigned ReturnReg = Is64BitLP64 ? X86::RAX : X86::EAX;
19388     Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, ReturnReg,
19389                       X86II::MO_TLSLD, /*LocalDynamic=*/true);
19390   } else {
19391     SDValue InFlag;
19392     SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
19393         DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
19394     InFlag = Chain.getValue(1);
19395     Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
19396                       X86II::MO_TLSLDM, /*LocalDynamic=*/true);
19397   }
19398 
19399   // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
19400   // of Base.
19401 
19402   // Build x@dtpoff.
19403   unsigned char OperandFlags = X86II::MO_DTPOFF;
19404   unsigned WrapperKind = X86ISD::Wrapper;
19405   SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
19406                                            GA->getValueType(0),
19407                                            GA->getOffset(), OperandFlags);
19408   SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
19409 
19410   // Add x@dtpoff with the base.
19411   return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
19412 }
19413 
19414 // Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
LowerToTLSExecModel(GlobalAddressSDNode * GA,SelectionDAG & DAG,const EVT PtrVT,TLSModel::Model model,bool is64Bit,bool isPIC)19415 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19416                                    const EVT PtrVT, TLSModel::Model model,
19417                                    bool is64Bit, bool isPIC) {
19418   SDLoc dl(GA);
19419 
19420   // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
19421   Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
19422                                                          is64Bit ? 257 : 256));
19423 
19424   SDValue ThreadPointer =
19425       DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
19426                   MachinePointerInfo(Ptr));
19427 
19428   unsigned char OperandFlags = 0;
19429   // Most TLS accesses are not RIP relative, even on x86-64.  One exception is
19430   // initialexec.
19431   unsigned WrapperKind = X86ISD::Wrapper;
19432   if (model == TLSModel::LocalExec) {
19433     OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
19434   } else if (model == TLSModel::InitialExec) {
19435     if (is64Bit) {
19436       OperandFlags = X86II::MO_GOTTPOFF;
19437       WrapperKind = X86ISD::WrapperRIP;
19438     } else {
19439       OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
19440     }
19441   } else {
19442     llvm_unreachable("Unexpected model");
19443   }
19444 
19445   // emit "addl x@ntpoff,%eax" (local exec)
19446   // or "addl x@indntpoff,%eax" (initial exec)
19447   // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
19448   SDValue TGA =
19449       DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
19450                                  GA->getOffset(), OperandFlags);
19451   SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
19452 
19453   if (model == TLSModel::InitialExec) {
19454     if (isPIC && !is64Bit) {
19455       Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
19456                            DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
19457                            Offset);
19458     }
19459 
19460     Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
19461                          MachinePointerInfo::getGOT(DAG.getMachineFunction()));
19462   }
19463 
19464   // The address of the thread local variable is the add of the thread
19465   // pointer with the offset of the variable.
19466   return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
19467 }
19468 
19469 SDValue
LowerGlobalTLSAddress(SDValue Op,SelectionDAG & DAG) const19470 X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
19471 
19472   GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
19473 
19474   if (DAG.getTarget().useEmulatedTLS())
19475     return LowerToTLSEmulatedModel(GA, DAG);
19476 
19477   const GlobalValue *GV = GA->getGlobal();
19478   auto PtrVT = getPointerTy(DAG.getDataLayout());
19479   bool PositionIndependent = isPositionIndependent();
19480 
19481   if (Subtarget.isTargetELF()) {
19482     TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
19483     switch (model) {
19484       case TLSModel::GeneralDynamic:
19485         if (Subtarget.is64Bit()) {
19486           if (Subtarget.isTarget64BitLP64())
19487             return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
19488           return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT);
19489         }
19490         return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
19491       case TLSModel::LocalDynamic:
19492         return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(),
19493                                            Subtarget.isTarget64BitLP64());
19494       case TLSModel::InitialExec:
19495       case TLSModel::LocalExec:
19496         return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
19497                                    PositionIndependent);
19498     }
19499     llvm_unreachable("Unknown TLS model.");
19500   }
19501 
19502   if (Subtarget.isTargetDarwin()) {
19503     // Darwin only has one model of TLS.  Lower to that.
19504     unsigned char OpFlag = 0;
19505     unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
19506                            X86ISD::WrapperRIP : X86ISD::Wrapper;
19507 
19508     // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19509     // global base reg.
19510     bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
19511     if (PIC32)
19512       OpFlag = X86II::MO_TLVP_PIC_BASE;
19513     else
19514       OpFlag = X86II::MO_TLVP;
19515     SDLoc DL(Op);
19516     SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
19517                                                 GA->getValueType(0),
19518                                                 GA->getOffset(), OpFlag);
19519     SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
19520 
19521     // With PIC32, the address is actually $g + Offset.
19522     if (PIC32)
19523       Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
19524                            DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
19525                            Offset);
19526 
19527     // Lowering the machine isd will make sure everything is in the right
19528     // location.
19529     SDValue Chain = DAG.getEntryNode();
19530     SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
19531     Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
19532     SDValue Args[] = { Chain, Offset };
19533     Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
19534     Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
19535                                DAG.getIntPtrConstant(0, DL, true),
19536                                Chain.getValue(1), DL);
19537 
19538     // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
19539     MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
19540     MFI.setAdjustsStack(true);
19541 
19542     // And our return value (tls address) is in the standard call return value
19543     // location.
19544     unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
19545     return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
19546   }
19547 
19548   if (Subtarget.isOSWindows()) {
19549     // Just use the implicit TLS architecture
19550     // Need to generate something similar to:
19551     //   mov     rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
19552     //                                  ; from TEB
19553     //   mov     ecx, dword [rel _tls_index]: Load index (from C runtime)
19554     //   mov     rcx, qword [rdx+rcx*8]
19555     //   mov     eax, .tls$:tlsvar
19556     //   [rax+rcx] contains the address
19557     // Windows 64bit: gs:0x58
19558     // Windows 32bit: fs:__tls_array
19559 
19560     SDLoc dl(GA);
19561     SDValue Chain = DAG.getEntryNode();
19562 
19563     // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
19564     // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
19565     // use its literal value of 0x2C.
19566     Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
19567                                         ? Type::getInt8PtrTy(*DAG.getContext(),
19568                                                              256)
19569                                         : Type::getInt32PtrTy(*DAG.getContext(),
19570                                                               257));
19571 
19572     SDValue TlsArray = Subtarget.is64Bit()
19573                            ? DAG.getIntPtrConstant(0x58, dl)
19574                            : (Subtarget.isTargetWindowsGNU()
19575                                   ? DAG.getIntPtrConstant(0x2C, dl)
19576                                   : DAG.getExternalSymbol("_tls_array", PtrVT));
19577 
19578     SDValue ThreadPointer =
19579         DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
19580 
19581     SDValue res;
19582     if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
19583       res = ThreadPointer;
19584     } else {
19585       // Load the _tls_index variable
19586       SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
19587       if (Subtarget.is64Bit())
19588         IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
19589                              MachinePointerInfo(), MVT::i32);
19590       else
19591         IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
19592 
19593       const DataLayout &DL = DAG.getDataLayout();
19594       SDValue Scale =
19595           DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
19596       IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
19597 
19598       res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
19599     }
19600 
19601     res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
19602 
19603     // Get the offset of start of .tls section
19604     SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
19605                                              GA->getValueType(0),
19606                                              GA->getOffset(), X86II::MO_SECREL);
19607     SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
19608 
19609     // The address of the thread local variable is the add of the thread
19610     // pointer with the offset of the variable.
19611     return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
19612   }
19613 
19614   llvm_unreachable("TLS not implemented for this target.");
19615 }
19616 
19617 /// Lower SRA_PARTS and friends, which return two i32 values
19618 /// and take a 2 x i32 value to shift plus a shift amount.
19619 /// TODO: Can this be moved to general expansion code?
LowerShiftParts(SDValue Op,SelectionDAG & DAG)19620 static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
19621   SDValue Lo, Hi;
19622   DAG.getTargetLoweringInfo().expandShiftParts(Op.getNode(), Lo, Hi, DAG);
19623   return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
19624 }
19625 
LowerFunnelShift(SDValue Op,const X86Subtarget & Subtarget,SelectionDAG & DAG)19626 static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
19627                                 SelectionDAG &DAG) {
19628   MVT VT = Op.getSimpleValueType();
19629   assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) &&
19630          "Unexpected funnel shift opcode!");
19631 
19632   SDLoc DL(Op);
19633   SDValue Op0 = Op.getOperand(0);
19634   SDValue Op1 = Op.getOperand(1);
19635   SDValue Amt = Op.getOperand(2);
19636 
19637   bool IsFSHR = Op.getOpcode() == ISD::FSHR;
19638 
19639   if (VT.isVector()) {
19640     assert(Subtarget.hasVBMI2() && "Expected VBMI2");
19641 
19642     if (IsFSHR)
19643       std::swap(Op0, Op1);
19644 
19645     // With AVX512, but not VLX we need to widen to get a 512-bit result type.
19646     if (!Subtarget.hasVLX() && !VT.is512BitVector()) {
19647       Op0 = widenSubVector(Op0, false, Subtarget, DAG, DL, 512);
19648       Op1 = widenSubVector(Op1, false, Subtarget, DAG, DL, 512);
19649     }
19650 
19651     SDValue Funnel;
19652     APInt APIntShiftAmt;
19653     MVT ResultVT = Op0.getSimpleValueType();
19654     if (X86::isConstantSplat(Amt, APIntShiftAmt)) {
19655       uint64_t ShiftAmt = APIntShiftAmt.urem(VT.getScalarSizeInBits());
19656       Funnel =
19657           DAG.getNode(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, ResultVT, Op0,
19658                       Op1, DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
19659     } else {
19660       if (!Subtarget.hasVLX() && !VT.is512BitVector())
19661         Amt = widenSubVector(Amt, false, Subtarget, DAG, DL, 512);
19662       Funnel = DAG.getNode(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL,
19663                            ResultVT, Op0, Op1, Amt);
19664     }
19665     if (!Subtarget.hasVLX() && !VT.is512BitVector())
19666       Funnel = extractSubVector(Funnel, 0, DAG, DL, VT.getSizeInBits());
19667     return Funnel;
19668   }
19669   assert(
19670       (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
19671       "Unexpected funnel shift type!");
19672 
19673   // Expand slow SHLD/SHRD cases if we are not optimizing for size.
19674   bool OptForSize = DAG.shouldOptForSize();
19675   bool ExpandFunnel = !OptForSize && Subtarget.isSHLDSlow();
19676 
19677   // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
19678   // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
19679   if ((VT == MVT::i8 || (ExpandFunnel && VT == MVT::i16)) &&
19680       !isa<ConstantSDNode>(Amt)) {
19681     unsigned EltSizeInBits = VT.getScalarSizeInBits();
19682     SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, Amt.getValueType());
19683     SDValue HiShift = DAG.getConstant(EltSizeInBits, DL, Amt.getValueType());
19684     Op0 = DAG.getAnyExtOrTrunc(Op0, DL, MVT::i32);
19685     Op1 = DAG.getZExtOrTrunc(Op1, DL, MVT::i32);
19686     Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, Mask);
19687     SDValue Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Op0, HiShift);
19688     Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, Op1);
19689     if (IsFSHR) {
19690       Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, Amt);
19691     } else {
19692       Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Res, Amt);
19693       Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, HiShift);
19694     }
19695     return DAG.getZExtOrTrunc(Res, DL, VT);
19696   }
19697 
19698   if (VT == MVT::i8 || ExpandFunnel)
19699     return SDValue();
19700 
19701   // i16 needs to modulo the shift amount, but i32/i64 have implicit modulo.
19702   if (VT == MVT::i16) {
19703     Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
19704                       DAG.getConstant(15, DL, Amt.getValueType()));
19705     unsigned FSHOp = (IsFSHR ? X86ISD::FSHR : X86ISD::FSHL);
19706     return DAG.getNode(FSHOp, DL, VT, Op0, Op1, Amt);
19707   }
19708 
19709   return Op;
19710 }
19711 
19712 // Try to use a packed vector operation to handle i64 on 32-bit targets when
19713 // AVX512DQ is enabled.
LowerI64IntToFP_AVX512DQ(SDValue Op,SelectionDAG & DAG,const X86Subtarget & Subtarget)19714 static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG,
19715                                         const X86Subtarget &Subtarget) {
19716   assert((Op.getOpcode() == ISD::SINT_TO_FP ||
19717           Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
19718           Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
19719           Op.getOpcode() == ISD::UINT_TO_FP) &&
19720          "Unexpected opcode!");
19721   bool IsStrict = Op->isStrictFPOpcode();
19722   unsigned OpNo = IsStrict ? 1 : 0;
19723   SDValue Src = Op.getOperand(OpNo);
19724   MVT SrcVT = Src.getSimpleValueType();
19725   MVT VT = Op.getSimpleValueType();
19726 
19727    if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
19728        (VT != MVT::f32 && VT != MVT::f64))
19729     return SDValue();
19730 
19731   // Pack the i64 into a vector, do the operation and extract.
19732 
19733   // Using 256-bit to ensure result is 128-bits for f32 case.
19734   unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
19735   MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
19736   MVT VecVT = MVT::getVectorVT(VT, NumElts);
19737 
19738   SDLoc dl(Op);
19739   SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
19740   if (IsStrict) {
19741     SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other},
19742                                  {Op.getOperand(0), InVec});
19743     SDValue Chain = CvtVec.getValue(1);
19744     SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19745                                 DAG.getIntPtrConstant(0, dl));
19746     return DAG.getMergeValues({Value, Chain}, dl);
19747   }
19748 
19749   SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
19750 
19751   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19752                      DAG.getIntPtrConstant(0, dl));
19753 }
19754 
useVectorCast(unsigned Opcode,MVT FromVT,MVT ToVT,const X86Subtarget & Subtarget)19755 static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
19756                           const X86Subtarget &Subtarget) {
19757   switch (Opcode) {
19758     case ISD::SINT_TO_FP:
19759       // TODO: Handle wider types with AVX/AVX512.
19760       if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
19761         return false;
19762       // CVTDQ2PS or (V)CVTDQ2PD
19763       return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
19764 
19765     case ISD::UINT_TO_FP:
19766       // TODO: Handle wider types and i64 elements.
19767       if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
19768         return false;
19769       // VCVTUDQ2PS or VCVTUDQ2PD
19770       return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
19771 
19772     default:
19773       return false;
19774   }
19775 }
19776 
19777 /// Given a scalar cast operation that is extracted from a vector, try to
19778 /// vectorize the cast op followed by extraction. This will avoid an expensive
19779 /// round-trip between XMM and GPR.
vectorizeExtractedCast(SDValue Cast,SelectionDAG & DAG,const X86Subtarget & Subtarget)19780 static SDValue vectorizeExtractedCast(SDValue Cast, SelectionDAG &DAG,
19781                                       const X86Subtarget &Subtarget) {
19782   // TODO: This could be enhanced to handle smaller integer types by peeking
19783   // through an extend.
19784   SDValue Extract = Cast.getOperand(0);
19785   MVT DestVT = Cast.getSimpleValueType();
19786   if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
19787       !isa<ConstantSDNode>(Extract.getOperand(1)))
19788     return SDValue();
19789 
19790   // See if we have a 128-bit vector cast op for this type of cast.
19791   SDValue VecOp = Extract.getOperand(0);
19792   MVT FromVT = VecOp.getSimpleValueType();
19793   unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
19794   MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM);
19795   MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
19796   if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
19797     return SDValue();
19798 
19799   // If we are extracting from a non-zero element, first shuffle the source
19800   // vector to allow extracting from element zero.
19801   SDLoc DL(Cast);
19802   if (!isNullConstant(Extract.getOperand(1))) {
19803     SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
19804     Mask[0] = Extract.getConstantOperandVal(1);
19805     VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
19806   }
19807   // If the source vector is wider than 128-bits, extract the low part. Do not
19808   // create an unnecessarily wide vector cast op.
19809   if (FromVT != Vec128VT)
19810     VecOp = extract128BitVector(VecOp, 0, DAG, DL);
19811 
19812   // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
19813   // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
19814   SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
19815   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
19816                      DAG.getIntPtrConstant(0, DL));
19817 }
19818 
19819 /// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc),
19820 /// try to vectorize the cast ops. This will avoid an expensive round-trip
19821 /// between XMM and GPR.
lowerFPToIntToFP(SDValue CastToFP,SelectionDAG & DAG,const X86Subtarget & Subtarget)19822 static SDValue lowerFPToIntToFP(SDValue CastToFP, SelectionDAG &DAG,
19823                                 const X86Subtarget &Subtarget) {
19824   // TODO: Allow FP_TO_UINT.
19825   SDValue CastToInt = CastToFP.getOperand(0);
19826   MVT VT = CastToFP.getSimpleValueType();
19827   if (CastToInt.getOpcode() != ISD::FP_TO_SINT || VT.isVector())
19828     return SDValue();
19829 
19830   MVT IntVT = CastToInt.getSimpleValueType();
19831   SDValue X = CastToInt.getOperand(0);
19832   MVT SrcVT = X.getSimpleValueType();
19833   if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
19834     return SDValue();
19835 
19836   // See if we have 128-bit vector cast instructions for this type of cast.
19837   // We need cvttps2dq/cvttpd2dq and cvtdq2ps/cvtdq2pd.
19838   if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
19839       IntVT != MVT::i32)
19840     return SDValue();
19841 
19842   unsigned SrcSize = SrcVT.getSizeInBits();
19843   unsigned IntSize = IntVT.getSizeInBits();
19844   unsigned VTSize = VT.getSizeInBits();
19845   MVT VecSrcVT = MVT::getVectorVT(SrcVT, 128 / SrcSize);
19846   MVT VecIntVT = MVT::getVectorVT(IntVT, 128 / IntSize);
19847   MVT VecVT = MVT::getVectorVT(VT, 128 / VTSize);
19848 
19849   // We need target-specific opcodes if this is v2f64 -> v4i32 -> v2f64.
19850   unsigned ToIntOpcode =
19851       SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;
19852   unsigned ToFPOpcode =
19853       IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;
19854 
19855   // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
19856   //
19857   // We are not defining the high elements (for example, zero them) because
19858   // that could nullify any performance advantage that we hoped to gain from
19859   // this vector op hack. We do not expect any adverse effects (like denorm
19860   // penalties) with cast ops.
19861   SDLoc DL(CastToFP);
19862   SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
19863   SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X);
19864   SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX);
19865   SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt);
19866   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);
19867 }
19868 
lowerINT_TO_FP_vXi64(SDValue Op,SelectionDAG & DAG,const X86Subtarget & Subtarget)19869 static SDValue lowerINT_TO_FP_vXi64(SDValue Op, SelectionDAG &DAG,
19870                                     const X86Subtarget &Subtarget) {
19871   SDLoc DL(Op);
19872   bool IsStrict = Op->isStrictFPOpcode();
19873   MVT VT = Op->getSimpleValueType(0);
19874   SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
19875 
19876   if (Subtarget.hasDQI()) {
19877     assert(!Subtarget.hasVLX() && "Unexpected features");
19878 
19879     assert((Src.getSimpleValueType() == MVT::v2i64 ||
19880             Src.getSimpleValueType() == MVT::v4i64) &&
19881            "Unsupported custom type");
19882 
19883     // With AVX512DQ, but not VLX we need to widen to get a 512-bit result type.
19884     assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
19885            "Unexpected VT!");
19886     MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
19887 
19888     // Need to concat with zero vector for strict fp to avoid spurious
19889     // exceptions.
19890     SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64)
19891                            : DAG.getUNDEF(MVT::v8i64);
19892     Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src,
19893                       DAG.getIntPtrConstant(0, DL));
19894     SDValue Res, Chain;
19895     if (IsStrict) {
19896       Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
19897                         {Op->getOperand(0), Src});
19898       Chain = Res.getValue(1);
19899     } else {
19900       Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
19901     }
19902 
19903     Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
19904                       DAG.getIntPtrConstant(0, DL));
19905 
19906     if (IsStrict)
19907       return DAG.getMergeValues({Res, Chain}, DL);
19908     return Res;
19909   }
19910 
19911   bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
19912                   Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
19913   if (VT != MVT::v4f32 || IsSigned)
19914     return SDValue();
19915 
19916   SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
19917   SDValue One  = DAG.getConstant(1, DL, MVT::v4i64);
19918   SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
19919                              DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
19920                              DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
19921   SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT);
19922   SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src);
19923   SmallVector<SDValue, 4> SignCvts(4);
19924   SmallVector<SDValue, 4> Chains(4);
19925   for (int i = 0; i != 4; ++i) {
19926     SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
19927                               DAG.getIntPtrConstant(i, DL));
19928     if (IsStrict) {
19929       SignCvts[i] =
19930           DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
19931                       {Op.getOperand(0), Elt});
19932       Chains[i] = SignCvts[i].getValue(1);
19933     } else {
19934       SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Elt);
19935     }
19936   }
19937   SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);
19938 
19939   SDValue Slow, Chain;
19940   if (IsStrict) {
19941     Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
19942     Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
19943                        {Chain, SignCvt, SignCvt});
19944     Chain = Slow.getValue(1);
19945   } else {
19946     Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
19947   }
19948 
19949   IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
19950   SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt);
19951 
19952   if (IsStrict)
19953     return DAG.getMergeValues({Cvt, Chain}, DL);
19954 
19955   return Cvt;
19956 }
19957 
LowerSINT_TO_FP(SDValue Op,SelectionDAG & DAG) const19958 SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
19959                                            SelectionDAG &DAG) const {
19960   bool IsStrict = Op->isStrictFPOpcode();
19961   unsigned OpNo = IsStrict ? 1 : 0;
19962   SDValue Src = Op.getOperand(OpNo);
19963   SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
19964   MVT SrcVT = Src.getSimpleValueType();
19965   MVT VT = Op.getSimpleValueType();
19966   SDLoc dl(Op);
19967 
19968   if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
19969     return Extract;
19970 
19971   if (SDValue R = lowerFPToIntToFP(Op, DAG, Subtarget))
19972     return R;
19973 
19974   if (SrcVT.isVector()) {
19975     if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
19976       // Note: Since v2f64 is a legal type, we don't need to zero extend the
19977       // source for strict FP.
19978       if (IsStrict)
19979         return DAG.getNode(
19980             X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
19981             {Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
19982                                 DAG.getUNDEF(SrcVT))});
19983       return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
19984                          DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
19985                                      DAG.getUNDEF(SrcVT)));
19986     }
19987     if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
19988       return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
19989 
19990     return SDValue();
19991   }
19992 
19993   assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
19994          "Unknown SINT_TO_FP to lower!");
19995 
19996   bool UseSSEReg = isScalarFPTypeInSSEReg(VT);
19997 
19998   // These are really Legal; return the operand so the caller accepts it as
19999   // Legal.
20000   if (SrcVT == MVT::i32 && UseSSEReg)
20001     return Op;
20002   if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
20003     return Op;
20004 
20005   if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
20006     return V;
20007 
20008   // SSE doesn't have an i16 conversion so we need to promote.
20009   if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
20010     SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src);
20011     if (IsStrict)
20012       return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
20013                          {Chain, Ext});
20014 
20015     return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext);
20016   }
20017 
20018   if (VT == MVT::f128)
20019     return SDValue();
20020 
20021   SDValue ValueToStore = Src;
20022   if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
20023     // Bitcasting to f64 here allows us to do a single 64-bit store from
20024     // an SSE register, avoiding the store forwarding penalty that would come
20025     // with two 32-bit stores.
20026     ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
20027 
20028   unsigned Size = SrcVT.getStoreSize();
20029   Align Alignment(Size);
20030   MachineFunction &MF = DAG.getMachineFunction();
20031   auto PtrVT = getPointerTy(MF.getDataLayout());
20032   int SSFI = MF.getFrameInfo().CreateStackObject(Size, Alignment, false);
20033   MachinePointerInfo MPI =
20034       MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
20035   SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20036   Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment);
20037   std::pair<SDValue, SDValue> Tmp =
20038       BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG);
20039 
20040   if (IsStrict)
20041     return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
20042 
20043   return Tmp.first;
20044 }
20045 
20046 std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(
20047     EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer,
20048     MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const {
20049   // Build the FILD
20050   SDVTList Tys;
20051   bool useSSE = isScalarFPTypeInSSEReg(DstVT);
20052   if (useSSE)
20053     Tys = DAG.getVTList(MVT::f80, MVT::Other);
20054   else
20055     Tys = DAG.getVTList(DstVT, MVT::Other);
20056 
20057   SDValue FILDOps[] = {Chain, Pointer};
20058   SDValue Result =
20059       DAG.getMemIntrinsicNode(X86ISD::FILD, DL, Tys, FILDOps, SrcVT, PtrInfo,
20060                               Alignment, MachineMemOperand::MOLoad);
20061   Chain = Result.getValue(1);
20062 
20063   if (useSSE) {
20064     MachineFunction &MF = DAG.getMachineFunction();
20065     unsigned SSFISize = DstVT.getStoreSize();
20066     int SSFI =
20067         MF.getFrameInfo().CreateStackObject(SSFISize, Align(SSFISize), false);
20068     auto PtrVT = getPointerTy(MF.getDataLayout());
20069     SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20070     Tys = DAG.getVTList(MVT::Other);
20071     SDValue FSTOps[] = {Chain, Result, StackSlot};
20072     MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand(
20073         MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
20074         MachineMemOperand::MOStore, SSFISize, Align(SSFISize));
20075 
20076     Chain =
20077         DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, DstVT, StoreMMO);
20078     Result = DAG.getLoad(
20079         DstVT, DL, Chain, StackSlot,
20080         MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
20081     Chain = Result.getValue(1);
20082   }
20083 
20084   return { Result, Chain };
20085 }
20086 
20087 /// Horizontal vector math instructions may be slower than normal math with
20088 /// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
20089 /// implementation, and likely shuffle complexity of the alternate sequence.
20090 static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
20091                                   const X86Subtarget &Subtarget) {
20092   bool IsOptimizingSize = DAG.shouldOptForSize();
20093   bool HasFastHOps = Subtarget.hasFastHorizontalOps();
20094   return !IsSingleSource || IsOptimizingSize || HasFastHOps;
20095 }
20096 
20097 /// 64-bit unsigned integer to double expansion.
20098 static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
20099                                    const X86Subtarget &Subtarget) {
20100   // We can't use this algorithm for strict fp. It produces -0.0 instead of +0.0
20101   // when converting 0 while rounding toward negative infinity. The caller will
20102   // fall back to Expand when i64 is legal, or use FILD in 32-bit mode.
20103   assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!");
20104   // This algorithm is not obvious. Here is what we're trying to output:
20105   /*
20106      movq       %rax,  %xmm0
20107      punpckldq  (c0),  %xmm0  // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
20108      subpd      (c1),  %xmm0  // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
20109      #ifdef __SSE3__
20110        haddpd   %xmm0, %xmm0
20111      #else
20112        pshufd   $0x4e, %xmm0, %xmm1
20113        addpd    %xmm1, %xmm0
20114      #endif
20115   */
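  // For reference, a scalar view of why this works (illustrative only; the
  // helper below is hypothetical and not part of this lowering): the two
  // constants are the high words of the doubles 2^52 (0x43300000) and 2^84
  // (0x45300000). After punpckldq the register holds two doubles whose bit
  // patterns are (2^52 | lo32) and (2^84 | hi32), i.e. the values 2^52 + lo32
  // and 2^84 + hi32 * 2^32. Subtracting c1 = { 2^52, 2^84 } leaves
  // { lo32, hi32 * 2^32 }, and the horizontal add yields the original u64
  // value with a single rounding:
  //   double cvt_u64_sketch(uint64_t x) {
  //     double lo = (double)(uint32_t)x;          // exact, fits in 52 bits
  //     double hi = (double)(uint32_t)(x >> 32);  // exact, fits in 52 bits
  //     return lo + hi * 0x1p32;                  // one rounding, like haddpd
  //   }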
20116 
20117   SDLoc dl(Op);
20118   LLVMContext *Context = DAG.getContext();
20119 
20120   // Build some magic constants.
20121   static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
20122   Constant *C0 = ConstantDataVector::get(*Context, CV0);
20123   auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
20124   SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16));
20125 
20126   SmallVector<Constant*,2> CV1;
20127   CV1.push_back(
20128     ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
20129                                       APInt(64, 0x4330000000000000ULL))));
20130   CV1.push_back(
20131     ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
20132                                       APInt(64, 0x4530000000000000ULL))));
20133   Constant *C1 = ConstantVector::get(CV1);
20134   SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16));
20135 
20136   // Load the 64-bit value into an XMM register.
20137   SDValue XR1 =
20138       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0));
20139   SDValue CLod0 = DAG.getLoad(
20140       MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
20141       MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
20142   SDValue Unpck1 =
20143       getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
20144 
20145   SDValue CLod1 = DAG.getLoad(
20146       MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
20147       MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
20148   SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
20149   // TODO: Are there any fast-math-flags to propagate here?
20150   SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
20151   SDValue Result;
20152 
20153   if (Subtarget.hasSSE3() &&
20154       shouldUseHorizontalOp(true, DAG, Subtarget)) {
20155     Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
20156   } else {
20157     SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
20158     Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
20159   }
20160   Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
20161                        DAG.getIntPtrConstant(0, dl));
20162   return Result;
20163 }
20164 
20165 /// 32-bit unsigned integer to float expansion.
20166 static SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG,
20167                                    const X86Subtarget &Subtarget) {
20168   unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
20169   SDLoc dl(Op);
20170   // FP constant to bias correct the final result.
20171   SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
20172                                    MVT::f64);
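  // This bias is the bit pattern of the double 2^52; OR-ing a 32-bit value
  // into the low mantissa bits below produces exactly 2^52 + x (any u32 fits
  // in the 52-bit mantissa), so subtracting the bias recovers x exactly.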
20173 
20174   // Load the 32-bit value into an XMM register.
20175   SDValue Load =
20176       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo));
20177 
20178   // Zero out the upper parts of the register.
20179   Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
20180 
20181   // Or the load with the bias.
20182   SDValue Or = DAG.getNode(
20183       ISD::OR, dl, MVT::v2i64,
20184       DAG.getBitcast(MVT::v2i64, Load),
20185       DAG.getBitcast(MVT::v2i64,
20186                      DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
20187   Or =
20188       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
20189                   DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
20190 
20191   if (Op.getNode()->isStrictFPOpcode()) {
20192     // Subtract the bias.
20193     // TODO: Are there any fast-math-flags to propagate here?
20194     SDValue Chain = Op.getOperand(0);
20195     SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other},
20196                               {Chain, Or, Bias});
20197 
20198     if (Op.getValueType() == Sub.getValueType())
20199       return Sub;
20200 
20201     // Handle final rounding.
20202     std::pair<SDValue, SDValue> ResultPair = DAG.getStrictFPExtendOrRound(
20203         Sub, Sub.getValue(1), dl, Op.getSimpleValueType());
20204 
20205     return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);
20206   }
20207 
20208   // Subtract the bias.
20209   // TODO: Are there any fast-math-flags to propagate here?
20210   SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
20211 
20212   // Handle final rounding.
20213   return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
20214 }
20215 
20216 static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
20217                                      const X86Subtarget &Subtarget,
20218                                      const SDLoc &DL) {
20219   if (Op.getSimpleValueType() != MVT::v2f64)
20220     return SDValue();
20221 
20222   bool IsStrict = Op->isStrictFPOpcode();
20223 
20224   SDValue N0 = Op.getOperand(IsStrict ? 1 : 0);
20225   assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
20226 
20227   if (Subtarget.hasAVX512()) {
20228     if (!Subtarget.hasVLX()) {
20229       // Let generic type legalization widen this.
20230       if (!IsStrict)
20231         return SDValue();
20232       // Otherwise pad the integer input with 0s and widen the operation.
20233       N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
20234                        DAG.getConstant(0, DL, MVT::v2i32));
20235       SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
20236                                 {Op.getOperand(0), N0});
20237       SDValue Chain = Res.getValue(1);
20238       Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res,
20239                         DAG.getIntPtrConstant(0, DL));
20240       return DAG.getMergeValues({Res, Chain}, DL);
20241     }
20242 
20243     // Legalize to v4i32 type.
20244     N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
20245                      DAG.getUNDEF(MVT::v2i32));
20246     if (IsStrict)
20247       return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other},
20248                          {Op.getOperand(0), N0});
20249     return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
20250   }
20251 
20252   // Zero extend to 2i64, OR with the floating point representation of 2^52.
20253   // This gives us the floating point equivalent of 2^52 + the i32 integer
20254   // since double has 52-bits of mantissa. Then subtract 2^52 in floating
20255   // point leaving just our i32 integers in double format.
20256   SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0);
20257   SDValue VBias =
20258       DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), DL, MVT::v2f64);
20259   SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn,
20260                            DAG.getBitcast(MVT::v2i64, VBias));
20261   Or = DAG.getBitcast(MVT::v2f64, Or);
20262 
20263   if (IsStrict)
20264     return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other},
20265                        {Op.getOperand(0), Or, VBias});
20266   return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias);
20267 }
20268 
20269 static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
20270                                      const X86Subtarget &Subtarget) {
20271   SDLoc DL(Op);
20272   bool IsStrict = Op->isStrictFPOpcode();
20273   SDValue V = Op->getOperand(IsStrict ? 1 : 0);
20274   MVT VecIntVT = V.getSimpleValueType();
20275   assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
20276          "Unsupported custom type");
20277 
20278   if (Subtarget.hasAVX512()) {
20279     // With AVX512, but not VLX we need to widen to get a 512-bit result type.
20280     assert(!Subtarget.hasVLX() && "Unexpected features");
20281     MVT VT = Op->getSimpleValueType(0);
20282 
20283     // v8i32->v8f64 is legal with AVX512 so just return it.
20284     if (VT == MVT::v8f64)
20285       return Op;
20286 
20287     assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64) &&
20288            "Unexpected VT!");
20289     MVT WideVT = VT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
20290     MVT WideIntVT = VT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
20291     // Need to concat with zero vector for strict fp to avoid spurious
20292     // exceptions.
20293     SDValue Tmp =
20294         IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT);
20295     V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V,
20296                     DAG.getIntPtrConstant(0, DL));
20297     SDValue Res, Chain;
20298     if (IsStrict) {
20299       Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other},
20300                         {Op->getOperand(0), V});
20301       Chain = Res.getValue(1);
20302     } else {
20303       Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V);
20304     }
20305 
20306     Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
20307                       DAG.getIntPtrConstant(0, DL));
20308 
20309     if (IsStrict)
20310       return DAG.getMergeValues({Res, Chain}, DL);
20311     return Res;
20312   }
20313 
20314   if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
20315       Op->getSimpleValueType(0) == MVT::v4f64) {
20316     SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V);
20317     Constant *Bias = ConstantFP::get(
20318         *DAG.getContext(),
20319         APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
20320     auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
20321     SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, Align(8));
20322     SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other);
20323     SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
20324     SDValue VBias = DAG.getMemIntrinsicNode(
20325         X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
20326         MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(8),
20327         MachineMemOperand::MOLoad);
20328 
20329     SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn,
20330                              DAG.getBitcast(MVT::v4i64, VBias));
20331     Or = DAG.getBitcast(MVT::v4f64, Or);
20332 
20333     if (IsStrict)
20334       return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other},
20335                          {Op.getOperand(0), Or, VBias});
20336     return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias);
20337   }
20338 
20339   // The algorithm is the following:
20340   // #ifdef __SSE4_1__
20341   //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
20342   //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
20343   //                                 (uint4) 0x53000000, 0xaa);
20344   // #else
20345   //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
20346   //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
20347   // #endif
20348   //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
20349   //     return (float4) lo + fhi;
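  // Why the constants work: 0x4b000000 is the float 2^23 and 0x53000000 is
  // the float 2^39, so 'lo' holds 2^23 + (v & 0xffff) and 'hi' holds
  // 2^39 + (v >> 16) * 2^16, both exact because each 16-bit half fits in the
  // 24-bit single-precision significand. The subtraction of (2^39 + 2^23)
  // from 'hi' is exact as well, and the final add performs the single
  // rounding that an exact u32 -> f32 conversion requires.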
20350 
20351   bool Is128 = VecIntVT == MVT::v4i32;
20352   MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
20353   // If we convert to something other than the supported type, e.g., to v4f64,
20354   // abort early.
20355   if (VecFloatVT != Op->getSimpleValueType(0))
20356     return SDValue();
20357 
20358   // In the #ifdef/#else code, we have in common:
20359   // - The vector of constants:
20360   // -- 0x4b000000
20361   // -- 0x53000000
20362   // - A shift:
20363   // -- v >> 16
20364 
20365   // Create the splat vector for 0x4b000000.
20366   SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
20367   // Create the splat vector for 0x53000000.
20368   SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
20369 
20370   // Create the right shift.
20371   SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
20372   SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
20373 
20374   SDValue Low, High;
20375   if (Subtarget.hasSSE41()) {
20376     MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
20377     //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
20378     SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
20379     SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
20380     // Low will be bitcasted right away, so do not bother bitcasting back to its
20381     // original type.
20382     Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
20383                       VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
20384     //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
20385     //                                 (uint4) 0x53000000, 0xaa);
20386     SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
20387     SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
20388     // High will be bitcasted right away, so do not bother bitcasting back to
20389     // its original type.
20390     High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
20391                        VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
20392   } else {
20393     SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
20394     //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
20395     SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
20396     Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
20397 
20398     //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
20399     High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
20400   }
20401 
20402   // Create the vector constant for (0x1.0p39f + 0x1.0p23f).
20403   SDValue VecCstFSub = DAG.getConstantFP(
20404       APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT);
20405 
20406   //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
20407   // NOTE: By using fsub of a positive constant instead of fadd of a negative
20408   // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
20409   // enabled. See PR24512.
20410   SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
20411   // TODO: Are there any fast-math-flags to propagate here?
20412   //     (float4) lo;
20413   SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
20414   //     return (float4) lo + fhi;
20415   if (IsStrict) {
20416     SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other},
20417                                 {Op.getOperand(0), HighBitcast, VecCstFSub});
20418     return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other},
20419                        {FHigh.getValue(1), LowBitcast, FHigh});
20420   }
20421 
20422   SDValue FHigh =
20423       DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub);
20424   return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
20425 }
20426 
20427 static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG,
20428                                    const X86Subtarget &Subtarget) {
20429   unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
20430   SDValue N0 = Op.getOperand(OpNo);
20431   MVT SrcVT = N0.getSimpleValueType();
20432   SDLoc dl(Op);
20433 
20434   switch (SrcVT.SimpleTy) {
20435   default:
20436     llvm_unreachable("Custom UINT_TO_FP is not supported!");
20437   case MVT::v2i32:
20438     return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
20439   case MVT::v4i32:
20440   case MVT::v8i32:
20441     return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
20442   case MVT::v2i64:
20443   case MVT::v4i64:
20444     return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
20445   }
20446 }
20447 
20448 SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
20449                                            SelectionDAG &DAG) const {
20450   bool IsStrict = Op->isStrictFPOpcode();
20451   unsigned OpNo = IsStrict ? 1 : 0;
20452   SDValue Src = Op.getOperand(OpNo);
20453   SDLoc dl(Op);
20454   auto PtrVT = getPointerTy(DAG.getDataLayout());
20455   MVT SrcVT = Src.getSimpleValueType();
20456   MVT DstVT = Op->getSimpleValueType(0);
20457   SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
20458 
20459   if (DstVT == MVT::f128)
20460     return SDValue();
20461 
20462   if (DstVT.isVector())
20463     return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);
20464 
20465   if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
20466     return Extract;
20467 
20468   if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
20469       (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
20470     // Conversions from unsigned i32 to f32/f64 are legal,
20471     // using VCVTUSI2SS/SD.  Same for i64 in 64-bit mode.
20472     return Op;
20473   }
20474 
20475   // Promote i32 to i64 and use a signed conversion on 64-bit targets.
20476   if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
20477     Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src);
20478     if (IsStrict)
20479       return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other},
20480                          {Chain, Src});
20481     return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
20482   }
20483 
20484   if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
20485     return V;
20486 
20487   // The transform for i64->f64 isn't correct for 0 when rounding to negative
20488   // infinity. It produces -0.0, so disable under strictfp.
20489   if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64 && !IsStrict)
20490     return LowerUINT_TO_FP_i64(Op, DAG, Subtarget);
20491   if (SrcVT == MVT::i32 && X86ScalarSSEf64 && DstVT != MVT::f80)
20492     return LowerUINT_TO_FP_i32(Op, DAG, Subtarget);
20493   if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
20494       (DstVT == MVT::f32 || DstVT == MVT::f64))
20495     return SDValue();
20496 
20497   // Make a 64-bit buffer, and use it to build an FILD.
20498   SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8);
20499   int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
20500   Align SlotAlign(8);
20501   MachinePointerInfo MPI =
20502     MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
20503   if (SrcVT == MVT::i32) {
20504     SDValue OffsetSlot =
20505         DAG.getMemBasePlusOffset(StackSlot, TypeSize::Fixed(4), dl);
20506     SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign);
20507     SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
20508                                   OffsetSlot, MPI.getWithOffset(4), SlotAlign);
20509     std::pair<SDValue, SDValue> Tmp =
20510         BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG);
20511     if (IsStrict)
20512       return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
20513 
20514     return Tmp.first;
20515   }
20516 
20517   assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
20518   SDValue ValueToStore = Src;
20519   if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
20520     // Bitcasting to f64 here allows us to do a single 64-bit store from
20521     // an SSE register, avoiding the store forwarding penalty that would come
20522     // with two 32-bit stores.
20523     ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
20524   }
20525   SDValue Store =
20526       DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign);
20527   // For i64 source, we need to add the appropriate power of 2 if the input
20528   // was negative. We must be careful to do the computation in x87 extended
20529   // precision, not in SSE.
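  // FILD interprets the stored u64 as a signed value, so when the sign bit is
  // set it yields Value - 2^64. The fudge constant selected below (high word
  // 0x5F800000, i.e. the float 2^64) is added back in that case; performing
  // the add in f80 keeps the intermediate exact before the final rounding.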
20530   SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
20531   SDValue Ops[] = { Store, StackSlot };
20532   SDValue Fild =
20533       DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI,
20534                               SlotAlign, MachineMemOperand::MOLoad);
20535   Chain = Fild.getValue(1);
20536 
20537 
20538   // Check whether the sign bit is set.
20539   SDValue SignSet = DAG.getSetCC(
20540       dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
20541       Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
20542 
20543   // Build a 64 bit pair (FF, 0) in the constant pool, with FF in the hi bits.
20544   APInt FF(64, 0x5F80000000000000ULL);
20545   SDValue FudgePtr = DAG.getConstantPool(
20546       ConstantInt::get(*DAG.getContext(), FF), PtrVT);
20547   Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign();
20548 
20549   // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
20550   SDValue Zero = DAG.getIntPtrConstant(0, dl);
20551   SDValue Four = DAG.getIntPtrConstant(4, dl);
20552   SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero);
20553   FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
20554 
20555   // Load the value out, extending it from f32 to f80.
20556   SDValue Fudge = DAG.getExtLoad(
20557       ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
20558       MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
20559       CPAlignment);
20560   Chain = Fudge.getValue(1);
20561   // Extend everything to 80 bits to force it to be done on x87.
20562   // TODO: Are there any fast-math-flags to propagate here?
20563   if (IsStrict) {
20564     SDValue Add = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::f80, MVT::Other},
20565                               {Chain, Fild, Fudge});
20566     // STRICT_FP_ROUND can't handle equal types.
20567     if (DstVT == MVT::f80)
20568       return Add;
20569     return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
20570                        {Add.getValue(1), Add, DAG.getIntPtrConstant(0, dl)});
20571   }
20572   SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
20573   return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
20574                      DAG.getIntPtrConstant(0, dl));
20575 }
20576 
20577 // If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
20578 // is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
20579 // just return an SDValue().
20580 // Otherwise it is assumed to be a conversion from one of f32, f64 or f80
20581 // to i16, i32 or i64, and we lower it to a legal sequence and return the
20582 // result.
20583 SDValue
20584 X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
20585                                    bool IsSigned, SDValue &Chain) const {
20586   bool IsStrict = Op->isStrictFPOpcode();
20587   SDLoc DL(Op);
20588 
20589   EVT DstTy = Op.getValueType();
20590   SDValue Value = Op.getOperand(IsStrict ? 1 : 0);
20591   EVT TheVT = Value.getValueType();
20592   auto PtrVT = getPointerTy(DAG.getDataLayout());
20593 
20594   if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
20595     // f16 must be promoted before using the lowering in this routine.
20596     // fp128 does not use this lowering.
20597     return SDValue();
20598   }
20599 
20600   // If using FIST to compute an unsigned i64, we'll need some fixup
20601   // to handle values above the maximum signed i64. A FIST is always
20602   // used for the 32-bit subtarget; it is also used for f80 on a 64-bit target.
20603   bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
20604 
20605   // FIXME: This does not generate an invalid exception if the input does not
20606   // fit in i32. PR44019
20607   if (!IsSigned && DstTy != MVT::i64) {
20608     // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
20609     // The low 32 bits of the fist result will have the correct uint32 result.
20610     assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
20611     DstTy = MVT::i64;
20612   }
20613 
20614   assert(DstTy.getSimpleVT() <= MVT::i64 &&
20615          DstTy.getSimpleVT() >= MVT::i16 &&
20616          "Unknown FP_TO_INT to lower!");
20617 
20618   // We lower FP->int64 into FISTP64 followed by a load from a temporary
20619   // stack slot.
20620   MachineFunction &MF = DAG.getMachineFunction();
20621   unsigned MemSize = DstTy.getStoreSize();
20622   int SSFI =
20623       MF.getFrameInfo().CreateStackObject(MemSize, Align(MemSize), false);
20624   SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20625 
20626   Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
20627 
20628   SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
20629 
20630   if (UnsignedFixup) {
20631     //
20632     // Conversion to unsigned i64 is implemented with a select,
20633     // depending on whether the source value fits in the range
20634     // of a signed i64.  Let Thresh be the FP equivalent of
20635     // 0x8000000000000000ULL.
20636     //
20637     //  Adjust  = (Value >= Thresh) ? 0x8000000000000000 : 0;
20638     //  FltOfs  = (Value >= Thresh) ? Thresh : 0.0;
20639     //  FistSrc = (Value - FltOfs);
20640     //  Fist-to-mem64 FistSrc
20641     //  Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
20642     //  to XOR'ing the 64-bit result with Adjust.
20643     //
20644     // Being a power of 2, Thresh is exactly representable in all FP formats.
20645     // For X87 we'd like to use the smallest FP type for this constant, but
20646     // for DAG type consistency we have to match the FP operand type.
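    // Illustrative scalar sketch of the sequence built below (hypothetical
    // helper, not the generated code):
    //   uint64_t fptoui64_sketch(double V) {
    //     const double Thresh = 0x1p63;
    //     uint64_t Adjust = (V >= Thresh) ? 0x8000000000000000ULL : 0;
    //     double FistSrc = (V >= Thresh) ? V - Thresh : V;
    //     return (uint64_t)(int64_t)FistSrc ^ Adjust; // FIST, then fix sign bit
    //   }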
20647 
20648     APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
20649     LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
20650     bool LosesInfo = false;
20651     if (TheVT == MVT::f64)
20652       // The rounding mode is irrelevant as the conversion should be exact.
20653       Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
20654                               &LosesInfo);
20655     else if (TheVT == MVT::f80)
20656       Status = Thresh.convert(APFloat::x87DoubleExtended(),
20657                               APFloat::rmNearestTiesToEven, &LosesInfo);
20658 
20659     assert(Status == APFloat::opOK && !LosesInfo &&
20660            "FP conversion should have been exact");
20661 
20662     SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
20663 
20664     EVT ResVT = getSetCCResultType(DAG.getDataLayout(),
20665                                    *DAG.getContext(), TheVT);
20666     SDValue Cmp;
20667     if (IsStrict) {
20668       Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain,
20669                          /*IsSignaling*/ true);
20670       Chain = Cmp.getValue(1);
20671     } else {
20672       Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE);
20673     }
20674 
20675     // Our preferred lowering of
20676     //
20677     // (Value >= Thresh) ? 0x8000000000000000ULL : 0
20678     //
20679     // is
20680     //
20681     // (Value >= Thresh) << 63
20682     //
20683     // but since we can get here after LegalOperations, DAGCombine might do the
20684     // wrong thing if we create a select. So, directly create the preferred
20685     // version.
20686     SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Cmp);
20687     SDValue Const63 = DAG.getConstant(63, DL, MVT::i8);
20688     Adjust = DAG.getNode(ISD::SHL, DL, MVT::i64, Zext, Const63);
20689 
20690     SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp, ThreshVal,
20691                                    DAG.getConstantFP(0.0, DL, TheVT));
20692 
20693     if (IsStrict) {
20694       Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
20695                           { Chain, Value, FltOfs });
20696       Chain = Value.getValue(1);
20697     } else
20698       Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);
20699   }
20700 
20701   MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
20702 
20703   // FIXME This causes a redundant load/store if the SSE-class value is already
20704   // in memory, such as if it is on the callstack.
20705   if (isScalarFPTypeInSSEReg(TheVT)) {
20706     assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
20707     Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
20708     SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
20709     SDValue Ops[] = { Chain, StackSlot };
20710 
20711     unsigned FLDSize = TheVT.getStoreSize();
20712     assert(FLDSize <= MemSize && "Stack slot not big enough");
20713     MachineMemOperand *MMO = MF.getMachineMemOperand(
20714         MPI, MachineMemOperand::MOLoad, FLDSize, Align(FLDSize));
20715     Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
20716     Chain = Value.getValue(1);
20717   }
20718 
20719   // Build the FP_TO_INT*_IN_MEM
20720   MachineMemOperand *MMO = MF.getMachineMemOperand(
20721       MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize));
20722   SDValue Ops[] = { Chain, Value, StackSlot };
20723   SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
20724                                          DAG.getVTList(MVT::Other),
20725                                          Ops, DstTy, MMO);
20726 
20727   SDValue Res = DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot, MPI);
20728   Chain = Res.getValue(1);
20729 
20730   // If we need an unsigned fixup, XOR the result with adjust.
20731   if (UnsignedFixup)
20732     Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);
20733 
20734   return Res;
20735 }
20736 
20737 static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
20738                               const X86Subtarget &Subtarget) {
20739   MVT VT = Op.getSimpleValueType();
20740   SDValue In = Op.getOperand(0);
20741   MVT InVT = In.getSimpleValueType();
20742   SDLoc dl(Op);
20743   unsigned Opc = Op.getOpcode();
20744 
20745   assert(VT.isVector() && InVT.isVector() && "Expected vector type");
20746   assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) &&
20747          "Unexpected extension opcode");
20748   assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
20749          "Expected same number of elements");
20750   assert((VT.getVectorElementType() == MVT::i16 ||
20751           VT.getVectorElementType() == MVT::i32 ||
20752           VT.getVectorElementType() == MVT::i64) &&
20753          "Unexpected element type");
20754   assert((InVT.getVectorElementType() == MVT::i8 ||
20755           InVT.getVectorElementType() == MVT::i16 ||
20756           InVT.getVectorElementType() == MVT::i32) &&
20757          "Unexpected element type");
20758 
20759   unsigned ExtendInVecOpc = getOpcode_EXTEND_VECTOR_INREG(Opc);
20760 
20761   if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
20762     assert(InVT == MVT::v32i8 && "Unexpected VT!");
20763     return splitVectorIntUnary(Op, DAG);
20764   }
20765 
20766   if (Subtarget.hasInt256())
20767     return Op;
20768 
20769   // Optimize vectors in AVX mode:
20770   //
20771   //   v8i16 -> v8i32
20772   //   Use vpmovzwd for 4 lower elements  v8i16 -> v4i32.
20773   //   Use vpunpckhwd for 4 upper elements  v8i16 -> v4i32.
20774   //   Concat upper and lower parts.
20775   //
20776   //   v4i32 -> v4i64
20777   //   Use vpmovzdq for 4 lower elements  v4i32 -> v2i64.
20778   //   Use vpunpckhdq for 4 upper elements  v4i32 -> v2i64.
20779   //   Concat upper and lower parts.
20780   //
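  // Worked zero-extend example: for In = <a,b,c,d,e,f,g,h> (v8i16),
  // OpLo = zero_extend_vector_inreg(In) = <a,b,c,d> as v4i32, and
  // OpHi = unpckhwd(In, 0) = <e,0,f,0,g,0,h,0>, which bitcasts to the v4i32
  // value <e,f,g,h>; concatenating OpLo and OpHi gives the full v8i32 result.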
20781   MVT HalfVT = VT.getHalfNumVectorElementsVT();
20782   SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);
20783 
20784   // Short-circuit if we can determine that each 128-bit half is the same value.
20785   // Otherwise, this is difficult to match and optimize.
20786   if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
20787     if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
20788       return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo);
20789 
20790   SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
20791   SDValue Undef = DAG.getUNDEF(InVT);
20792   bool NeedZero = Opc == ISD::ZERO_EXTEND;
20793   SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
20794   OpHi = DAG.getBitcast(HalfVT, OpHi);
20795 
20796   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
20797 }
20798 
20799 // Helper to split and extend a v16i1 mask to v16i8 or v16i16.
20800 static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
20801                                    const SDLoc &dl, SelectionDAG &DAG) {
20802   assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
20803   SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
20804                            DAG.getIntPtrConstant(0, dl));
20805   SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
20806                            DAG.getIntPtrConstant(8, dl));
20807   Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
20808   Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
20809   SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
20810   return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
20811 }
20812 
20813 static  SDValue LowerZERO_EXTEND_Mask(SDValue Op,
20814                                       const X86Subtarget &Subtarget,
20815                                       SelectionDAG &DAG) {
20816   MVT VT = Op->getSimpleValueType(0);
20817   SDValue In = Op->getOperand(0);
20818   MVT InVT = In.getSimpleValueType();
20819   assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
20820   SDLoc DL(Op);
20821   unsigned NumElts = VT.getVectorNumElements();
20822 
20823   // For all vectors but vXi8, we can just emit a sign_extend and a shift. This
20824   // avoids a constant pool load.
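  // sign_extend turns each i1 lane into all-zeros or all-ones; the logical
  // shift right by (scalar bit width - 1) then leaves 0 or 1 per lane, which
  // is exactly the zero-extended mask.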
20825   if (VT.getVectorElementType() != MVT::i8) {
20826     SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
20827     return DAG.getNode(ISD::SRL, DL, VT, Extend,
20828                        DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
20829   }
20830 
20831   // Extend VT if BWI is not supported.
20832   MVT ExtVT = VT;
20833   if (!Subtarget.hasBWI()) {
20834     // If v16i32 is to be avoided, we'll need to split and concatenate.
20835     if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
20836       return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);
20837 
20838     ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
20839   }
20840 
20841   // Widen to 512-bits if VLX is not supported.
20842   MVT WideVT = ExtVT;
20843   if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
20844     NumElts *= 512 / ExtVT.getSizeInBits();
20845     InVT = MVT::getVectorVT(MVT::i1, NumElts);
20846     In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT),
20847                      In, DAG.getIntPtrConstant(0, DL));
20848     WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(),
20849                               NumElts);
20850   }
20851 
20852   SDValue One = DAG.getConstant(1, DL, WideVT);
20853   SDValue Zero = DAG.getConstant(0, DL, WideVT);
20854 
20855   SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
20856 
20857   // Truncate if we had to extend above.
20858   if (VT != ExtVT) {
20859     WideVT = MVT::getVectorVT(MVT::i8, NumElts);
20860     SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
20861   }
20862 
20863   // Extract back to 128/256-bit if we widened.
20864   if (WideVT != VT)
20865     SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
20866                               DAG.getIntPtrConstant(0, DL));
20867 
20868   return SelectedVal;
20869 }
20870 
20871 static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
20872                                 SelectionDAG &DAG) {
20873   SDValue In = Op.getOperand(0);
20874   MVT SVT = In.getSimpleValueType();
20875 
20876   if (SVT.getVectorElementType() == MVT::i1)
20877     return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG);
20878 
20879   assert(Subtarget.hasAVX() && "Expected AVX support");
20880   return LowerAVXExtend(Op, DAG, Subtarget);
20881 }
20882 
20883 /// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
20884 /// It makes use of the fact that vectors with enough leading sign/zero bits
20885 /// prevent the PACKSS/PACKUS from saturating the results.
20886 /// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
20887 /// within each 128-bit lane.
20888 static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
20889                                       const SDLoc &DL, SelectionDAG &DAG,
20890                                       const X86Subtarget &Subtarget) {
20891   assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
20892          "Unexpected PACK opcode");
20893   assert(DstVT.isVector() && "VT not a vector?");
20894 
20895   // Requires SSE2 for PACKSS (SSE41 PACKUSDW is handled below).
20896   if (!Subtarget.hasSSE2())
20897     return SDValue();
20898 
20899   EVT SrcVT = In.getValueType();
20900 
20901   // No truncation required, we might get here due to recursive calls.
20902   if (SrcVT == DstVT)
20903     return In;
20904 
20905   // We only support vector truncation to 64 bits or greater from a
20906   // 128-bit or greater source.
20907   unsigned DstSizeInBits = DstVT.getSizeInBits();
20908   unsigned SrcSizeInBits = SrcVT.getSizeInBits();
20909   if ((DstSizeInBits % 64) != 0 || (SrcSizeInBits % 128) != 0)
20910     return SDValue();
20911 
20912   unsigned NumElems = SrcVT.getVectorNumElements();
20913   if (!isPowerOf2_32(NumElems))
20914     return SDValue();
20915 
20916   LLVMContext &Ctx = *DAG.getContext();
20917   assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
20918   assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
20919 
20920   EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
20921 
20922   // Pack to the largest type possible:
20923   // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
20924   EVT InVT = MVT::i16, OutVT = MVT::i8;
20925   if (SrcVT.getScalarSizeInBits() > 16 &&
20926       (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
20927     InVT = MVT::i32;
20928     OutVT = MVT::i16;
20929   }
20930 
20931   // 128bit -> 64bit truncate - PACK 128-bit src in the lower subvector.
20932   if (SrcVT.is128BitVector()) {
20933     InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
20934     OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
20935     In = DAG.getBitcast(InVT, In);
20936     SDValue Res = DAG.getNode(Opcode, DL, OutVT, In, DAG.getUNDEF(InVT));
20937     Res = extractSubVector(Res, 0, DAG, DL, 64);
20938     return DAG.getBitcast(DstVT, Res);
20939   }
20940 
20941   // Split lower/upper subvectors.
20942   SDValue Lo, Hi;
20943   std::tie(Lo, Hi) = splitVector(In, DAG, DL);
20944 
20945   unsigned SubSizeInBits = SrcSizeInBits / 2;
20946   InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
20947   OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
20948 
20949   // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
20950   if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
20951     Lo = DAG.getBitcast(InVT, Lo);
20952     Hi = DAG.getBitcast(InVT, Hi);
20953     SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
20954     return DAG.getBitcast(DstVT, Res);
20955   }
20956 
20957   // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
20958   // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
20959   if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
20960     Lo = DAG.getBitcast(InVT, Lo);
20961     Hi = DAG.getBitcast(InVT, Hi);
20962     SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
20963 
20964     // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
20965     // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
20966     // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
20967     SmallVector<int, 64> Mask;
20968     int Scale = 64 / OutVT.getScalarSizeInBits();
20969     narrowShuffleMaskElts(Scale, { 0, 2, 1, 3 }, Mask);
20970     Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);
20971 
20972     if (DstVT.is256BitVector())
20973       return DAG.getBitcast(DstVT, Res);
20974 
20975     // If 512bit -> 128bit truncate another stage.
20976     EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
20977     Res = DAG.getBitcast(PackedVT, Res);
20978     return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
20979   }
20980 
20981   // Recursively pack lower/upper subvectors, concat result and pack again.
20982   assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
20983   EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems / 2);
20984   Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget);
20985   Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget);
20986 
20987   PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
20988   SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
20989   return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
20990 }
20991 
20992 static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
20993                                   const X86Subtarget &Subtarget) {
20994 
20995   SDLoc DL(Op);
20996   MVT VT = Op.getSimpleValueType();
20997   SDValue In = Op.getOperand(0);
20998   MVT InVT = In.getSimpleValueType();
20999 
21000   assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
21001 
21002   // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
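  // Truncation to i1 keeps only bit 0 of each element; once that bit has been
  // shifted into the sign position it can be read either by VPMOVB2M/VPMOVW2M,
  // by a signed compare against zero, or by a nonzero test, since no other
  // bits remain set.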
21003   unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
21004   if (InVT.getScalarSizeInBits() <= 16) {
21005     if (Subtarget.hasBWI()) {
21006       // legal, will go to VPMOVB2M, VPMOVW2M
21007       if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
21008         // We need to shift to get the lsb into the sign position.
21009         // Packed byte shifts are not supported natively, so bitcast to words.
21010         MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
21011         In = DAG.getNode(ISD::SHL, DL, ExtVT,
21012                          DAG.getBitcast(ExtVT, In),
21013                          DAG.getConstant(ShiftInx, DL, ExtVT));
21014         In = DAG.getBitcast(InVT, In);
21015       }
21016       return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
21017                           In, ISD::SETGT);
21018     }
21019     // Use TESTD/Q, extended vector to packed dword/qword.
21020     assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
21021            "Unexpected vector type.");
21022     unsigned NumElts = InVT.getVectorNumElements();
21023     assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
21024     // We need to change to a wider element type that we have support for.
21025     // For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
21026     // For 16 element vectors we extend to v16i32 unless we are explicitly
21027     // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
21028     // we need to split into two 8 element vectors which we can extend to v8i32,
21029     // truncate and concat the results. There's an additional complication if
21030     // the original type is v16i8. In that case we can't split the v16i8
21031     // directly, so we need to shuffle high elements to low and use
21032     // sign_extend_vector_inreg.
21033     if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
21034       SDValue Lo, Hi;
21035       if (InVT == MVT::v16i8) {
21036         Lo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, In);
21037         Hi = DAG.getVectorShuffle(
21038             InVT, DL, In, In,
21039             {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
21040         Hi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, Hi);
21041       } else {
21042         assert(InVT == MVT::v16i16 && "Unexpected VT!");
21043         Lo = extract128BitVector(In, 0, DAG, DL);
21044         Hi = extract128BitVector(In, 8, DAG, DL);
21045       }
21046       // We're split now, just emit two truncates and a concat. The two
21047       // truncates will trigger legalization to come back to this function.
21048       Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
21049       Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
21050       return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
21051     }
21052     // We either have 8 elements or we're allowed to use 512-bit vectors.
21053     // If we have VLX, we want to use the narrowest vector that can get the
21054     // job done so we use vXi32.
21055     MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
21056     MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
21057     In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
21058     InVT = ExtVT;
21059     ShiftInx = InVT.getScalarSizeInBits() - 1;
21060   }
21061 
21062   if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
21063     // We need to shift to get the lsb into sign position.
21064     In = DAG.getNode(ISD::SHL, DL, InVT, In,
21065                      DAG.getConstant(ShiftInx, DL, InVT));
21066   }
21067   // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
21068   if (Subtarget.hasDQI())
21069     return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT);
21070   return DAG.getSetCC(DL, VT, In, DAG.getConstant(0, DL, InVT), ISD::SETNE);
21071 }
21072 
21073 SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
21074   SDLoc DL(Op);
21075   MVT VT = Op.getSimpleValueType();
21076   SDValue In = Op.getOperand(0);
21077   MVT InVT = In.getSimpleValueType();
21078   unsigned InNumEltBits = InVT.getScalarSizeInBits();
21079 
21080   assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
21081          "Invalid TRUNCATE operation");
21082 
21083   // If we're called by the type legalizer, handle a few cases.
21084   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21085   if (!TLI.isTypeLegal(InVT)) {
21086     if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
21087         VT.is128BitVector()) {
21088       assert((InVT == MVT::v16i64 || Subtarget.hasVLX()) &&
21089              "Unexpected subtarget!");
21090       // The default behavior is to truncate one step, concatenate, and then
21091       // truncate the remainder. We'd rather produce two 64-bit results and
21092       // concatenate those.
21093       SDValue Lo, Hi;
21094       std::tie(Lo, Hi) = DAG.SplitVector(In, DL);
21095 
21096       EVT LoVT, HiVT;
21097       std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
21098 
21099       Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo);
21100       Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi);
21101       return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
21102     }
21103 
21104     // Otherwise let default legalization handle it.
21105     return SDValue();
21106   }
21107 
21108   if (VT.getVectorElementType() == MVT::i1)
21109     return LowerTruncateVecI1(Op, DAG, Subtarget);
21110 
21111   // vpmovqb/w/d, vpmovdb/w, vpmovwb
21112   if (Subtarget.hasAVX512()) {
21113     if (InVT == MVT::v32i16 && !Subtarget.hasBWI()) {
21114       assert(VT == MVT::v32i8 && "Unexpected VT!");
21115       return splitVectorIntUnary(Op, DAG);
21116     }
21117 
21118     // Word to byte is only legal with BWI. Otherwise we have to promote to
21119     // v16i32 and then truncate that. But we should only do that if we haven't
21120     // been asked to avoid 512-bit vectors. The actual promotion to v16i32 will
21121     // be handled by isel patterns.
21122     if (InVT != MVT::v16i16 || Subtarget.hasBWI() ||
21123         Subtarget.canExtendTo512DQ())
21124       return Op;
21125   }
21126 
21127   unsigned NumPackedSignBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
21128   unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
21129 
21130   // Truncate with PACKUS if we are truncating a vector with leading zero bits
21131   // that extend all the way to the packed/truncated value.
21132   // Pre-SSE41 we can only use PACKUSWB.
21133   KnownBits Known = DAG.computeKnownBits(In);
21134   if ((InNumEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros())
21135     if (SDValue V =
21136             truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget))
21137       return V;
21138 
21139   // Truncate with PACKSS if we are truncating a vector with sign-bits that
21140   // extend all the way to the packed/truncated value.
21141   if ((InNumEltBits - NumPackedSignBits) < DAG.ComputeNumSignBits(In))
21142     if (SDValue V =
21143             truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget))
21144       return V;
21145 
21146   // Handle truncation of V256 to V128 using shuffles.
21147   assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
21148 
21149   if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
21150     In = DAG.getBitcast(MVT::v8i32, In);
21151 
21152     // On AVX2, v4i64 -> v4i32 becomes VPERMD.
21153     if (Subtarget.hasInt256()) {
21154       static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
21155       In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
21156       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
21157                          DAG.getIntPtrConstant(0, DL));
21158     }
21159 
21160     SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
21161                                DAG.getIntPtrConstant(0, DL));
21162     SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
21163                                DAG.getIntPtrConstant(4, DL));
21164     static const int ShufMask[] = {0, 2, 4, 6};
21165     return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
21166   }
21167 
21168   if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
21169     In = DAG.getBitcast(MVT::v32i8, In);
21170 
21171     // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
21172     if (Subtarget.hasInt256()) {
21173       // The PSHUFB mask:
21174       static const int ShufMask1[] = { 0,  1,  4,  5,  8,  9, 12, 13,
21175                                       -1, -1, -1, -1, -1, -1, -1, -1,
21176                                       16, 17, 20, 21, 24, 25, 28, 29,
21177                                       -1, -1, -1, -1, -1, -1, -1, -1 };
21178       In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
21179       In = DAG.getBitcast(MVT::v4i64, In);
21180 
21181       static const int ShufMask2[] = {0, 2, -1, -1};
21182       In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
21183       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16,
21184                          DAG.getBitcast(MVT::v16i16, In),
21185                          DAG.getIntPtrConstant(0, DL));
21186     }
21187 
21188     SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, In,
21189                                DAG.getIntPtrConstant(0, DL));
21190     SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, In,
21191                                DAG.getIntPtrConstant(16, DL));
21192 
21193     // The PSHUFB mask:
21194     static const int ShufMask1[] = {0,  1,  4,  5,  8,  9, 12, 13,
21195                                    -1, -1, -1, -1, -1, -1, -1, -1};
21196 
21197     OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1);
21198     OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1);
21199 
21200     OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
21201     OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
21202 
21203     // The MOVLHPS Mask:
21204     static const int ShufMask2[] = {0, 1, 4, 5};
21205     SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
21206     return DAG.getBitcast(MVT::v8i16, res);
21207   }
21208 
21209   if (VT == MVT::v16i8 && InVT == MVT::v16i16) {
21210     // Use an AND to zero upper bits for PACKUS.
21211     In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(255, DL, InVT));
21212 
21213     SDValue InLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
21214                                DAG.getIntPtrConstant(0, DL));
21215     SDValue InHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
21216                                DAG.getIntPtrConstant(8, DL));
21217     return DAG.getNode(X86ISD::PACKUS, DL, VT, InLo, InHi);
21218   }
21219 
21220   llvm_unreachable("All 256->128 cases should have been handled above!");
21221 }
21222 
21223 SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
21224   bool IsStrict = Op->isStrictFPOpcode();
21225   bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
21226                   Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
21227   MVT VT = Op->getSimpleValueType(0);
21228   SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21229   MVT SrcVT = Src.getSimpleValueType();
21230   SDLoc dl(Op);
21231 
21232   if (VT.isVector()) {
21233     if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
21234       MVT ResVT = MVT::v4i32;
21235       MVT TruncVT = MVT::v4i1;
21236       unsigned Opc;
21237       if (IsStrict)
21238         Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
21239       else
21240         Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
21241 
21242       if (!IsSigned && !Subtarget.hasVLX()) {
21243         assert(Subtarget.useAVX512Regs() && "Unexpected features!");
21244         // Widen to 512-bits.
21245         ResVT = MVT::v8i32;
21246         TruncVT = MVT::v8i1;
21247         Opc = Op.getOpcode();
21248         // Need to concat with zero vector for strict fp to avoid spurious
21249         // exceptions.
21250         // TODO: Should we just do this for non-strict as well?
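        // (Why zeros rather than undef: undef upper lanes could hold arbitrary
        // bit patterns, including signaling NaNs, and the widened conversion
        // would then raise an exception the original v2f64 op never would.)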
21251         SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v8f64)
21252                                : DAG.getUNDEF(MVT::v8f64);
21253         Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, Tmp, Src,
21254                           DAG.getIntPtrConstant(0, dl));
21255       }
21256       SDValue Res, Chain;
21257       if (IsStrict) {
21258         Res =
21259             DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Op->getOperand(0), Src});
21260         Chain = Res.getValue(1);
21261       } else {
21262         Res = DAG.getNode(Opc, dl, ResVT, Src);
21263       }
21264 
21265       Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
21266       Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
21267                         DAG.getIntPtrConstant(0, dl));
21268       if (IsStrict)
21269         return DAG.getMergeValues({Res, Chain}, dl);
21270       return Res;
21271     }
21272 
21273     // v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32.
21274     if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) {
21275       assert(!IsSigned && "Expected unsigned conversion!");
21276       assert(Subtarget.useAVX512Regs() && "Requires avx512f");
21277       return Op;
21278     }
21279 
21280     // Widen vXi32 fp_to_uint with avx512f to 512-bit source.
21281     if ((VT == MVT::v4i32 || VT == MVT::v8i32) &&
21282         (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32)) {
21283       assert(!IsSigned && "Expected unsigned conversion!");
21284       assert(Subtarget.useAVX512Regs() && !Subtarget.hasVLX() &&
21285              "Unexpected features!");
21286       MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
21287       MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
21288       // Need to concat with zero vector for strict fp to avoid spurious
21289       // exceptions.
21290       // TODO: Should we just do this for non-strict as well?
21291       SDValue Tmp =
21292           IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
21293       Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
21294                         DAG.getIntPtrConstant(0, dl));
21295 
21296       SDValue Res, Chain;
21297       if (IsStrict) {
21298         Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, dl, {ResVT, MVT::Other},
21299                           {Op->getOperand(0), Src});
21300         Chain = Res.getValue(1);
21301       } else {
21302         Res = DAG.getNode(ISD::FP_TO_UINT, dl, ResVT, Src);
21303       }
21304 
21305       Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21306                         DAG.getIntPtrConstant(0, dl));
21307 
21308       if (IsStrict)
21309         return DAG.getMergeValues({Res, Chain}, dl);
21310       return Res;
21311     }
21312 
21313     // Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source.
21314     if ((VT == MVT::v2i64 || VT == MVT::v4i64) &&
21315         (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32)) {
21316       assert(Subtarget.useAVX512Regs() && Subtarget.hasDQI() &&
21317              !Subtarget.hasVLX() && "Unexpected features!");
21318       MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
21319       // Need to concat with zero vector for strict fp to avoid spurious
21320       // exceptions.
21321       // TODO: Should we just do this for non-strict as well?
21322       SDValue Tmp =
21323           IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
21324       Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
21325                         DAG.getIntPtrConstant(0, dl));
21326 
21327       SDValue Res, Chain;
21328       if (IsStrict) {
21329         Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
21330                           {Op->getOperand(0), Src});
21331         Chain = Res.getValue(1);
21332       } else {
21333         Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src);
21334       }
21335 
21336       Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21337                         DAG.getIntPtrConstant(0, dl));
21338 
21339       if (IsStrict)
21340         return DAG.getMergeValues({Res, Chain}, dl);
21341       return Res;
21342     }
21343 
21344     if (VT == MVT::v2i64 && SrcVT  == MVT::v2f32) {
21345       if (!Subtarget.hasVLX()) {
21346         // Non-strict nodes without VLX can be widened to v4f32->v4i64 by the
21347         // type legalizer and then widened again by vector op legalization.
21348         if (!IsStrict)
21349           return SDValue();
21350 
21351         SDValue Zero = DAG.getConstantFP(0.0, dl, MVT::v2f32);
21352         SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f32,
21353                                   {Src, Zero, Zero, Zero});
21354         Tmp = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
21355                           {Op->getOperand(0), Tmp});
21356         SDValue Chain = Tmp.getValue(1);
21357         Tmp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Tmp,
21358                           DAG.getIntPtrConstant(0, dl));
21359         if (IsStrict)
21360           return DAG.getMergeValues({Tmp, Chain}, dl);
21361         return Tmp;
21362       }
21363 
21364       assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL");
21365       SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
21366                                 DAG.getUNDEF(MVT::v2f32));
21367       if (IsStrict) {
21368         unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI
21369                                 : X86ISD::STRICT_CVTTP2UI;
21370         return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp});
21371       }
21372       unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
21373       return DAG.getNode(Opc, dl, VT, Tmp);
21374     }
21375 
21376     return SDValue();
21377   }
21378 
21379   assert(!VT.isVector());
21380 
21381   bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT);
21382 
21383   if (!IsSigned && UseSSEReg) {
21384     // Conversions from f32/f64 with AVX512 should be legal.
21385     if (Subtarget.hasAVX512())
21386       return Op;
21387 
21388     // Use default expansion for i64.
21389     if (VT == MVT::i64)
21390       return SDValue();
21391 
21392     assert(VT == MVT::i32 && "Unexpected VT!");
21393 
21394     // Promote i32 to i64 and use a signed operation on 64-bit targets.
21395     // FIXME: This does not generate an invalid exception if the input does not
21396     // fit in i32. PR44019
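    // (Sketch: fp_to_uint f64 -> i32 becomes trunc(fp_to_sint f64 -> i64);
    // any value representable as u32 also fits in i64, so the signed
    // cvttsd2si result is exact for in-range inputs.)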
21397     if (Subtarget.is64Bit()) {
21398       SDValue Res, Chain;
21399       if (IsStrict) {
21400         Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, { MVT::i64, MVT::Other},
21401                           { Op.getOperand(0), Src });
21402         Chain = Res.getValue(1);
21403       } else
21404         Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src);
21405 
21406       Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21407       if (IsStrict)
21408         return DAG.getMergeValues({ Res, Chain }, dl);
21409       return Res;
21410     }
21411 
21412     // Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can
21413     // use fisttp which will be handled later.
21414     if (!Subtarget.hasSSE3())
21415       return SDValue();
21416   }
21417 
21418   // Promote i16 to i32 if we can use a SSE operation or the type is f128.
21419   // FIXME: This does not generate an invalid exception if the input does not
21420   // fit in i16. PR44019
21421   if (VT == MVT::i16 && (UseSSEReg || SrcVT == MVT::f128)) {
21422     assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
21423     SDValue Res, Chain;
21424     if (IsStrict) {
21425       Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, { MVT::i32, MVT::Other},
21426                         { Op.getOperand(0), Src });
21427       Chain = Res.getValue(1);
21428     } else
21429       Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
21430 
21431     Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21432     if (IsStrict)
21433       return DAG.getMergeValues({ Res, Chain }, dl);
21434     return Res;
21435   }
21436 
21437   // If this is a FP_TO_SINT using SSEReg we're done.
21438   if (UseSSEReg && IsSigned)
21439     return Op;
21440 
21441   // fp128 needs to use a libcall.
21442   if (SrcVT == MVT::f128) {
21443     RTLIB::Libcall LC;
21444     if (IsSigned)
21445       LC = RTLIB::getFPTOSINT(SrcVT, VT);
21446     else
21447       LC = RTLIB::getFPTOUINT(SrcVT, VT);
21448 
21449     SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
21450     MakeLibCallOptions CallOptions;
21451     std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, LC, VT, Src, CallOptions,
21452                                                   SDLoc(Op), Chain);
21453 
21454     if (IsStrict)
21455       return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
21456 
21457     return Tmp.first;
21458   }
21459 
21460   // Fall back to X87.
21461   SDValue Chain;
21462   if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) {
21463     if (IsStrict)
21464       return DAG.getMergeValues({V, Chain}, dl);
21465     return V;
21466   }
21467 
21468   llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.");
21469 }
21470 
21471 SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
21472                                              SelectionDAG &DAG) const {
21473   SDValue Src = Op.getOperand(0);
21474   MVT SrcVT = Src.getSimpleValueType();
21475 
21476   // If the source is in an SSE register, the node is Legal.
21477   if (isScalarFPTypeInSSEReg(SrcVT))
21478     return Op;
21479 
21480   return LRINT_LLRINTHelper(Op.getNode(), DAG);
21481 }
21482 
21483 SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
21484                                               SelectionDAG &DAG) const {
21485   EVT DstVT = N->getValueType(0);
21486   SDValue Src = N->getOperand(0);
21487   EVT SrcVT = Src.getValueType();
21488 
21489   if (SrcVT != MVT::f32 && SrcVT != MVT::f64 && SrcVT != MVT::f80) {
21490     // f16 must be promoted before using the lowering in this routine.
21491     // fp128 does not use this lowering.
21492     return SDValue();
21493   }
21494 
21495   SDLoc DL(N);
21496   SDValue Chain = DAG.getEntryNode();
21497 
21498   bool UseSSE = isScalarFPTypeInSSEReg(SrcVT);
21499 
21500   // If we're converting from SSE, the stack slot needs to hold both types.
21501   // Otherwise it only needs to hold the DstVT.
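  // (Sketch of the SSE path for an f64 source and i64 result: store the f64
  // to the slot, reload it as f80 via FLD, FIST the i64 back into the same
  // slot, then load the i64 result.)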
21502   EVT OtherVT = UseSSE ? SrcVT : DstVT;
21503   SDValue StackPtr = DAG.CreateStackTemporary(DstVT, OtherVT);
21504   int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
21505   MachinePointerInfo MPI =
21506       MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
21507 
21508   if (UseSSE) {
21509     assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!");
21510     Chain = DAG.getStore(Chain, DL, Src, StackPtr, MPI);
21511     SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
21512     SDValue Ops[] = { Chain, StackPtr };
21513 
21514     Src = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, SrcVT, MPI,
21515                                   /*Align*/ None, MachineMemOperand::MOLoad);
21516     Chain = Src.getValue(1);
21517   }
21518 
21519   SDValue StoreOps[] = { Chain, Src, StackPtr };
21520   Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, DL, DAG.getVTList(MVT::Other),
21521                                   StoreOps, DstVT, MPI, /*Align*/ None,
21522                                   MachineMemOperand::MOStore);
21523 
21524   return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI);
21525 }
21526 
21527 SDValue
21528 X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
21529   // This is based on the TargetLowering::expandFP_TO_INT_SAT implementation,
21530   // but making use of X86 specifics to produce better instruction sequences.
21531   SDNode *Node = Op.getNode();
21532   bool IsSigned = Node->getOpcode() == ISD::FP_TO_SINT_SAT;
21533   unsigned FpToIntOpcode = IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
21534   SDLoc dl(SDValue(Node, 0));
21535   SDValue Src = Node->getOperand(0);
21536 
21537   // There are three types involved here: SrcVT is the source floating point
21538   // type, DstVT is the type of the result, and TmpVT is the result of the
21539   // intermediate FP_TO_*INT operation we'll use (which may be a promotion of
21540   // DstVT).
21541   EVT SrcVT = Src.getValueType();
21542   EVT DstVT = Node->getValueType(0);
21543   EVT TmpVT = DstVT;
21544 
21545   // This code is only for floats and doubles. Fall back to generic code for
21546   // anything else.
21547   if (!isScalarFPTypeInSSEReg(SrcVT))
21548     return SDValue();
21549 
21550   EVT SatVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
21551   unsigned SatWidth = SatVT.getScalarSizeInBits();
21552   unsigned DstWidth = DstVT.getScalarSizeInBits();
21553   unsigned TmpWidth = TmpVT.getScalarSizeInBits();
21554   assert(SatWidth <= DstWidth && SatWidth <= TmpWidth &&
21555          "Expected saturation width smaller than result width");
21556 
21557   // Promote result of FP_TO_*INT to at least 32 bits.
21558   if (TmpWidth < 32) {
21559     TmpVT = MVT::i32;
21560     TmpWidth = 32;
21561   }
21562 
21563   // Promote conversions to unsigned 32-bit to 64-bit, because it will allow
21564   // us to use a native signed conversion instead.
21565   if (SatWidth == 32 && !IsSigned && Subtarget.is64Bit()) {
21566     TmpVT = MVT::i64;
21567     TmpWidth = 64;
21568   }
21569 
21570   // If the saturation width is smaller than the size of the temporary result,
21571   // we can always use signed conversion, which is native.
21572   if (SatWidth < TmpWidth)
21573     FpToIntOpcode = ISD::FP_TO_SINT;
21574 
21575   // Determine minimum and maximum integer values and their corresponding
21576   // floating-point values.
21577   APInt MinInt, MaxInt;
21578   if (IsSigned) {
21579     MinInt = APInt::getSignedMinValue(SatWidth).sextOrSelf(DstWidth);
21580     MaxInt = APInt::getSignedMaxValue(SatWidth).sextOrSelf(DstWidth);
21581   } else {
21582     MinInt = APInt::getMinValue(SatWidth).zextOrSelf(DstWidth);
21583     MaxInt = APInt::getMaxValue(SatWidth).zextOrSelf(DstWidth);
21584   }
21585 
21586   APFloat MinFloat(DAG.EVTToAPFloatSemantics(SrcVT));
21587   APFloat MaxFloat(DAG.EVTToAPFloatSemantics(SrcVT));
21588 
21589   APFloat::opStatus MinStatus = MinFloat.convertFromAPInt(
21590     MinInt, IsSigned, APFloat::rmTowardZero);
21591   APFloat::opStatus MaxStatus = MaxFloat.convertFromAPInt(
21592     MaxInt, IsSigned, APFloat::rmTowardZero);
21593   bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact)
21594                           && !(MaxStatus & APFloat::opStatus::opInexact);
21595 
21596   SDValue MinFloatNode = DAG.getConstantFP(MinFloat, dl, SrcVT);
21597   SDValue MaxFloatNode = DAG.getConstantFP(MaxFloat, dl, SrcVT);
21598 
21599   // If the integer bounds are exactly representable as floats, emit a
21600   // min+max+fptoi sequence. Otherwise use comparisons and selects.
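  // (Worked example: fptosi.sat.i8.f32 clamps Src to [-128.0, 127.0], both
  // exactly representable, converts with a 32-bit cvttss2si, and truncates to
  // i8; a NaN input converts to 0x80000000, which truncates to 0.)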
21601   if (AreExactFloatBounds) {
21602     if (DstVT != TmpVT) {
21603       // Clamp by MinFloat from below. If Src is NaN, propagate NaN.
21604       SDValue MinClamped = DAG.getNode(
21605         X86ISD::FMAX, dl, SrcVT, MinFloatNode, Src);
21606       // Clamp by MaxFloat from above. If Src is NaN, propagate NaN.
21607       SDValue BothClamped = DAG.getNode(
21608         X86ISD::FMIN, dl, SrcVT, MaxFloatNode, MinClamped);
21609       // Convert clamped value to integer.
21610       SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, BothClamped);
21611 
21612       // NaN will become INDVAL, with the top bit set and the rest zero.
21613       // Truncation will discard the top bit, resulting in zero.
21614       return DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
21615     }
21616 
21617     // Clamp by MinFloat from below. If Src is NaN, the result is MinFloat.
21618     SDValue MinClamped = DAG.getNode(
21619       X86ISD::FMAX, dl, SrcVT, Src, MinFloatNode);
21620     // Clamp by MaxFloat from above. NaN cannot occur.
21621     SDValue BothClamped = DAG.getNode(
21622       X86ISD::FMINC, dl, SrcVT, MinClamped, MaxFloatNode);
21623     // Convert clamped value to integer.
21624     SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, DstVT, BothClamped);
21625 
21626     if (!IsSigned) {
21627       // In the unsigned case we're done, because we mapped NaN to MinFloat,
21628       // which is zero.
21629       return FpToInt;
21630     }
21631 
21632     // Otherwise, select zero if Src is NaN.
21633     SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
21634     return DAG.getSelectCC(
21635       dl, Src, Src, ZeroInt, FpToInt, ISD::CondCode::SETUO);
21636   }
21637 
21638   SDValue MinIntNode = DAG.getConstant(MinInt, dl, DstVT);
21639   SDValue MaxIntNode = DAG.getConstant(MaxInt, dl, DstVT);
21640 
21641   // Result of direct conversion, which may be selected away.
21642   SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, Src);
21643 
21644   if (DstVT != TmpVT) {
21645     // NaN will become INDVAL, with the top bit set and the rest zero.
21646     // Truncation will discard the top bit, resulting in zero.
21647     FpToInt = DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
21648   }
21649 
21650   SDValue Select = FpToInt;
21651   // For signed conversions where we saturate to the same size as the
21652   // result type of the fptoi instructions, INDVAL coincides with integer
21653   // minimum, so we don't need to explicitly check it.
21654   if (!IsSigned || SatWidth != TmpVT.getScalarSizeInBits()) {
21655     // If Src ULT MinFloat, select MinInt. In particular, this also selects
21656     // MinInt if Src is NaN.
21657     Select = DAG.getSelectCC(
21658       dl, Src, MinFloatNode, MinIntNode, Select, ISD::CondCode::SETULT);
21659   }
21660 
21661   // If Src OGT MaxFloat, select MaxInt.
21662   Select = DAG.getSelectCC(
21663     dl, Src, MaxFloatNode, MaxIntNode, Select, ISD::CondCode::SETOGT);
21664 
21665   // In the unsigned case we are done, because we mapped NaN to MinInt, which
21666   // is already zero. The promoted case was already handled above.
21667   if (!IsSigned || DstVT != TmpVT) {
21668     return Select;
21669   }
21670 
21671   // Otherwise, select 0 if Src is NaN.
21672   SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
21673   return DAG.getSelectCC(
21674     dl, Src, Src, ZeroInt, Select, ISD::CondCode::SETUO);
21675 }
21676 
21677 SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
21678   bool IsStrict = Op->isStrictFPOpcode();
21679 
21680   SDLoc DL(Op);
21681   MVT VT = Op.getSimpleValueType();
21682   SDValue In = Op.getOperand(IsStrict ? 1 : 0);
21683   MVT SVT = In.getSimpleValueType();
21684 
21685   if (VT == MVT::f128)
21686     return SDValue();
21687 
21688   assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
21689 
21690   SDValue Res =
21691       DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, In, DAG.getUNDEF(SVT));
21692   if (IsStrict)
21693     return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
21694                        {Op->getOperand(0), Res});
21695   return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
21696 }
21697 
21698 SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
21699   bool IsStrict = Op->isStrictFPOpcode();
21700   SDValue In = Op.getOperand(IsStrict ? 1 : 0);
21701   // It's legal except when f128 is involved.
21702   if (In.getSimpleValueType() != MVT::f128)
21703     return Op;
21704 
21705   return SDValue();
21706 }
21707 
21708 static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) {
21709   bool IsStrict = Op->isStrictFPOpcode();
21710   SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21711   assert(Src.getValueType() == MVT::i16 && Op.getValueType() == MVT::f32 &&
21712          "Unexpected VT!");
21713 
21714   SDLoc dl(Op);
21715   SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16,
21716                             DAG.getConstant(0, dl, MVT::v8i16), Src,
21717                             DAG.getIntPtrConstant(0, dl));
21718 
21719   SDValue Chain;
21720   if (IsStrict) {
21721     Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {MVT::v4f32, MVT::Other},
21722                       {Op.getOperand(0), Res});
21723     Chain = Res.getValue(1);
21724   } else {
21725     Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
21726   }
21727 
21728   Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
21729                     DAG.getIntPtrConstant(0, dl));
21730 
21731   if (IsStrict)
21732     return DAG.getMergeValues({Res, Chain}, dl);
21733 
21734   return Res;
21735 }
21736 
21737 static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) {
21738   bool IsStrict = Op->isStrictFPOpcode();
21739   SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21740   assert(Src.getValueType() == MVT::f32 && Op.getValueType() == MVT::i16 &&
21741          "Unexpected VT!");
21742 
21743   SDLoc dl(Op);
21744   SDValue Res, Chain;
21745   if (IsStrict) {
21746     Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4f32,
21747                       DAG.getConstantFP(0, dl, MVT::v4f32), Src,
21748                       DAG.getIntPtrConstant(0, dl));
21749     Res = DAG.getNode(
21750         X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
21751         {Op.getOperand(0), Res, DAG.getTargetConstant(4, dl, MVT::i32)});
21752     Chain = Res.getValue(1);
21753   } else {
21754     // FIXME: Should we use zeros for upper elements for non-strict?
21755     Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, Src);
21756     Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
21757                       DAG.getTargetConstant(4, dl, MVT::i32));
21758   }
21759 
21760   Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Res,
21761                     DAG.getIntPtrConstant(0, dl));
21762 
21763   if (IsStrict)
21764     return DAG.getMergeValues({Res, Chain}, dl);
21765 
21766   return Res;
21767 }
21768 
21769 /// Depending on uarch and/or optimizing for size, we might prefer to use a
21770 /// vector operation in place of the typical scalar operation.
21771 static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG,
21772                                          const X86Subtarget &Subtarget) {
21773   // If both operands have other uses, this is probably not profitable.
21774   SDValue LHS = Op.getOperand(0);
21775   SDValue RHS = Op.getOperand(1);
21776   if (!LHS.hasOneUse() && !RHS.hasOneUse())
21777     return Op;
21778 
21779   // FP horizontal add/sub were added with SSE3. Integer with SSSE3.
21780   bool IsFP = Op.getSimpleValueType().isFloatingPoint();
21781   if (IsFP && !Subtarget.hasSSE3())
21782     return Op;
21783   if (!IsFP && !Subtarget.hasSSSE3())
21784     return Op;
21785 
21786   // Extract from a common vector.
21787   if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
21788       RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
21789       LHS.getOperand(0) != RHS.getOperand(0) ||
21790       !isa<ConstantSDNode>(LHS.getOperand(1)) ||
21791       !isa<ConstantSDNode>(RHS.getOperand(1)) ||
21792       !shouldUseHorizontalOp(true, DAG, Subtarget))
21793     return Op;
21794 
21795   // Allow commuted 'hadd' ops.
21796   // TODO: Allow commuted (f)sub by negating the result of (F)HSUB?
21797   unsigned HOpcode;
21798   switch (Op.getOpcode()) {
21799     case ISD::ADD: HOpcode = X86ISD::HADD; break;
21800     case ISD::SUB: HOpcode = X86ISD::HSUB; break;
21801     case ISD::FADD: HOpcode = X86ISD::FHADD; break;
21802     case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
21803     default:
21804       llvm_unreachable("Trying to lower unsupported opcode to horizontal op");
21805   }
21806   unsigned LExtIndex = LHS.getConstantOperandVal(1);
21807   unsigned RExtIndex = RHS.getConstantOperandVal(1);
21808   if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 &&
21809       (HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD))
21810     std::swap(LExtIndex, RExtIndex);
21811 
21812   if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1))
21813     return Op;
21814 
21815   SDValue X = LHS.getOperand(0);
21816   EVT VecVT = X.getValueType();
21817   unsigned BitWidth = VecVT.getSizeInBits();
21818   unsigned NumLanes = BitWidth / 128;
21819   unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes;
21820   assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) &&
21821          "Not expecting illegal vector widths here");
21822 
21823   // Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
21824   // equivalent, so extract the 256/512-bit source op to 128-bit if we can.
21825   SDLoc DL(Op);
21826   if (BitWidth == 256 || BitWidth == 512) {
21827     unsigned LaneIdx = LExtIndex / NumEltsPerLane;
21828     X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL);
21829     LExtIndex %= NumEltsPerLane;
21830   }
21831 
21832   // add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
21833   // add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
21834   // add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1
21835   // sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0
21836   SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X);
21837   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp,
21838                      DAG.getIntPtrConstant(LExtIndex / 2, DL));
21839 }
21840 
21841 /// Depending on uarch and/or optimizing for size, we might prefer to use a
21842 /// vector operation in place of the typical scalar operation.
21843 SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {
21844   assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) &&
21845          "Only expecting float/double");
21846   return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
21847 }
21848 
21849 /// ISD::FROUND is defined to round to nearest with ties rounding away from 0.
21850 /// This mode isn't supported in hardware on X86. But as long as we aren't
21851 /// compiling with trapping math, we can emulate this with
21852 /// floor(X + copysign(nextafter(0.5, 0.0), X)).
21853 static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) {
21854   SDValue N0 = Op.getOperand(0);
21855   SDLoc dl(Op);
21856   MVT VT = Op.getSimpleValueType();
21857 
21858   // N0 += copysign(nextafter(0.5, 0.0), N0)
21859   const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
21860   bool Ignored;
21861   APFloat Point5Pred = APFloat(0.5f);
21862   Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored);
21863   Point5Pred.next(/*nextDown*/true);
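  // (Note: using the predecessor of 0.5 rather than 0.5 itself keeps inputs
  // just below 0.5 from summing to 1.0 after rounding and being floored to 1,
  // while 0.5 + pred(0.5) still rounds up to 1.0 as required.)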
21864 
21865   SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT,
21866                               DAG.getConstantFP(Point5Pred, dl, VT), N0);
21867   N0 = DAG.getNode(ISD::FADD, dl, VT, N0, Adder);
21868 
21869   // Truncate the result to remove fraction.
21870   return DAG.getNode(ISD::FTRUNC, dl, VT, N0);
21871 }
21872 
21873 /// The only differences between FABS and FNEG are the mask and the logic op.
21874 /// FNEG also has a folding opportunity for FNEG(FABS(x)).
21875 static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
21876   assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
21877          "Wrong opcode for lowering FABS or FNEG.");
21878 
21879   bool IsFABS = (Op.getOpcode() == ISD::FABS);
21880 
21881   // If this is a FABS and it has an FNEG user, bail out to fold the combination
21882   // into an FNABS. We'll lower the FABS after that if it is still in use.
21883   if (IsFABS)
21884     for (SDNode *User : Op->uses())
21885       if (User->getOpcode() == ISD::FNEG)
21886         return Op;
21887 
21888   SDLoc dl(Op);
21889   MVT VT = Op.getSimpleValueType();
21890 
21891   bool IsF128 = (VT == MVT::f128);
21892   assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
21893           VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
21894           VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
21895          "Unexpected type in LowerFABSorFNEG");
21896 
21897   // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
21898   // decide if we should generate a 16-byte constant mask when we only need 4 or
21899   // 8 bytes for the scalar case.
21900 
21901   // There are no scalar bitwise logical SSE/AVX instructions, so we
21902   // generate a 16-byte vector constant and logic op even for the scalar case.
21903   // Using a 16-byte mask allows folding the load of the mask with
21904   // the logic op, so it can save (~4 bytes) on code size.
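  // (Sketch of the expected selection for scalar f32: FABS becomes an ANDPS
  // with a splatted 0x7FFFFFFF mask and FNEG an XORPS with 0x80000000, with
  // the mask typically loaded from the constant pool.)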
21905   bool IsFakeVector = !VT.isVector() && !IsF128;
21906   MVT LogicVT = VT;
21907   if (IsFakeVector)
21908     LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
21909 
21910   unsigned EltBits = VT.getScalarSizeInBits();
21911   // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
21912   APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) :
21913                            APInt::getSignMask(EltBits);
21914   const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
21915   SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
21916 
21917   SDValue Op0 = Op.getOperand(0);
21918   bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
21919   unsigned LogicOp = IsFABS  ? X86ISD::FAND :
21920                      IsFNABS ? X86ISD::FOR  :
21921                                X86ISD::FXOR;
21922   SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
21923 
21924   if (VT.isVector() || IsF128)
21925     return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
21926 
21927   // For the scalar case extend to a 128-bit vector, perform the logic op,
21928   // and extract the scalar result back out.
21929   Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
21930   SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
21931   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
21932                      DAG.getIntPtrConstant(0, dl));
21933 }
21934 
21935 static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
21936   SDValue Mag = Op.getOperand(0);
21937   SDValue Sign = Op.getOperand(1);
21938   SDLoc dl(Op);
21939 
21940   // If the sign operand is smaller, extend it first.
21941   MVT VT = Op.getSimpleValueType();
21942   if (Sign.getSimpleValueType().bitsLT(VT))
21943     Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
21944 
21945   // And if it is bigger, shrink it first.
21946   if (Sign.getSimpleValueType().bitsGT(VT))
21947     Sign =
21948         DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(0, dl));
21949 
21950   // At this point the operands and the result should have the same
21951   // type, and that won't be f80 since that is not custom lowered.
21952   bool IsF128 = (VT == MVT::f128);
21953   assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
21954           VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
21955           VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
21956          "Unexpected type in LowerFCOPYSIGN");
21957 
21958   const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
21959 
21960   // Perform all scalar logic operations as 16-byte vectors because there are no
21961   // scalar FP logic instructions in SSE.
21962   // TODO: This isn't necessary. If we used scalar types, we might avoid some
21963   // unnecessary splats, but we might miss load folding opportunities. Should
21964   // this decision be based on OptimizeForSize?
21965   bool IsFakeVector = !VT.isVector() && !IsF128;
21966   MVT LogicVT = VT;
21967   if (IsFakeVector)
21968     LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
21969 
21970   // The mask constants are automatically splatted for vector types.
21971   unsigned EltSizeInBits = VT.getScalarSizeInBits();
21972   SDValue SignMask = DAG.getConstantFP(
21973       APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
21974   SDValue MagMask = DAG.getConstantFP(
21975       APFloat(Sem, APInt::getSignedMaxValue(EltSizeInBits)), dl, LogicVT);
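  // (In effect this computes (Mag & ~SignMask) | (Sign & SignMask), i.e.
  // ANDPS/ORPS-style logic on the 128-bit LogicVT for the scalar case.)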
21976 
21977   // First, clear all bits but the sign bit from the second operand (sign).
21978   if (IsFakeVector)
21979     Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
21980   SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
21981 
21982   // Next, clear the sign bit from the first operand (magnitude).
21983   // TODO: If we had general constant folding for FP logic ops, this check
21984   // wouldn't be necessary.
21985   SDValue MagBits;
21986   if (ConstantFPSDNode *Op0CN = isConstOrConstSplatFP(Mag)) {
21987     APFloat APF = Op0CN->getValueAPF();
21988     APF.clearSign();
21989     MagBits = DAG.getConstantFP(APF, dl, LogicVT);
21990   } else {
21991     // If the magnitude operand wasn't a constant, we need to AND out the sign.
21992     if (IsFakeVector)
21993       Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
21994     MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
21995   }
21996 
21997   // OR the magnitude value with the sign bit.
21998   SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
21999   return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
22000                                           DAG.getIntPtrConstant(0, dl));
22001 }
22002 
22003 static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
22004   SDValue N0 = Op.getOperand(0);
22005   SDLoc dl(Op);
22006   MVT VT = Op.getSimpleValueType();
22007 
22008   MVT OpVT = N0.getSimpleValueType();
22009   assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
22010          "Unexpected type for FGETSIGN");
22011 
22012   // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
22013   MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
22014   SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
22015   Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
22016   Res = DAG.getZExtOrTrunc(Res, dl, VT);
22017   Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
22018   return Res;
22019 }
22020 
22021 /// Helper for creating a X86ISD::SETCC node.
22022 static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
22023                         SelectionDAG &DAG) {
22024   return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
22025                      DAG.getTargetConstant(Cond, dl, MVT::i8), EFLAGS);
22026 }
22027 
22028 /// Helper for matching OR(EXTRACTELT(X,0),OR(EXTRACTELT(X,1),...))
22029 /// style scalarized (associative) reduction patterns. Partial reductions
22030 /// are supported when the pointer SrcMask is non-null.
22031 /// TODO - move this to SelectionDAG?
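/// e.g. (or (or (extractelt X, 0), (extractelt X, 1)),
///          (or (extractelt X, 2), (extractelt X, 3)))
/// matches a full reduction of a 4-element X, returning SrcOps = { X }.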
22032 static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
22033                                  SmallVectorImpl<SDValue> &SrcOps,
22034                                  SmallVectorImpl<APInt> *SrcMask = nullptr) {
22035   SmallVector<SDValue, 8> Opnds;
22036   DenseMap<SDValue, APInt> SrcOpMap;
22037   EVT VT = MVT::Other;
22038 
22039   // Recognize a special case where a vector is cast into a wide integer to
22040   // test all 0s.
22041   assert(Op.getOpcode() == unsigned(BinOp) &&
22042          "Unexpected bit reduction opcode");
22043   Opnds.push_back(Op.getOperand(0));
22044   Opnds.push_back(Op.getOperand(1));
22045 
22046   for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
22047     SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
22048     // BFS traverse all BinOp operands.
22049     if (I->getOpcode() == unsigned(BinOp)) {
22050       Opnds.push_back(I->getOperand(0));
22051       Opnds.push_back(I->getOperand(1));
22052       // Re-evaluate the number of nodes to be traversed.
22053       e += 2; // 2 more nodes (LHS and RHS) are pushed.
22054       continue;
22055     }
22056 
22057     // Quit if this is not an EXTRACT_VECTOR_ELT.
22058     if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
22059       return false;
22060 
22061     // Quit if the index is not a constant.
22062     auto *Idx = dyn_cast<ConstantSDNode>(I->getOperand(1));
22063     if (!Idx)
22064       return false;
22065 
22066     SDValue Src = I->getOperand(0);
22067     DenseMap<SDValue, APInt>::iterator M = SrcOpMap.find(Src);
22068     if (M == SrcOpMap.end()) {
22069       VT = Src.getValueType();
22070       // Quit if not the same type.
22071       if (!SrcOpMap.empty() && VT != SrcOpMap.begin()->first.getValueType())
22072         return false;
22073       unsigned NumElts = VT.getVectorNumElements();
22074       APInt EltCount = APInt::getNullValue(NumElts);
22075       M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first;
22076       SrcOps.push_back(Src);
22077     }
22078 
22079     // Quit if element already used.
22080     unsigned CIdx = Idx->getZExtValue();
22081     if (M->second[CIdx])
22082       return false;
22083     M->second.setBit(CIdx);
22084   }
22085 
22086   if (SrcMask) {
22087     // Collect the source partial masks.
22088     for (SDValue &SrcOp : SrcOps)
22089       SrcMask->push_back(SrcOpMap[SrcOp]);
22090   } else {
22091     // Quit if not all elements are used.
22092     for (const auto &I : SrcOpMap)
22093       if (!I.second.isAllOnesValue())
22094         return false;
22095   }
22096 
22097   return true;
22098 }
22099 
22100 // Helper function for comparing all bits of a vector against zero.
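// (Rough sketch of the sequences produced below: with SSE4.1 this becomes a
// PTEST of the value against itself; otherwise PCMPEQB against zero followed
// by MOVMSK and a compare of the mask with 0xFFFF.)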
22101 static SDValue LowerVectorAllZero(const SDLoc &DL, SDValue V, ISD::CondCode CC,
22102                                   const APInt &Mask,
22103                                   const X86Subtarget &Subtarget,
22104                                   SelectionDAG &DAG, X86::CondCode &X86CC) {
22105   EVT VT = V.getValueType();
22106   unsigned ScalarSize = VT.getScalarSizeInBits();
22107   if (Mask.getBitWidth() != ScalarSize) {
22108     assert(ScalarSize == 1 && "Element Mask vs Vector bitwidth mismatch");
22109     return SDValue();
22110   }
22111 
22112   assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
22113   X86CC = (CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE);
22114 
22115   auto MaskBits = [&](SDValue Src) {
22116     if (Mask.isAllOnesValue())
22117       return Src;
22118     EVT SrcVT = Src.getValueType();
22119     SDValue MaskValue = DAG.getConstant(Mask, DL, SrcVT);
22120     return DAG.getNode(ISD::AND, DL, SrcVT, Src, MaskValue);
22121   };
22122 
22123   // For sub-128-bit vector, cast to (legal) integer and compare with zero.
22124   if (VT.getSizeInBits() < 128) {
22125     EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
22126     if (!DAG.getTargetLoweringInfo().isTypeLegal(IntVT))
22127       return SDValue();
22128     return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
22129                        DAG.getBitcast(IntVT, MaskBits(V)),
22130                        DAG.getConstant(0, DL, IntVT));
22131   }
22132 
22133   // Quit if not splittable to 128/256-bit vector.
22134   if (!isPowerOf2_32(VT.getSizeInBits()))
22135     return SDValue();
22136 
22137   // Split down to 128/256-bit vector.
22138   unsigned TestSize = Subtarget.hasAVX() ? 256 : 128;
22139   while (VT.getSizeInBits() > TestSize) {
22140     auto Split = DAG.SplitVector(V, DL);
22141     VT = Split.first.getValueType();
22142     V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second);
22143   }
22144 
22145   bool UsePTEST = Subtarget.hasSSE41();
22146   if (UsePTEST) {
22147     MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
22148     V = DAG.getBitcast(TestVT, MaskBits(V));
22149     return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, V, V);
22150   }
22151 
22152   // Without PTEST, a masked v2i64 or-reduction is not faster than
22153   // scalarization.
22154   if (!Mask.isAllOnesValue() && VT.getScalarSizeInBits() > 32)
22155       return SDValue();
22156 
22157   V = DAG.getBitcast(MVT::v16i8, MaskBits(V));
22158   V = DAG.getNode(X86ISD::PCMPEQ, DL, MVT::v16i8, V,
22159                   getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
22160   V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
22161   return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
22162                      DAG.getConstant(0xFFFF, DL, MVT::i32));
22163 }
22164 
22165 // Check whether an OR'd reduction tree is PTEST-able, or if we can fallback to
22166 // CMP(MOVMSK(PCMPEQB(X,0))).
22167 static SDValue MatchVectorAllZeroTest(SDValue Op, ISD::CondCode CC,
22168                                       const SDLoc &DL,
22169                                       const X86Subtarget &Subtarget,
22170                                       SelectionDAG &DAG, SDValue &X86CC) {
22171   assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
22172 
22173   if (!Subtarget.hasSSE2() || !Op->hasOneUse())
22174     return SDValue();
22175 
22176   // Check whether we're masking/truncating an OR-reduction result, in which
22177   // case track the masked bits.
22178   APInt Mask = APInt::getAllOnesValue(Op.getScalarValueSizeInBits());
22179   switch (Op.getOpcode()) {
22180   case ISD::TRUNCATE: {
22181     SDValue Src = Op.getOperand(0);
22182     Mask = APInt::getLowBitsSet(Src.getScalarValueSizeInBits(),
22183                                 Op.getScalarValueSizeInBits());
22184     Op = Src;
22185     break;
22186   }
22187   case ISD::AND: {
22188     if (auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
22189       Mask = Cst->getAPIntValue();
22190       Op = Op.getOperand(0);
22191     }
22192     break;
22193   }
22194   }
22195 
22196   SmallVector<SDValue, 8> VecIns;
22197   if (Op.getOpcode() == ISD::OR && matchScalarReduction(Op, ISD::OR, VecIns)) {
22198     EVT VT = VecIns[0].getValueType();
22199     assert(llvm::all_of(VecIns,
22200                         [VT](SDValue V) { return VT == V.getValueType(); }) &&
22201            "Reduction source vector mismatch");
22202 
22203     // Quit if less than 128-bits or not splittable to 128/256-bit vector.
22204     if (VT.getSizeInBits() < 128 || !isPowerOf2_32(VT.getSizeInBits()))
22205       return SDValue();
22206 
22207     // If more than one full vector is evaluated, OR them first before PTEST.
22208     for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1;
22209          Slot += 2, e += 1) {
22210       // Each iteration will OR 2 nodes and append the result until there is
22211       // only 1 node left, i.e. the final OR'd value of all vectors.
22212       SDValue LHS = VecIns[Slot];
22213       SDValue RHS = VecIns[Slot + 1];
22214       VecIns.push_back(DAG.getNode(ISD::OR, DL, VT, LHS, RHS));
22215     }
22216 
22217     X86::CondCode CCode;
22218     if (SDValue V = LowerVectorAllZero(DL, VecIns.back(), CC, Mask, Subtarget,
22219                                        DAG, CCode)) {
22220       X86CC = DAG.getTargetConstant(CCode, DL, MVT::i8);
22221       return V;
22222     }
22223   }
22224 
22225   if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
22226     ISD::NodeType BinOp;
22227     if (SDValue Match =
22228             DAG.matchBinOpReduction(Op.getNode(), BinOp, {ISD::OR})) {
22229       X86::CondCode CCode;
22230       if (SDValue V =
22231               LowerVectorAllZero(DL, Match, CC, Mask, Subtarget, DAG, CCode)) {
22232         X86CC = DAG.getTargetConstant(CCode, DL, MVT::i8);
22233         return V;
22234       }
22235     }
22236   }
22237 
22238   return SDValue();
22239 }
22240 
22241 /// Return true if \c Op has a use that doesn't just read flags.
22242 static bool hasNonFlagsUse(SDValue Op) {
22243   for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
22244        ++UI) {
22245     SDNode *User = *UI;
22246     unsigned UOpNo = UI.getOperandNo();
22247     if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
22248       // Look past the truncate.
22249       UOpNo = User->use_begin().getOperandNo();
22250       User = *User->use_begin();
22251     }
22252 
22253     if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
22254         !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
22255       return true;
22256   }
22257   return false;
22258 }
22259 
22260 // Transform to an x86-specific ALU node with flags if there is a chance of
22261 // using an RMW op or only the flags are used. Otherwise, leave
22262 // the node alone and emit a 'cmp' or 'test' instruction.
22263 static bool isProfitableToUseFlagOp(SDValue Op) {
22264   for (SDNode *U : Op->uses())
22265     if (U->getOpcode() != ISD::CopyToReg &&
22266         U->getOpcode() != ISD::SETCC &&
22267         U->getOpcode() != ISD::STORE)
22268       return false;
22269 
22270   return true;
22271 }
22272 
22273 /// Emit nodes that will be selected as "test Op0,Op0", or something
22274 /// equivalent.
22275 static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
22276                         SelectionDAG &DAG, const X86Subtarget &Subtarget) {
22277   // CF and OF aren't always set the way we want. Determine which
22278   // of these we need.
22279   bool NeedCF = false;
22280   bool NeedOF = false;
22281   switch (X86CC) {
22282   default: break;
22283   case X86::COND_A: case X86::COND_AE:
22284   case X86::COND_B: case X86::COND_BE:
22285     NeedCF = true;
22286     break;
22287   case X86::COND_G: case X86::COND_GE:
22288   case X86::COND_L: case X86::COND_LE:
22289   case X86::COND_O: case X86::COND_NO: {
22290     // Check if we really need to set the
22291     // Overflow flag. If NoSignedWrap is present
22292     // that is not actually needed.
22293     switch (Op->getOpcode()) {
22294     case ISD::ADD:
22295     case ISD::SUB:
22296     case ISD::MUL:
22297     case ISD::SHL:
22298       if (Op.getNode()->getFlags().hasNoSignedWrap())
22299         break;
22300       LLVM_FALLTHROUGH;
22301     default:
22302       NeedOF = true;
22303       break;
22304     }
22305     break;
22306   }
22307   }
22308   // See if we can use the EFLAGS value from the operand instead of
22309   // doing a separate TEST. TEST always sets OF and CF to 0, so unless
22310   // we prove that the arithmetic won't overflow, we can't use OF or CF.
22311   if (Op.getResNo() != 0 || NeedOF || NeedCF) {
22312     // Emit a CMP with 0, which is the TEST pattern.
22313     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
22314                        DAG.getConstant(0, dl, Op.getValueType()));
22315   }
22316   unsigned Opcode = 0;
22317   unsigned NumOperands = 0;
22318 
22319   SDValue ArithOp = Op;
22320 
22321   // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
22322   // which may be the result of a CAST.  We use the variable 'Op', which is the
22323   // non-casted variable when we check for possible users.
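  // (Example: an (and X, Y) whose value feeds both a store and a setcc-with-0
  // can become X86ISD::AND, so its EFLAGS result replaces a separate TEST.)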
22324   switch (ArithOp.getOpcode()) {
22325   case ISD::AND:
22326     // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
22327     // because a TEST instruction will be better.
22328     if (!hasNonFlagsUse(Op))
22329       break;
22330 
22331     LLVM_FALLTHROUGH;
22332   case ISD::ADD:
22333   case ISD::SUB:
22334   case ISD::OR:
22335   case ISD::XOR:
22336     if (!isProfitableToUseFlagOp(Op))
22337       break;
22338 
22339     // Otherwise use a regular EFLAGS-setting instruction.
22340     switch (ArithOp.getOpcode()) {
22341     default: llvm_unreachable("unexpected operator!");
22342     case ISD::ADD: Opcode = X86ISD::ADD; break;
22343     case ISD::SUB: Opcode = X86ISD::SUB; break;
22344     case ISD::XOR: Opcode = X86ISD::XOR; break;
22345     case ISD::AND: Opcode = X86ISD::AND; break;
22346     case ISD::OR:  Opcode = X86ISD::OR;  break;
22347     }
22348 
22349     NumOperands = 2;
22350     break;
22351   case X86ISD::ADD:
22352   case X86ISD::SUB:
22353   case X86ISD::OR:
22354   case X86ISD::XOR:
22355   case X86ISD::AND:
22356     return SDValue(Op.getNode(), 1);
22357   case ISD::SSUBO:
22358   case ISD::USUBO: {
22359     // USUBO/SSUBO will become an X86ISD::SUB and we can use its Z flag.
22360     SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
22361     return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0),
22362                        Op->getOperand(1)).getValue(1);
22363   }
22364   default:
22365     break;
22366   }
22367 
22368   if (Opcode == 0) {
22369     // Emit a CMP with 0, which is the TEST pattern.
22370     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
22371                        DAG.getConstant(0, dl, Op.getValueType()));
22372   }
22373   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
22374   SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
22375 
22376   SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
22377   DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
22378   return SDValue(New.getNode(), 1);
22379 }
22380 
22381 /// Emit nodes that will be selected as "cmp Op0,Op1", or something
22382 /// equivalent.
22383 static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
22384                        const SDLoc &dl, SelectionDAG &DAG,
22385                        const X86Subtarget &Subtarget) {
22386   if (isNullConstant(Op1))
22387     return EmitTest(Op0, X86CC, dl, DAG, Subtarget);
22388 
22389   EVT CmpVT = Op0.getValueType();
22390 
22391   assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
22392           CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");
22393 
22394   // Only promote the compare up to I32 if it is a 16 bit operation
22395   // with an immediate.  16 bit immediates are to be avoided.
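  // (The usual reason: a 16-bit immediate forces a length-changing 0x66
  // operand-size prefix, which can stall the decoders on some Intel
  // microarchitectures, so a 32-bit compare is generally cheaper.)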
22396   if (CmpVT == MVT::i16 && !Subtarget.isAtom() &&
22397       !DAG.getMachineFunction().getFunction().hasMinSize()) {
22398     ConstantSDNode *COp0 = dyn_cast<ConstantSDNode>(Op0);
22399     ConstantSDNode *COp1 = dyn_cast<ConstantSDNode>(Op1);
22400     // Don't do this if the immediate can fit in 8-bits.
22401     if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) ||
22402         (COp1 && !COp1->getAPIntValue().isSignedIntN(8))) {
22403       unsigned ExtendOp =
22404           isX86CCSigned(X86CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
22405       if (X86CC == X86::COND_E || X86CC == X86::COND_NE) {
22406         // For equality comparisons, try to use SIGN_EXTEND if the input was
22407         // truncated from something with enough sign bits.
22408         if (Op0.getOpcode() == ISD::TRUNCATE) {
22409           SDValue In = Op0.getOperand(0);
22410           unsigned EffBits =
22411               In.getScalarValueSizeInBits() - DAG.ComputeNumSignBits(In) + 1;
22412           if (EffBits <= 16)
22413             ExtendOp = ISD::SIGN_EXTEND;
22414         } else if (Op1.getOpcode() == ISD::TRUNCATE) {
22415           SDValue In = Op1.getOperand(0);
22416           unsigned EffBits =
22417               In.getScalarValueSizeInBits() - DAG.ComputeNumSignBits(In) + 1;
22418           if (EffBits <= 16)
22419             ExtendOp = ISD::SIGN_EXTEND;
22420         }
22421       }
22422 
22423       CmpVT = MVT::i32;
22424       Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0);
22425       Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1);
22426     }
22427   }
22428 
22429   // Try to shrink i64 compares if the input has enough zero bits.
22430   // FIXME: Do this for non-constant compares or when the constant is on the LHS?
22431   if (CmpVT == MVT::i64 && isa<ConstantSDNode>(Op1) && !isX86CCSigned(X86CC) &&
22432       Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
22433       cast<ConstantSDNode>(Op1)->getAPIntValue().getActiveBits() <= 32 &&
22434       DAG.MaskedValueIsZero(Op0, APInt::getHighBitsSet(64, 32))) {
22435     CmpVT = MVT::i32;
22436     Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
22437     Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
22438   }
22439 
22440   // 0-x == y --> x+y == 0
22441   // 0-x != y --> x+y != 0
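  // (The X86ISD::ADD below produces EFLAGS as its second result; ZF is set
  // exactly when x + y == 0, so it can replace the CMP against zero.)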
22442   if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op0.getOperand(0)) &&
22443       Op0.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
22444     SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
22445     SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(1), Op1);
22446     return Add.getValue(1);
22447   }
22448 
22449   // x == 0-y --> x+y == 0
22450   // x != 0-y --> x+y != 0
22451   if (Op1.getOpcode() == ISD::SUB && isNullConstant(Op1.getOperand(0)) &&
22452       Op1.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
22453     SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
22454     SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0, Op1.getOperand(1));
22455     return Add.getValue(1);
22456   }
22457 
22458   // Use SUB instead of CMP to enable CSE between SUB and CMP.
22459   SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
22460   SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
22461   return Sub.getValue(1);
22462 }
22463 
22464 /// Check if replacement of SQRT with RSQRT should be disabled.
22465 bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
22466   EVT VT = Op.getValueType();
22467 
22468   // We never want to use both SQRT and RSQRT instructions for the same input.
22469   if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
22470     return false;
22471 
22472   if (VT.isVector())
22473     return Subtarget.hasFastVectorFSQRT();
22474   return Subtarget.hasFastScalarFSQRT();
22475 }
22476 
22477 /// The minimum architected relative accuracy is 2^-12. We need one
22478 /// Newton-Raphson step to have a good float result (24 bits of precision).
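/// One Newton-Raphson step for 1/sqrt(a), x1 = x0 * (1.5 - 0.5 * a * x0 * x0),
/// roughly doubles the number of correct bits, taking the ~12-bit hardware
/// estimate to the ~24 bits a float needs.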
22479 SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
22480                                            SelectionDAG &DAG, int Enabled,
22481                                            int &RefinementSteps,
22482                                            bool &UseOneConstNR,
22483                                            bool Reciprocal) const {
22484   EVT VT = Op.getValueType();
22485 
22486   // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
22487   // It is likely not profitable to do this for f64 because a double-precision
22488   // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
22489   // instructions: convert to single, rsqrtss, convert back to double, refine
22490   // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
22491   // along with FMA, this could be a throughput win.
22492   // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
22493   // after type legalization.
22494   if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
22495       (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
22496       (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
22497       (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
22498       (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
22499     if (RefinementSteps == ReciprocalEstimate::Unspecified)
22500       RefinementSteps = 1;
22501 
22502     UseOneConstNR = false;
22503     // There is no FRSQRT for 512 bits, but there is RSQRT14.
22504     unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
22505     return DAG.getNode(Opcode, SDLoc(Op), VT, Op);
22506   }
22507   return SDValue();
22508 }
22509 
22510 /// The minimum architected relative accuracy is 2^-12. We need one
22511 /// Newton-Raphson step to have a good float result (24 bits of precision).
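/// One Newton-Raphson step for 1/a, x1 = x0 * (2 - a * x0), likewise roughly
/// doubles the number of correct bits of the ~12-bit hardware estimate.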
22512 SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
22513                                             int Enabled,
22514                                             int &RefinementSteps) const {
22515   EVT VT = Op.getValueType();
22516 
22517   // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
22518   // It is likely not profitable to do this for f64 because a double-precision
22519   // reciprocal estimate with refinement on x86 prior to FMA requires
22520   // 15 instructions: convert to single, rcpss, convert back to double, refine
22521   // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
22522   // along with FMA, this could be a throughput win.
22523 
22524   if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
22525       (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
22526       (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
22527       (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
22528     // Enable estimate codegen with 1 refinement step for vector division.
22529     // Scalar division estimates are disabled because they break too much
22530     // real-world code. These defaults are intended to match GCC behavior.
22531     if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
22532       return SDValue();
22533 
22534     if (RefinementSteps == ReciprocalEstimate::Unspecified)
22535       RefinementSteps = 1;
22536 
22537     // There is no FRCP for 512 bits, but there is RCP14.
22538     unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;
22539     return DAG.getNode(Opcode, SDLoc(Op), VT, Op);
22540   }
22541   return SDValue();
22542 }
22543 
22544 /// If we have at least two divisions that use the same divisor, convert to
22545 /// multiplication by a reciprocal. This may need to be adjusted for a given
22546 /// CPU if a division's cost is not at least twice the cost of a multiplication.
22547 /// This is because we still need one division to calculate the reciprocal and
22548 /// then we need two multiplies by that reciprocal as replacements for the
22549 /// original divisions.
22550 unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
22551   return 2;
22552 }
22553 
22554 SDValue
22555 X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
22556                                  SelectionDAG &DAG,
22557                                  SmallVectorImpl<SDNode *> &Created) const {
22558   AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
22559   if (isIntDivCheap(N->getValueType(0), Attr))
22560     return SDValue(N,0); // Lower SDIV as SDIV
22561 
22562   assert((Divisor.isPowerOf2() || (-Divisor).isPowerOf2()) &&
22563          "Unexpected divisor!");
22564 
22565   // Only perform this transform if CMOV is supported, otherwise the select
22566   // below will become a branch.
22567   if (!Subtarget.hasCMov())
22568     return SDValue();
22569 
22570   // fold (sdiv X, pow2)
22571   EVT VT = N->getValueType(0);
22572   // FIXME: Support i8.
22573   if (VT != MVT::i16 && VT != MVT::i32 &&
22574       !(Subtarget.is64Bit() && VT == MVT::i64))
22575     return SDValue();
22576 
22577   unsigned Lg2 = Divisor.countTrailingZeros();
22578 
22579   // If the divisor is 2 or -2, the default expansion is better.
22580   if (Lg2 == 1)
22581     return SDValue();
22582 
22583   SDLoc DL(N);
22584   SDValue N0 = N->getOperand(0);
22585   SDValue Zero = DAG.getConstant(0, DL, VT);
22586   APInt Lg2Mask = APInt::getLowBitsSet(VT.getSizeInBits(), Lg2);
22587   SDValue Pow2MinusOne = DAG.getConstant(Lg2Mask, DL, VT);
22588 
22589   // If N0 is negative, we need to add (Pow2 - 1) to it before shifting right.
22590   SDValue Cmp = DAG.getSetCC(DL, MVT::i8, N0, Zero, ISD::SETLT);
22591   SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
22592   SDValue CMov = DAG.getNode(ISD::SELECT, DL, VT, Cmp, Add, N0);
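  // For example, X sdiv 8 becomes (X < 0 ? X + 7 : X) >> 3, with a final
  // negation below when the divisor is -8.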
22593 
22594   Created.push_back(Cmp.getNode());
22595   Created.push_back(Add.getNode());
22596   Created.push_back(CMov.getNode());
22597 
22598   // Divide by pow2.
22599   SDValue SRA =
22600       DAG.getNode(ISD::SRA, DL, VT, CMov, DAG.getConstant(Lg2, DL, MVT::i8));
22601 
22602   // If we're dividing by a positive value, we're done.  Otherwise, we must
22603   // negate the result.
22604   if (Divisor.isNonNegative())
22605     return SRA;
22606 
22607   Created.push_back(SRA.getNode());
22608   return DAG.getNode(ISD::SUB, DL, VT, Zero, SRA);
22609 }
22610 
22611 /// Result of 'and' is compared against zero. Change to a BT node if possible.
22612 /// Returns the BT node and the condition code needed to use it.
22613 static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
22614                             const SDLoc &dl, SelectionDAG &DAG,
22615                             SDValue &X86CC) {
22616   assert(And.getOpcode() == ISD::AND && "Expected AND node!");
22617   SDValue Op0 = And.getOperand(0);
22618   SDValue Op1 = And.getOperand(1);
22619   if (Op0.getOpcode() == ISD::TRUNCATE)
22620     Op0 = Op0.getOperand(0);
22621   if (Op1.getOpcode() == ISD::TRUNCATE)
22622     Op1 = Op1.getOperand(0);
22623 
22624   SDValue Src, BitNo;
22625   if (Op1.getOpcode() == ISD::SHL)
22626     std::swap(Op0, Op1);
22627   if (Op0.getOpcode() == ISD::SHL) {
22628     if (isOneConstant(Op0.getOperand(0))) {
22629       // If we looked past a truncate, check that it's only truncating away
22630       // known zeros.
22631       unsigned BitWidth = Op0.getValueSizeInBits();
22632       unsigned AndBitWidth = And.getValueSizeInBits();
22633       if (BitWidth > AndBitWidth) {
22634         KnownBits Known = DAG.computeKnownBits(Op0);
22635         if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
22636           return SDValue();
22637       }
22638       Src = Op1;
22639       BitNo = Op0.getOperand(1);
22640     }
22641   } else if (Op1.getOpcode() == ISD::Constant) {
22642     ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
22643     uint64_t AndRHSVal = AndRHS->getZExtValue();
22644     SDValue AndLHS = Op0;
22645 
22646     if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
22647       Src = AndLHS.getOperand(0);
22648       BitNo = AndLHS.getOperand(1);
22649     } else {
22650       // Use BT if the immediate can't be encoded in a TEST instruction or we
22651       // are optimizing for size and the immediate won't fit in a byte.
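      // For example, (X & (1ull << 40)) != 0 has a mask that does not fit in a
      // 32-bit TEST immediate, so it is better lowered as BT X, 40.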
22652       bool OptForSize = DAG.shouldOptForSize();
22653       if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
22654           isPowerOf2_64(AndRHSVal)) {
22655         Src = AndLHS;
22656         BitNo = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl,
22657                                 Src.getValueType());
22658       }
22659     }
22660   }
22661 
22662   // No patterns found, give up.
22663   if (!Src.getNode())
22664     return SDValue();
22665 
22666   // If Src is i8, promote it to i32 with any_extend.  There is no i8 BT
22667   // instruction.  Since the shift amount is in-range-or-undefined, we know
22668   // that doing a bittest on the i32 value is ok.  We extend to i32 because
22669   // the encoding for the i16 version is larger than the i32 version.
22670   // Also promote i16 to i32 for performance / code size reasons.
22671   if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16)
22672     Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src);
22673 
22674   // See if we can use the 32-bit instruction instead of the 64-bit one for a
22675   // shorter encoding. Since the former takes the modulo 32 of BitNo and the
22676   // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
22677   // known to be zero.
22678   if (Src.getValueType() == MVT::i64 &&
22679       DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
22680     Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
22681 
22682   // If the operand types disagree, extend the shift amount to match.  Since
22683   // BT ignores high bits (like shifts) we can use anyextend.
22684   if (Src.getValueType() != BitNo.getValueType())
22685     BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo);
22686 
22687   X86CC = DAG.getTargetConstant(CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B,
22688                                 dl, MVT::i8);
22689   return DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo);
22690 }
22691 
22692 /// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
22693 /// CMPs.
22694 static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
22695                                    SDValue &Op1, bool &IsAlwaysSignaling) {
22696   unsigned SSECC;
22697   bool Swap = false;
22698 
22699   // SSE Condition code mapping:
22700   //  0 - EQ
22701   //  1 - LT
22702   //  2 - LE
22703   //  3 - UNORD
22704   //  4 - NEQ
22705   //  5 - NLT
22706   //  6 - NLE
22707   //  7 - ORD
22708   switch (SetCCOpcode) {
22709   default: llvm_unreachable("Unexpected SETCC condition");
22710   case ISD::SETOEQ:
22711   case ISD::SETEQ:  SSECC = 0; break;
22712   case ISD::SETOGT:
22713   case ISD::SETGT:  Swap = true; LLVM_FALLTHROUGH;
22714   case ISD::SETLT:
22715   case ISD::SETOLT: SSECC = 1; break;
22716   case ISD::SETOGE:
22717   case ISD::SETGE:  Swap = true; LLVM_FALLTHROUGH;
22718   case ISD::SETLE:
22719   case ISD::SETOLE: SSECC = 2; break;
22720   case ISD::SETUO:  SSECC = 3; break;
22721   case ISD::SETUNE:
22722   case ISD::SETNE:  SSECC = 4; break;
22723   case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
22724   case ISD::SETUGE: SSECC = 5; break;
22725   case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
22726   case ISD::SETUGT: SSECC = 6; break;
22727   case ISD::SETO:   SSECC = 7; break;
22728   case ISD::SETUEQ: SSECC = 8; break;
22729   case ISD::SETONE: SSECC = 12; break;
22730   }
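  // SSECC values 8 (EQ_UQ) and 12 (NEQ_OQ) only exist as AVX VCMP predicates;
  // without AVX they cannot be encoded directly, so callers must handle them
  // specially (e.g. by expanding into two compares combined with FOR/FAND).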
22731   if (Swap)
22732     std::swap(Op0, Op1);
22733 
22734   switch (SetCCOpcode) {
22735   default:
22736     IsAlwaysSignaling = true;
22737     break;
22738   case ISD::SETEQ:
22739   case ISD::SETOEQ:
22740   case ISD::SETUEQ:
22741   case ISD::SETNE:
22742   case ISD::SETONE:
22743   case ISD::SETUNE:
22744   case ISD::SETO:
22745   case ISD::SETUO:
22746     IsAlwaysSignaling = false;
22747     break;
22748   }
22749 
22750   return SSECC;
22751 }
22752 
22753 /// Break a 256-bit integer VSETCC into two new 128-bit ones and then
22754 /// concatenate the result back.
22755 static SDValue splitIntVSETCC(EVT VT, SDValue LHS, SDValue RHS,
22756                               ISD::CondCode Cond, SelectionDAG &DAG,
22757                               const SDLoc &dl) {
22758   assert(VT.isInteger() && VT == LHS.getValueType() &&
22759          VT == RHS.getValueType() && "Unsupported VTs!");
22760 
22761   SDValue CC = DAG.getCondCode(Cond);
22762 
22763   // Extract the LHS Lo/Hi vectors
22764   SDValue LHS1, LHS2;
22765   std::tie(LHS1, LHS2) = splitVector(LHS, DAG, dl);
22766 
22767   // Extract the RHS Lo/Hi vectors
22768   SDValue RHS1, RHS2;
22769   std::tie(RHS1, RHS2) = splitVector(RHS, DAG, dl);
22770 
22771   // Issue the operation on the smaller types and concatenate the result back
22772   EVT LoVT, HiVT;
22773   std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
22774   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
22775                      DAG.getNode(ISD::SETCC, dl, LoVT, LHS1, RHS1, CC),
22776                      DAG.getNode(ISD::SETCC, dl, HiVT, LHS2, RHS2, CC));
22777 }
22778 
22779 static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
22780 
22781   SDValue Op0 = Op.getOperand(0);
22782   SDValue Op1 = Op.getOperand(1);
22783   SDValue CC = Op.getOperand(2);
22784   MVT VT = Op.getSimpleValueType();
22785   SDLoc dl(Op);
22786 
22787   assert(VT.getVectorElementType() == MVT::i1 &&
22788          "Cannot set masked compare for this operation");
22789 
22790   ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
22791 
22792   // Prefer SETGT over SETLT.
22793   if (SetCCOpcode == ISD::SETLT) {
22794     SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
22795     std::swap(Op0, Op1);
22796   }
22797 
22798   return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
22799 }
22800 
22801 /// Given a buildvector constant, return a new vector constant with each element
22802 /// incremented or decremented. If incrementing or decrementing would result in
22803 /// unsigned overflow or underflow or this is not a simple vector constant,
22804 /// return an empty value.
22805 static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc) {
22806   auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
22807   if (!BV)
22808     return SDValue();
22809 
22810   MVT VT = V.getSimpleValueType();
22811   MVT EltVT = VT.getVectorElementType();
22812   unsigned NumElts = VT.getVectorNumElements();
22813   SmallVector<SDValue, 8> NewVecC;
22814   SDLoc DL(V);
22815   for (unsigned i = 0; i < NumElts; ++i) {
22816     auto *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
22817     if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
22818       return SDValue();
22819 
22820     // Avoid overflow/underflow.
22821     const APInt &EltC = Elt->getAPIntValue();
22822     if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isNullValue()))
22823       return SDValue();
22824 
22825     NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT));
22826   }
22827 
22828   return DAG.getBuildVector(VT, DL, NewVecC);
22829 }
22830 
22831 /// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
22832 /// Op0 u<= Op1:
22833 ///   t = psubus Op0, Op1
22834 ///   pcmpeq t, <0..0>
22835 static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
22836                                     ISD::CondCode Cond, const SDLoc &dl,
22837                                     const X86Subtarget &Subtarget,
22838                                     SelectionDAG &DAG) {
22839   if (!Subtarget.hasSSE2())
22840     return SDValue();
22841 
22842   MVT VET = VT.getVectorElementType();
22843   if (VET != MVT::i8 && VET != MVT::i16)
22844     return SDValue();
22845 
22846   switch (Cond) {
22847   default:
22848     return SDValue();
22849   case ISD::SETULT: {
22850     // If the comparison is against a constant, we can turn this into a
22851     // setule.  With psubus, setule does not require a swap.  This is
22852     // beneficial because the constant in the register is no longer
22853     // clobbered as the destination, so it can be hoisted out of a loop.
22854     // Only do this pre-AVX, since AVX compares are non-destructive.
22855     if (Subtarget.hasAVX())
22856       return SDValue();
22857     SDValue ULEOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/false);
22858     if (!ULEOp1)
22859       return SDValue();
22860     Op1 = ULEOp1;
22861     break;
22862   }
22863   case ISD::SETUGT: {
22864     // If the comparison is against a constant, we can turn this into a setuge.
22865     // This is beneficial because materializing a constant 0 for the PCMPEQ is
22866     // probably cheaper than XOR+PCMPGT using 2 different vector constants:
22867     // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
22868     SDValue UGEOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/true);
22869     if (!UGEOp1)
22870       return SDValue();
22871     Op1 = Op0;
22872     Op0 = UGEOp1;
22873     break;
22874   }
22875   // Psubus is better than flip-sign because it requires no inversion.
22876   case ISD::SETUGE:
22877     std::swap(Op0, Op1);
22878     break;
22879   case ISD::SETULE:
22880     break;
22881   }
22882 
22883   SDValue Result = DAG.getNode(ISD::USUBSAT, dl, VT, Op0, Op1);
22884   return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
22885                      DAG.getConstant(0, dl, VT));
22886 }
22887 
22888 static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
22889                            SelectionDAG &DAG) {
22890   bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
22891                   Op.getOpcode() == ISD::STRICT_FSETCCS;
22892   SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
22893   SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
22894   SDValue CC = Op.getOperand(IsStrict ? 3 : 2);
22895   MVT VT = Op->getSimpleValueType(0);
22896   ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
22897   bool isFP = Op1.getSimpleValueType().isFloatingPoint();
22898   SDLoc dl(Op);
22899 
22900   if (isFP) {
22901 #ifndef NDEBUG
22902     MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
22903     assert(EltVT == MVT::f32 || EltVT == MVT::f64);
22904 #endif
22905 
22906     bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
22907     SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
22908 
22909     // If we have a strict compare with a vXi1 result and the input is 128/256
22910     // bits, we can't use a masked compare unless we have VLX. If we use a wider
22911     // compare like we do for non-strict, we might trigger spurious exceptions
22912     // from the upper elements. Instead emit an AVX compare and convert to mask.
22913     unsigned Opc;
22914     if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1 &&
22915         (!IsStrict || Subtarget.hasVLX() ||
22916          Op0.getSimpleValueType().is512BitVector())) {
22917       assert(VT.getVectorNumElements() <= 16);
22918       Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM;
22919     } else {
22920       Opc = IsStrict ? X86ISD::STRICT_CMPP : X86ISD::CMPP;
22921       // The SSE/AVX packed FP comparison nodes are defined with a
22922       // floating-point vector result that matches the operand type. This allows
22923       // them to work with an SSE1 target (integer vector types are not legal).
22924       VT = Op0.getSimpleValueType();
22925     }
22926 
22927     SDValue Cmp;
22928     bool IsAlwaysSignaling;
22929     unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1, IsAlwaysSignaling);
22930     if (!Subtarget.hasAVX()) {
22931       // TODO: We could use the following steps to handle a quiet compare with
22932       // signaling encodings.
22933       // 1. Get ordered masks from a quiet ISD::SETO
22934       // 2. Use the masks to mask potential unordered elements in operands A and B
22935       // 3. Get the compare results of the masked A and B
22936       // 4. Calculate the final result using the mask and the result from step 3
22937       // But currently we just fall back to scalar operations.
22938       if (IsStrict && IsAlwaysSignaling && !IsSignaling)
22939         return SDValue();
22940 
22941       // Insert an extra signaling instruction to raise exception.
22942       if (IsStrict && !IsAlwaysSignaling && IsSignaling) {
22943         SDValue SignalCmp = DAG.getNode(
22944             Opc, dl, {VT, MVT::Other},
22945             {Chain, Op0, Op1, DAG.getTargetConstant(1, dl, MVT::i8)}); // LT_OS
22946         // FIXME: It seems we need to update the flags of all new strict nodes.
22947         // Otherwise, mayRaiseFPException in MI will return false due to
22948         // NoFPExcept = false by default. However, I didn't find it in other
22949         // patches.
22950         SignalCmp->setFlags(Op->getFlags());
22951         Chain = SignalCmp.getValue(1);
22952       }
22953 
22954       // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
22955       // emit two comparisons and a logic op to tie them together.
22956       if (SSECC >= 8) {
22957         // LLVM predicate is SETUEQ or SETONE.
22958         unsigned CC0, CC1;
22959         unsigned CombineOpc;
22960         if (Cond == ISD::SETUEQ) {
22961           CC0 = 3; // UNORD
22962           CC1 = 0; // EQ
22963           CombineOpc = X86ISD::FOR;
22964         } else {
22965           assert(Cond == ISD::SETONE);
22966           CC0 = 7; // ORD
22967           CC1 = 4; // NEQ
22968           CombineOpc = X86ISD::FAND;
22969         }
22970 
22971         SDValue Cmp0, Cmp1;
22972         if (IsStrict) {
22973           Cmp0 = DAG.getNode(
22974               Opc, dl, {VT, MVT::Other},
22975               {Chain, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8)});
22976           Cmp1 = DAG.getNode(
22977               Opc, dl, {VT, MVT::Other},
22978               {Chain, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)});
22979           Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Cmp0.getValue(1),
22980                               Cmp1.getValue(1));
22981         } else {
22982           Cmp0 = DAG.getNode(
22983               Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8));
22984           Cmp1 = DAG.getNode(
22985               Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8));
22986         }
22987         Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
22988       } else {
22989         if (IsStrict) {
22990           Cmp = DAG.getNode(
22991               Opc, dl, {VT, MVT::Other},
22992               {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
22993           Chain = Cmp.getValue(1);
22994         } else
22995           Cmp = DAG.getNode(
22996               Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
22997       }
22998     } else {
22999       // Handle all other FP comparisons here.
23000       if (IsStrict) {
23001         // Set bit 4 of the AVX CC to flip between the quiet and signaling
23001         // variants when the requested behavior differs from the default.
23002         SSECC |= (IsAlwaysSignaling ^ IsSignaling) << 4;
23003         Cmp = DAG.getNode(
23004             Opc, dl, {VT, MVT::Other},
23005             {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
23006         Chain = Cmp.getValue(1);
23007       } else
23008         Cmp = DAG.getNode(
23009             Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
23010     }
23011 
23012     if (VT.getFixedSizeInBits() >
23013         Op.getSimpleValueType().getFixedSizeInBits()) {
23014       // We emitted a compare with an XMM/YMM result. Finish converting to a
23015       // mask register using a vptestm.
23016       EVT CastVT = EVT(VT).changeVectorElementTypeToInteger();
23017       Cmp = DAG.getBitcast(CastVT, Cmp);
23018       Cmp = DAG.getSetCC(dl, Op.getSimpleValueType(), Cmp,
23019                          DAG.getConstant(0, dl, CastVT), ISD::SETNE);
23020     } else {
23021       // If this is SSE/AVX CMPP, bitcast the result back to integer to match
23022       // the result type of SETCC. The bitcast is expected to be optimized
23023       // away during combining/isel.
23024       Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
23025     }
23026 
23027     if (IsStrict)
23028       return DAG.getMergeValues({Cmp, Chain}, dl);
23029 
23030     return Cmp;
23031   }
23032 
23033   assert(!IsStrict && "Strict SETCC only handles FP operands.");
23034 
23035   MVT VTOp0 = Op0.getSimpleValueType();
23036   (void)VTOp0;
23037   assert(VTOp0 == Op1.getSimpleValueType() &&
23038          "Expected operands with same type!");
23039   assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
23040          "Invalid number of packed elements for source and destination!");
23041 
23042   // The non-AVX512 code below works under the assumption that source and
23043   // destination types are the same.
23044   assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
23045          "Value types for source and destination must be the same!");
23046 
23047   // The result is boolean, but operands are int/float
23048   if (VT.getVectorElementType() == MVT::i1) {
23049     // In the AVX-512 architecture, setcc returns a mask with i1 elements,
23050     // but there is no compare instruction for i8 and i16 elements in KNL.
23051     assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
23052            "Unexpected operand type");
23053     return LowerIntVSETCC_AVX512(Op, DAG);
23054   }
23055 
23056   // Lower using XOP integer comparisons.
23057   if (VT.is128BitVector() && Subtarget.hasXOP()) {
23058     // Translate compare code to XOP PCOM compare mode.
23059     unsigned CmpMode = 0;
23060     switch (Cond) {
23061     default: llvm_unreachable("Unexpected SETCC condition");
23062     case ISD::SETULT:
23063     case ISD::SETLT: CmpMode = 0x00; break;
23064     case ISD::SETULE:
23065     case ISD::SETLE: CmpMode = 0x01; break;
23066     case ISD::SETUGT:
23067     case ISD::SETGT: CmpMode = 0x02; break;
23068     case ISD::SETUGE:
23069     case ISD::SETGE: CmpMode = 0x03; break;
23070     case ISD::SETEQ: CmpMode = 0x04; break;
23071     case ISD::SETNE: CmpMode = 0x05; break;
23072     }
23073 
23074     // Are we comparing unsigned or signed integers?
23075     unsigned Opc =
23076         ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
23077 
23078     return DAG.getNode(Opc, dl, VT, Op0, Op1,
23079                        DAG.getTargetConstant(CmpMode, dl, MVT::i8));
23080   }
23081 
23082   // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
23083   // Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
23084   if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
23085     SDValue BC0 = peekThroughBitcasts(Op0);
23086     if (BC0.getOpcode() == ISD::AND) {
23087       APInt UndefElts;
23088       SmallVector<APInt, 64> EltBits;
23089       if (getTargetConstantBitsFromNode(BC0.getOperand(1),
23090                                         VT.getScalarSizeInBits(), UndefElts,
23091                                         EltBits, false, false)) {
23092         if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) {
23093           Cond = ISD::SETEQ;
23094           Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
23095         }
23096       }
23097     }
23098   }
23099 
23100   // ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2.
23101   if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND &&
23102       Op0.getOperand(1) == Op1 && Op0.hasOneUse()) {
23103     ConstantSDNode *C1 = isConstOrConstSplat(Op1);
23104     if (C1 && C1->getAPIntValue().isPowerOf2()) {
23105       unsigned BitWidth = VT.getScalarSizeInBits();
23106       unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1;
23107 
23108       SDValue Result = Op0.getOperand(0);
23109       Result = DAG.getNode(ISD::SHL, dl, VT, Result,
23110                            DAG.getConstant(ShiftAmt, dl, VT));
23111       Result = DAG.getNode(ISD::SRA, dl, VT, Result,
23112                            DAG.getConstant(BitWidth - 1, dl, VT));
23113       return Result;
23114     }
23115   }
23116 
23117   // Break 256-bit integer vector compare into smaller ones.
23118   if (VT.is256BitVector() && !Subtarget.hasInt256())
23119     return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
23120 
23121   if (VT == MVT::v32i16 || VT == MVT::v64i8) {
23122     assert(!Subtarget.hasBWI() && "Unexpected VT with AVX512BW!");
23123     return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
23124   }
23125 
23126   // If we have a limit constant, try to form PCMPGT (signed cmp) to avoid
23127   // not-of-PCMPEQ:
23128   // X != INT_MIN --> X >s INT_MIN
23129   // X != INT_MAX --> X <s INT_MAX --> INT_MAX >s X
23130   // +X != 0 --> +X >s 0
23131   APInt ConstValue;
23132   if (Cond == ISD::SETNE &&
23133       ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
23134     if (ConstValue.isMinSignedValue())
23135       Cond = ISD::SETGT;
23136     else if (ConstValue.isMaxSignedValue())
23137       Cond = ISD::SETLT;
23138     else if (ConstValue.isNullValue() && DAG.SignBitIsZero(Op0))
23139       Cond = ISD::SETGT;
23140   }
23141 
23142   // If both operands are known non-negative, then an unsigned compare is the
23143   // same as a signed compare and there's no need to flip signbits.
23144   // TODO: We could check for more general simplifications here since we're
23145   // computing known bits.
23146   bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
23147                    !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
23148 
23149   // Special case: Use min/max operations for unsigned compares.
23150   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23151   if (ISD::isUnsignedIntSetCC(Cond) &&
23152       (FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
23153       TLI.isOperationLegal(ISD::UMIN, VT)) {
23154     // If we have a constant operand, increment/decrement it and change the
23155     // condition to avoid an invert.
23156     if (Cond == ISD::SETUGT) {
23157       // X > C --> X >= (C+1) --> X == umax(X, C+1)
23158       if (SDValue UGTOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/true)) {
23159         Op1 = UGTOp1;
23160         Cond = ISD::SETUGE;
23161       }
23162     }
23163     if (Cond == ISD::SETULT) {
23164       // X < C --> X <= (C-1) --> X == umin(X, C-1)
23165       if (SDValue ULTOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/false)) {
23166         Op1 = ULTOp1;
23167         Cond = ISD::SETULE;
23168       }
23169     }
23170     bool Invert = false;
23171     unsigned Opc;
23172     switch (Cond) {
23173     default: llvm_unreachable("Unexpected condition code");
23174     case ISD::SETUGT: Invert = true; LLVM_FALLTHROUGH;
23175     case ISD::SETULE: Opc = ISD::UMIN; break;
23176     case ISD::SETULT: Invert = true; LLVM_FALLTHROUGH;
23177     case ISD::SETUGE: Opc = ISD::UMAX; break;
23178     }
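    // For example, X u<= Y is equivalent to umin(X, Y) == X, which the
    // UMIN + PCMPEQ pair below computes directly.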
23179 
23180     SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
23181     Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
23182 
23183     // If the logical-not of the result is required, perform that now.
23184     if (Invert)
23185       Result = DAG.getNOT(dl, Result, VT);
23186 
23187     return Result;
23188   }
23189 
23190   // Try to use SUBUS and PCMPEQ.
23191   if (FlipSigns)
23192     if (SDValue V =
23193             LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
23194       return V;
23195 
23196   // We are handling one of the integer comparisons here. Since SSE only has
23197   // GT and EQ comparisons for integers, swapping operands and multiple
23198   // operations may be required for some comparisons.
23199   unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
23200                                                             : X86ISD::PCMPGT;
23201   bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
23202               Cond == ISD::SETGE || Cond == ISD::SETUGE;
23203   bool Invert = Cond == ISD::SETNE ||
23204                 (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
23205 
23206   if (Swap)
23207     std::swap(Op0, Op1);
23208 
23209   // Check that the operation in question is available (most are plain SSE2,
23210   // but PCMPGTQ and PCMPEQQ have different requirements).
23211   if (VT == MVT::v2i64) {
23212     if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
23213       assert(Subtarget.hasSSE2() && "Don't know how to lower!");
23214 
23215       // Special case for sign bit test. We can use a v4i32 PCMPGT and shuffle
23216       // the odd elements over the even elements.
23217       if (!FlipSigns && !Invert && ISD::isBuildVectorAllZeros(Op0.getNode())) {
23218         Op0 = DAG.getConstant(0, dl, MVT::v4i32);
23219         Op1 = DAG.getBitcast(MVT::v4i32, Op1);
23220 
23221         SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
23222         static const int MaskHi[] = { 1, 1, 3, 3 };
23223         SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
23224 
23225         return DAG.getBitcast(VT, Result);
23226       }
23227 
23228       if (!FlipSigns && !Invert && ISD::isBuildVectorAllOnes(Op1.getNode())) {
23229         Op0 = DAG.getBitcast(MVT::v4i32, Op0);
23230         Op1 = DAG.getConstant(-1, dl, MVT::v4i32);
23231 
23232         SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
23233         static const int MaskHi[] = { 1, 1, 3, 3 };
23234         SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
23235 
23236         return DAG.getBitcast(VT, Result);
23237       }
23238 
23239       // Since SSE has no unsigned integer comparisons, we need to flip the sign
23240       // bits of the inputs before performing those operations. The lower
23241       // compare is always unsigned.
23242       SDValue SB;
23243       if (FlipSigns) {
23244         SB = DAG.getConstant(0x8000000080000000ULL, dl, MVT::v2i64);
23245       } else {
23246         SB = DAG.getConstant(0x0000000080000000ULL, dl, MVT::v2i64);
23247       }
23248       Op0 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op0, SB);
23249       Op1 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op1, SB);
23250 
23251       // Cast everything to the right type.
23252       Op0 = DAG.getBitcast(MVT::v4i32, Op0);
23253       Op1 = DAG.getBitcast(MVT::v4i32, Op1);
23254 
23255       // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
23256       SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
23257       SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
23258 
23259       // Create masks for only the low parts/high parts of the 64 bit integers.
23260       static const int MaskHi[] = { 1, 1, 3, 3 };
23261       static const int MaskLo[] = { 0, 0, 2, 2 };
23262       SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
23263       SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
23264       SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
23265 
23266       SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
23267       Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
23268 
23269       if (Invert)
23270         Result = DAG.getNOT(dl, Result, MVT::v4i32);
23271 
23272       return DAG.getBitcast(VT, Result);
23273     }
23274 
23275     if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
23276       // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
23277       // pcmpeqd + pshufd + pand.
23278       assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
23279 
23280       // First cast everything to the right type.
23281       Op0 = DAG.getBitcast(MVT::v4i32, Op0);
23282       Op1 = DAG.getBitcast(MVT::v4i32, Op1);
23283 
23284       // Do the compare.
23285       SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
23286 
23287       // Make sure the lower and upper halves are both all-ones.
23288       static const int Mask[] = { 1, 0, 3, 2 };
23289       SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
23290       Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
23291 
23292       if (Invert)
23293         Result = DAG.getNOT(dl, Result, MVT::v4i32);
23294 
23295       return DAG.getBitcast(VT, Result);
23296     }
23297   }
23298 
23299   // Since SSE has no unsigned integer comparisons, we need to flip the sign
23300   // bits of the inputs before performing those operations.
23301   if (FlipSigns) {
23302     MVT EltVT = VT.getVectorElementType();
23303     SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
23304                                  VT);
23305     Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
23306     Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
23307   }
23308 
23309   SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
23310 
23311   // If the logical-not of the result is required, perform that now.
23312   if (Invert)
23313     Result = DAG.getNOT(dl, Result, VT);
23314 
23315   return Result;
23316 }
23317 
23318 // Try to select this as a KORTEST+SETCC or KTEST+SETCC if possible.
23319 static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC,
23320                               const SDLoc &dl, SelectionDAG &DAG,
23321                               const X86Subtarget &Subtarget,
23322                               SDValue &X86CC) {
23323   // Only support equality comparisons.
23324   if (CC != ISD::SETEQ && CC != ISD::SETNE)
23325     return SDValue();
23326 
23327   // Must be a bitcast from vXi1.
23328   if (Op0.getOpcode() != ISD::BITCAST)
23329     return SDValue();
23330 
23331   Op0 = Op0.getOperand(0);
23332   MVT VT = Op0.getSimpleValueType();
23333   if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) &&
23334       !(Subtarget.hasDQI() && VT == MVT::v8i1) &&
23335       !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
23336     return SDValue();
23337 
23338   X86::CondCode X86Cond;
23339   if (isNullConstant(Op1)) {
23340     X86Cond = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
23341   } else if (isAllOnesConstant(Op1)) {
23342     // C flag is set for all ones.
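    // (KORTEST sets CF when the OR of the two masks is all ones, so an
    // all-ones comparison maps to COND_B / COND_AE.)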
23343     X86Cond = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE;
23344   } else
23345     return SDValue();
23346 
23347   // If the input is an AND, we can combine its operands into the KTEST.
23348   bool KTestable = false;
23349   if (Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1))
23350     KTestable = true;
23351   if (Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1))
23352     KTestable = true;
23353   if (!isNullConstant(Op1))
23354     KTestable = false;
23355   if (KTestable && Op0.getOpcode() == ISD::AND && Op0.hasOneUse()) {
23356     SDValue LHS = Op0.getOperand(0);
23357     SDValue RHS = Op0.getOperand(1);
23358     X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
23359     return DAG.getNode(X86ISD::KTEST, dl, MVT::i32, LHS, RHS);
23360   }
23361 
23362   // If the input is an OR, we can combine its operands into the KORTEST.
23363   SDValue LHS = Op0;
23364   SDValue RHS = Op0;
23365   if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) {
23366     LHS = Op0.getOperand(0);
23367     RHS = Op0.getOperand(1);
23368   }
23369 
23370   X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
23371   return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
23372 }
23373 
23374 /// Emit flags for the given setcc condition and operands. Also returns the
23375 /// corresponding X86 condition code constant in X86CC.
23376 SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
23377                                              ISD::CondCode CC, const SDLoc &dl,
23378                                              SelectionDAG &DAG,
23379                                              SDValue &X86CC) const {
23380   // Optimize to BT if possible.
23381   // Lower (X & (1 << N)) == 0 to BT(X, N).
23382   // Lower ((X >>u N) & 1) != 0 to BT(X, N).
23383   // Lower ((X >>s N) & 1) != 0 to BT(X, N).
23384   if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1) &&
23385       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
23386     if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CC))
23387       return BT;
23388   }
23389 
23390   // Try to use PTEST/PMOVMSKB for a tree of ORs equality-compared with 0.
23391   // TODO: We could do AND tree with all 1s as well by using the C flag.
23392   if (isNullConstant(Op1) && (CC == ISD::SETEQ || CC == ISD::SETNE))
23393     if (SDValue CmpZ =
23394             MatchVectorAllZeroTest(Op0, CC, dl, Subtarget, DAG, X86CC))
23395       return CmpZ;
23396 
23397   // Try to lower using KORTEST or KTEST.
23398   if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))
23399     return Test;
23400 
23401   // Look for X == 0, X == 1, X != 0, or X != 1.  We can simplify some forms of
23402   // these.
23403   if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
23404       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
23405     // If the input is a setcc, then reuse the input setcc or use a new one with
23406     // the inverted condition.
23407     if (Op0.getOpcode() == X86ISD::SETCC) {
23408       bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
23409 
23410       X86CC = Op0.getOperand(0);
23411       if (Invert) {
23412         X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
23413         CCode = X86::GetOppositeBranchCondition(CCode);
23414         X86CC = DAG.getTargetConstant(CCode, dl, MVT::i8);
23415       }
23416 
23417       return Op0.getOperand(1);
23418     }
23419   }
23420 
23421   // Try to use the carry flag from the add in place of a separate CMP for:
23422   // (seteq (add X, -1), -1). Similar for setne.
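  // (Adding -1 sets CF exactly when X is non-zero, so the carry flag replaces
  // the compare: no carry -> COND_AE (X == 0), carry -> COND_B (X != 0).)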
23423   if (isAllOnesConstant(Op1) && Op0.getOpcode() == ISD::ADD &&
23424       Op0.getOperand(1) == Op1 && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
23425     if (isProfitableToUseFlagOp(Op0)) {
23426       SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
23427 
23428       SDValue New = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(0),
23429                                 Op0.getOperand(1));
23430       DAG.ReplaceAllUsesOfValueWith(SDValue(Op0.getNode(), 0), New);
23431       X86::CondCode CCode = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
23432       X86CC = DAG.getTargetConstant(CCode, dl, MVT::i8);
23433       return SDValue(New.getNode(), 1);
23434     }
23435   }
23436 
23437   X86::CondCode CondCode =
23438       TranslateX86CC(CC, dl, /*IsFP*/ false, Op0, Op1, DAG);
23439   assert(CondCode != X86::COND_INVALID && "Unexpected condition code!");
23440 
23441   SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget);
23442   X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
23443   return EFLAGS;
23444 }
23445 
23446 SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
23447 
23448   bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
23449                   Op.getOpcode() == ISD::STRICT_FSETCCS;
23450   MVT VT = Op->getSimpleValueType(0);
23451 
23452   if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
23453 
23454   assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
23455   SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
23456   SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
23457   SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
23458   SDLoc dl(Op);
23459   ISD::CondCode CC =
23460       cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();
23461 
23462   // Handle f128 first, since one possible outcome is a normal integer
23463   // comparison which gets handled by emitFlagsForSetcc.
23464   if (Op0.getValueType() == MVT::f128) {
23465     softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1, Chain,
23466                         Op.getOpcode() == ISD::STRICT_FSETCCS);
23467 
23468     // If softenSetCCOperands returned a scalar, use it.
23469     if (!Op1.getNode()) {
23470       assert(Op0.getValueType() == Op.getValueType() &&
23471              "Unexpected setcc expansion!");
23472       if (IsStrict)
23473         return DAG.getMergeValues({Op0, Chain}, dl);
23474       return Op0;
23475     }
23476   }
23477 
23478   if (Op0.getSimpleValueType().isInteger()) {
23479     SDValue X86CC;
23480     SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC);
23481     SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
23482     return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
23483   }
23484 
23485   // Handle floating point.
23486   X86::CondCode CondCode = TranslateX86CC(CC, dl, /*IsFP*/ true, Op0, Op1, DAG);
23487   if (CondCode == X86::COND_INVALID)
23488     return SDValue();
23489 
23490   SDValue EFLAGS;
23491   if (IsStrict) {
23492     bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
23493     EFLAGS =
23494         DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP,
23495                     dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1});
23496     Chain = EFLAGS.getValue(1);
23497   } else {
23498     EFLAGS = DAG.getNode(X86ISD::FCMP, dl, MVT::i32, Op0, Op1);
23499   }
23500 
23501   SDValue X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
23502   SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
23503   return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
23504 }
23505 
23506 SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
23507   SDValue LHS = Op.getOperand(0);
23508   SDValue RHS = Op.getOperand(1);
23509   SDValue Carry = Op.getOperand(2);
23510   SDValue Cond = Op.getOperand(3);
23511   SDLoc DL(Op);
23512 
23513   assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
23514   X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
23515 
23516   // Recreate the carry if needed.
23517   EVT CarryVT = Carry.getValueType();
23518   Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
23519                       Carry, DAG.getAllOnesConstant(DL, CarryVT));
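  // (Adding all ones to the incoming carry value produces CF = 1 exactly when
  // that value is non-zero, re-materializing the carry flag for the SBB.)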
23520 
23521   SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
23522   SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
23523   return getSETCC(CC, Cmp.getValue(1), DL, DAG);
23524 }
23525 
23526 // This function returns three things: the arithmetic computation itself
23527 // (Value), an EFLAGS result (Overflow), and a condition code (Cond, returned
23528 // via the out-parameter).  The flag and the condition code define the case in
23529 // which the arithmetic computation overflows.
23530 static std::pair<SDValue, SDValue>
23531 getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) {
23532   assert(Op.getResNo() == 0 && "Unexpected result number!");
23533   SDValue Value, Overflow;
23534   SDValue LHS = Op.getOperand(0);
23535   SDValue RHS = Op.getOperand(1);
23536   unsigned BaseOp = 0;
23537   SDLoc DL(Op);
23538   switch (Op.getOpcode()) {
23539   default: llvm_unreachable("Unknown ovf instruction!");
23540   case ISD::SADDO:
23541     BaseOp = X86ISD::ADD;
23542     Cond = X86::COND_O;
23543     break;
23544   case ISD::UADDO:
23545     BaseOp = X86ISD::ADD;
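    // An add of 1 overflows exactly when the result wraps to zero, so a ZF
    // check suffices and leaves isel free to use INC, which does not write CF.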
23546     Cond = isOneConstant(RHS) ? X86::COND_E : X86::COND_B;
23547     break;
23548   case ISD::SSUBO:
23549     BaseOp = X86ISD::SUB;
23550     Cond = X86::COND_O;
23551     break;
23552   case ISD::USUBO:
23553     BaseOp = X86ISD::SUB;
23554     Cond = X86::COND_B;
23555     break;
23556   case ISD::SMULO:
23557     BaseOp = X86ISD::SMUL;
23558     Cond = X86::COND_O;
23559     break;
23560   case ISD::UMULO:
23561     BaseOp = X86ISD::UMUL;
23562     Cond = X86::COND_O;
23563     break;
23564   }
23565 
23566   if (BaseOp) {
23567     // Also sets EFLAGS.
23568     SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
23569     Value = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
23570     Overflow = Value.getValue(1);
23571   }
23572 
23573   return std::make_pair(Value, Overflow);
23574 }
23575 
23576 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
23577   // Lower the "add/sub/mul with overflow" instruction into a regular operation plus
23578   // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
23579   // looks for this combo and may remove the "setcc" instruction if the "setcc"
23580   // has only one use.
23581   SDLoc DL(Op);
23582   X86::CondCode Cond;
23583   SDValue Value, Overflow;
23584   std::tie(Value, Overflow) = getX86XALUOOp(Cond, Op, DAG);
23585 
23586   SDValue SetCC = getSETCC(Cond, Overflow, DL, DAG);
23587   assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!");
23588   return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC);
23589 }
23590 
23591 /// Return true if opcode is a X86 logical comparison.
23592 static bool isX86LogicalCmp(SDValue Op) {
23593   unsigned Opc = Op.getOpcode();
23594   if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
23595       Opc == X86ISD::FCMP)
23596     return true;
23597   if (Op.getResNo() == 1 &&
23598       (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
23599        Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
23600        Opc == X86ISD::OR || Opc == X86ISD::XOR || Opc == X86ISD::AND))
23601     return true;
23602 
23603   return false;
23604 }
23605 
23606 static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
23607   if (V.getOpcode() != ISD::TRUNCATE)
23608     return false;
23609 
23610   SDValue VOp0 = V.getOperand(0);
23611   unsigned InBits = VOp0.getValueSizeInBits();
23612   unsigned Bits = V.getValueSizeInBits();
23613   return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
23614 }
23615 
23616 SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
23617   bool AddTest = true;
23618   SDValue Cond  = Op.getOperand(0);
23619   SDValue Op1 = Op.getOperand(1);
23620   SDValue Op2 = Op.getOperand(2);
23621   SDLoc DL(Op);
23622   MVT VT = Op1.getSimpleValueType();
23623   SDValue CC;
23624 
23625   // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
23626   // are available, or into a VBLENDV if AVX is available.
23627   // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
23628   if (Cond.getOpcode() == ISD::SETCC && isScalarFPTypeInSSEReg(VT) &&
23629       VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
23630     SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
23631     bool IsAlwaysSignaling;
23632     unsigned SSECC =
23633         translateX86FSETCC(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
23634                            CondOp0, CondOp1, IsAlwaysSignaling);
23635 
23636     if (Subtarget.hasAVX512()) {
23637       SDValue Cmp =
23638           DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1,
23639                       DAG.getTargetConstant(SSECC, DL, MVT::i8));
23640       assert(!VT.isVector() && "Not a scalar type?");
23641       return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
23642     }
23643 
23644     if (SSECC < 8 || Subtarget.hasAVX()) {
23645       SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
23646                                 DAG.getTargetConstant(SSECC, DL, MVT::i8));
23647 
23648       // If we have AVX, we can use a variable vector select (VBLENDV) instead
23649       // of 3 logic instructions for size savings and potentially speed.
23650       // Unfortunately, there is no scalar form of VBLENDV.
23651 
23652       // If either operand is a +0.0 constant, don't try this. We can expect to
23653       // optimize away at least one of the logic instructions later in that
23654       // case, so that sequence would be faster than a variable blend.
23655 
23656       // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
23657       // uses XMM0 as the selection register. That may need just as many
23658       // instructions as the AND/ANDN/OR sequence due to register moves, so
23659       // don't bother.
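      // Illustrative shape of the AVX path for f32 (all of the conversions are
      // expected to fold away):
      //   vcmp = bitcast v4i32 (scalar_to_vector v4f32 Cmp)
      //   vsel = vselect vcmp, (scalar_to_vector Op1), (scalar_to_vector Op2)
      //   res  = extract_vector_elt vsel, 0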
23660       if (Subtarget.hasAVX() && !isNullFPConstant(Op1) &&
23661           !isNullFPConstant(Op2)) {
23662         // Convert to vectors, do a VSELECT, and convert back to scalar.
23663         // All of the conversions should be optimized away.
23664         MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
23665         SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
23666         SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
23667         SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
23668 
23669         MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
23670         VCmp = DAG.getBitcast(VCmpVT, VCmp);
23671 
23672         SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
23673 
23674         return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
23675                            VSel, DAG.getIntPtrConstant(0, DL));
23676       }
23677       SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
23678       SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
23679       return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
23680     }
23681   }
23682 
23683   // AVX512 fallback is to lower selects of scalar floats to masked moves.
23684   if (isScalarFPTypeInSSEReg(VT) && Subtarget.hasAVX512()) {
23685     SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
23686     return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
23687   }
23688 
23689   if (Cond.getOpcode() == ISD::SETCC) {
23690     if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
23691       Cond = NewCond;
23692       // If the condition was updated, it's possible that the operands of the
23693       // select were also updated (for example, EmitTest has a RAUW). Refresh
23694       // the local references to the select operands in case they got stale.
23695       Op1 = Op.getOperand(1);
23696       Op2 = Op.getOperand(2);
23697     }
23698   }
23699 
23700   // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
23701   // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
23702   // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
23703   // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
23704   // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
23705   // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
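  // Illustrative x86 sequence (sketch) for (select (x == 0), -1, y) with i32 x:
  //   sub $1, x      // sets CF iff x was 0
  //   sbb r, r       // r = CF ? -1 : 0
  //   or  y, r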
23706   if (Cond.getOpcode() == X86ISD::SETCC &&
23707       Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
23708       isNullConstant(Cond.getOperand(1).getOperand(1))) {
23709     SDValue Cmp = Cond.getOperand(1);
23710     SDValue CmpOp0 = Cmp.getOperand(0);
23711     unsigned CondCode = Cond.getConstantOperandVal(0);
23712 
23713     // Special handling for __builtin_ffs(X) - 1 pattern which looks like
23714     // (select (seteq X, 0), -1, (cttz_zero_undef X)). Disable the special
23715     // handling to keep the CMP with 0. This should be removed by
23716     // optimizeCompareInst by using the flags from the BSR/TZCNT used for the
23717     // cttz_zero_undef.
23718     auto MatchFFSMinus1 = [&](SDValue Op1, SDValue Op2) {
23719       return (Op1.getOpcode() == ISD::CTTZ_ZERO_UNDEF && Op1.hasOneUse() &&
23720               Op1.getOperand(0) == CmpOp0 && isAllOnesConstant(Op2));
23721     };
23722     if (Subtarget.hasCMov() && (VT == MVT::i32 || VT == MVT::i64) &&
23723         ((CondCode == X86::COND_NE && MatchFFSMinus1(Op1, Op2)) ||
23724          (CondCode == X86::COND_E && MatchFFSMinus1(Op2, Op1)))) {
23725       // Keep Cmp.
23726     } else if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
23727         (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
23728       SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
23729 
23730       SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
23731       SDVTList CmpVTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
23732 
23733       // Apply further optimizations for special cases
23734       // (select (x != 0), -1, 0) -> neg & sbb
23735       // (select (x == 0), 0, -1) -> neg & sbb
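      // Illustrative sequence (sketch) for (select (x != 0), -1, 0) with i32 x:
      //   neg x        // CF = (x != 0)
      //   sbb r, r     // r = CF ? -1 : 0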
23736       if (isNullConstant(Y) &&
23737           (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
23738         SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
23739         SDValue Neg = DAG.getNode(X86ISD::SUB, DL, CmpVTs, Zero, CmpOp0);
23740         Zero = DAG.getConstant(0, DL, Op.getValueType());
23741         return DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Neg.getValue(1));
23742       }
23743 
23744       Cmp = DAG.getNode(X86ISD::SUB, DL, CmpVTs,
23745                         CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
23746 
23747       SDValue Zero = DAG.getConstant(0, DL, Op.getValueType());
23748       SDValue Res =   // Res = 0 or -1.
23749         DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Cmp.getValue(1));
23750 
23751       if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
23752         Res = DAG.getNOT(DL, Res, Res.getValueType());
23753 
23754       return DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
23755     } else if (!Subtarget.hasCMov() && CondCode == X86::COND_E &&
23756                Cmp.getOperand(0).getOpcode() == ISD::AND &&
23757                isOneConstant(Cmp.getOperand(0).getOperand(1))) {
23758       SDValue Src1, Src2;
23759       // Returns true if Op2 is an XOR or OR operator and one of its
23760       // operands is equal to Op1, i.e. the pattern is
23761       // (a, a op b) or (b, a op b).
23762       auto isOrXorPattern = [&]() {
23763         if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
23764             (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
23765           Src1 =
23766               Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
23767           Src2 = Op1;
23768           return true;
23769         }
23770         return false;
23771       };
23772 
23773       if (isOrXorPattern()) {
23774         SDValue Neg;
23775         unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
23776         // We need a mask of all zeros or all ones with the same size as
23777         // the other operands.
23778         if (CmpSz > VT.getSizeInBits())
23779           Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
23780         else if (CmpSz < VT.getSizeInBits())
23781           Neg = DAG.getNode(ISD::AND, DL, VT,
23782               DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
23783               DAG.getConstant(1, DL, VT));
23784         else
23785           Neg = CmpOp0;
23786         SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
23787                                    Neg); // -(and (x, 0x1))
23788         SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
23789         return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2);  // And Op y
23790       }
23791     }
23792   }
23793 
23794   // Look past (and (setcc_carry (cmp ...)), 1).
23795   if (Cond.getOpcode() == ISD::AND &&
23796       Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
23797       isOneConstant(Cond.getOperand(1)))
23798     Cond = Cond.getOperand(0);
23799 
23800   // If condition flag is set by a X86ISD::CMP, then use it as the condition
23801   // setting operand in place of the X86ISD::SETCC.
23802   unsigned CondOpcode = Cond.getOpcode();
23803   if (CondOpcode == X86ISD::SETCC ||
23804       CondOpcode == X86ISD::SETCC_CARRY) {
23805     CC = Cond.getOperand(0);
23806 
23807     SDValue Cmp = Cond.getOperand(1);
23808     bool IllegalFPCMov = false;
23809     if (VT.isFloatingPoint() && !VT.isVector() &&
23810         !isScalarFPTypeInSSEReg(VT) && Subtarget.hasCMov())  // FPStack?
23811       IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
23812 
23813     if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
23814         Cmp.getOpcode() == X86ISD::BT) { // FIXME
23815       Cond = Cmp;
23816       AddTest = false;
23817     }
23818   } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
23819              CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
23820              CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
23821     SDValue Value;
23822     X86::CondCode X86Cond;
23823     std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
23824 
23825     CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8);
23826     AddTest = false;
23827   }
23828 
23829   if (AddTest) {
23830     // Look past the truncate if the high bits are known zero.
23831     if (isTruncWithZeroHighBitsInput(Cond, DAG))
23832       Cond = Cond.getOperand(0);
23833 
23834     // We know the result of AND is compared against zero. Try to match
23835     // it to BT.
23836     if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
23837       SDValue BTCC;
23838       if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, BTCC)) {
23839         CC = BTCC;
23840         Cond = BT;
23841         AddTest = false;
23842       }
23843     }
23844   }
23845 
23846   if (AddTest) {
23847     CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
23848     Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
23849   }
23850 
23851   // a <  b ? -1 :  0 -> RES = ~setcc_carry
23852   // a <  b ?  0 : -1 -> RES = setcc_carry
23853   // a >= b ? -1 :  0 -> RES = setcc_carry
23854   // a >= b ?  0 : -1 -> RES = ~setcc_carry
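  // Illustrative sequence (sketch) for unsigned (a < b ? -1 : 0) with i32 ops:
  //   cmp/sub computing a - b   // sets CF iff a < b (unsigned)
  //   sbb r, r                  // SETCC_CARRY: r = CF ? -1 : 0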
23855   if (Cond.getOpcode() == X86ISD::SUB) {
23856     unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
23857 
23858     if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
23859         (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
23860         (isNullConstant(Op1) || isNullConstant(Op2))) {
23861       SDValue Res =
23862           DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
23863                       DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cond);
23864       if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
23865         return DAG.getNOT(DL, Res, Res.getValueType());
23866       return Res;
23867     }
23868   }
23869 
23870   // X86 doesn't have an i8 cmov. If both operands are the result of a truncate,
23871   // widen the cmov and push the truncate through. This avoids introducing a new
23872   // branch during isel and doesn't add any extensions.
23873   if (Op.getValueType() == MVT::i8 &&
23874       Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
23875     SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
23876     if (T1.getValueType() == T2.getValueType() &&
23877         // Exclude CopyFromReg to avoid partial register stalls.
23878         T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
23879       SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
23880                                  CC, Cond);
23881       return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
23882     }
23883   }
23884 
23885   // Or finally, promote i8 cmovs if we have CMOV,
23886   //                 or i16 cmovs if it won't prevent folding a load.
23887   // FIXME: we should not limit promotion of i8 case to only when the CMOV is
23888   //        legal, but EmitLoweredSelect() cannot deal with these extensions
23889   //        being inserted between two CMOVs (this applies to the i16 case too).
23890   //        https://bugs.llvm.org/show_bug.cgi?id=40974
23891   if ((Op.getValueType() == MVT::i8 && Subtarget.hasCMov()) ||
23892       (Op.getValueType() == MVT::i16 && !MayFoldLoad(Op1) &&
23893        !MayFoldLoad(Op2))) {
23894     Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
23895     Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
23896     SDValue Ops[] = { Op2, Op1, CC, Cond };
23897     SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
23898     return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
23899   }
23900 
23901   // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
23902   // condition is true.
23903   SDValue Ops[] = { Op2, Op1, CC, Cond };
23904   return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops);
23905 }
23906 
23907 static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
23908                                      const X86Subtarget &Subtarget,
23909                                      SelectionDAG &DAG) {
23910   MVT VT = Op->getSimpleValueType(0);
23911   SDValue In = Op->getOperand(0);
23912   MVT InVT = In.getSimpleValueType();
23913   assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
23914   MVT VTElt = VT.getVectorElementType();
23915   SDLoc dl(Op);
23916 
23917   unsigned NumElts = VT.getVectorNumElements();
23918 
23919   // Extend VT if the scalar type is i8/i16 and BWI is not supported.
23920   MVT ExtVT = VT;
23921   if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
23922     // If v16i32 is to be avoided, we'll need to split and concatenate.
23923     if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
23924       return SplitAndExtendv16i1(Op.getOpcode(), VT, In, dl, DAG);
23925 
23926     ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
23927   }
23928 
23929   // Widen to 512-bits if VLX is not supported.
23930   MVT WideVT = ExtVT;
23931   if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
23932     NumElts *= 512 / ExtVT.getSizeInBits();
23933     InVT = MVT::getVectorVT(MVT::i1, NumElts);
23934     In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT),
23935                      In, DAG.getIntPtrConstant(0, dl));
23936     WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
23937   }
23938 
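  // Illustrative walk-through (sketch) without BWI/VLX: v8i1 -> v8i16 extends
  // in v8i32 (ExtVT), the operands are widened to v16i1 / v16i32, the result
  // is truncated to v16i16, and the low v8i16 subvector is extracted below.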
23939   SDValue V;
23940   MVT WideEltVT = WideVT.getVectorElementType();
23941   if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
23942       (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
23943     V = DAG.getNode(Op.getOpcode(), dl, WideVT, In);
23944   } else {
23945     SDValue NegOne = DAG.getConstant(-1, dl, WideVT);
23946     SDValue Zero = DAG.getConstant(0, dl, WideVT);
23947     V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
23948   }
23949 
23950   // Truncate if we had to extend i16/i8 above.
23951   if (VT != ExtVT) {
23952     WideVT = MVT::getVectorVT(VTElt, NumElts);
23953     V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
23954   }
23955 
23956   // Extract back to 128/256-bit if we widened.
23957   if (WideVT != VT)
23958     V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
23959                     DAG.getIntPtrConstant(0, dl));
23960 
23961   return V;
23962 }
23963 
23964 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
23965                                SelectionDAG &DAG) {
23966   SDValue In = Op->getOperand(0);
23967   MVT InVT = In.getSimpleValueType();
23968 
23969   if (InVT.getVectorElementType() == MVT::i1)
23970     return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
23971 
23972   assert(Subtarget.hasAVX() && "Expected AVX support");
23973   return LowerAVXExtend(Op, DAG, Subtarget);
23974 }
23975 
23976 // Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
23977 // For sign extend this needs to handle all vector sizes and SSE4.1 and
23978 // non-SSE4.1 targets. For zero extend this should only handle inputs of
23979 // MVT::v64i8 when BWI is not supported, but AVX512 is.
23980 static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
23981                                         const X86Subtarget &Subtarget,
23982                                         SelectionDAG &DAG) {
23983   SDValue In = Op->getOperand(0);
23984   MVT VT = Op->getSimpleValueType(0);
23985   MVT InVT = In.getSimpleValueType();
23986 
23987   MVT SVT = VT.getVectorElementType();
23988   MVT InSVT = InVT.getVectorElementType();
23989   assert(SVT.getFixedSizeInBits() > InSVT.getFixedSizeInBits());
23990 
23991   if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
23992     return SDValue();
23993   if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
23994     return SDValue();
23995   if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
23996       !(VT.is256BitVector() && Subtarget.hasAVX()) &&
23997       !(VT.is512BitVector() && Subtarget.hasAVX512()))
23998     return SDValue();
23999 
24000   SDLoc dl(Op);
24001   unsigned Opc = Op.getOpcode();
24002   unsigned NumElts = VT.getVectorNumElements();
24003 
24004   // For 256-bit vectors, we only need the lower (128-bit) half of the input.
24005   // For 512-bit vectors, we need 128-bits or 256-bits.
24006   if (InVT.getSizeInBits() > 128) {
24007     // Input needs to be at least the same number of elements as output, and
24008     // at least 128-bits.
24009     int InSize = InSVT.getSizeInBits() * NumElts;
24010     In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
24011     InVT = In.getSimpleValueType();
24012   }
24013 
24014   // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit
24015   // results, so those are legal and shouldn't occur here. AVX2/AVX512 pmovsx*
24016   // instructions still need to be handled here for 256/512-bit results.
24017   if (Subtarget.hasInt256()) {
24018     assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
24019 
24020     if (InVT.getVectorNumElements() != NumElts)
24021       return DAG.getNode(Op.getOpcode(), dl, VT, In);
24022 
24023     // FIXME: Apparently we create inreg operations that could be regular
24024     // extends.
24025     unsigned ExtOpc =
24026         Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND
24027                                              : ISD::ZERO_EXTEND;
24028     return DAG.getNode(ExtOpc, dl, VT, In);
24029   }
24030 
24031   // pre-AVX2 256-bit extensions need to be split into 128-bit instructions.
24032   if (Subtarget.hasAVX()) {
24033     assert(VT.is256BitVector() && "256-bit vector expected");
24034     MVT HalfVT = VT.getHalfNumVectorElementsVT();
24035     int HalfNumElts = HalfVT.getVectorNumElements();
24036 
24037     unsigned NumSrcElts = InVT.getVectorNumElements();
24038     SmallVector<int, 16> HiMask(NumSrcElts, SM_SentinelUndef);
24039     for (int i = 0; i != HalfNumElts; ++i)
24040       HiMask[i] = HalfNumElts + i;
24041 
24042     SDValue Lo = DAG.getNode(Opc, dl, HalfVT, In);
24043     SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, DAG.getUNDEF(InVT), HiMask);
24044     Hi = DAG.getNode(Opc, dl, HalfVT, Hi);
24045     return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
24046   }
24047 
24048   // We should only get here for sign extend.
24049   assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!");
24050   assert(VT.is128BitVector() && InVT.is128BitVector() && "Unexpected VTs");
24051 
24052   // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
24053   SDValue Curr = In;
24054   SDValue SignExt = Curr;
24055 
24056   // As SRAI is only available on i16/i32 types, we expand only up to i32
24057   // and handle i64 separately.
24058   if (InVT != MVT::v4i32) {
24059     MVT DestVT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
24060 
24061     unsigned DestWidth = DestVT.getScalarSizeInBits();
24062     unsigned Scale = DestWidth / InSVT.getSizeInBits();
24063 
24064     unsigned InNumElts = InVT.getVectorNumElements();
24065     unsigned DestElts = DestVT.getVectorNumElements();
24066 
24067     // Build a shuffle mask that takes each input element and places it in the
24068     // MSBs of the new element size.
24069     SmallVector<int, 16> Mask(InNumElts, SM_SentinelUndef);
24070     for (unsigned i = 0; i != DestElts; ++i)
24071       Mask[i * Scale + (Scale - 1)] = i;
24072 
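    // e.g. (illustrative) v16i8 -> v4i32 sign extension: Scale = 4 and the
    // mask is {-1,-1,-1,0, -1,-1,-1,1, -1,-1,-1,2, -1,-1,-1,3}, placing each
    // byte in the MSBs of an i32 lane; the VSRAI by 24 below completes it.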
24073     Curr = DAG.getVectorShuffle(InVT, dl, In, In, Mask);
24074     Curr = DAG.getBitcast(DestVT, Curr);
24075 
24076     unsigned SignExtShift = DestWidth - InSVT.getSizeInBits();
24077     SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr,
24078                           DAG.getTargetConstant(SignExtShift, dl, MVT::i8));
24079   }
24080 
24081   if (VT == MVT::v2i64) {
24082     assert(Curr.getValueType() == MVT::v4i32 && "Unexpected input VT");
24083     SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
24084     SDValue Sign = DAG.getSetCC(dl, MVT::v4i32, Zero, Curr, ISD::SETGT);
24085     SignExt = DAG.getVectorShuffle(MVT::v4i32, dl, SignExt, Sign, {0, 4, 1, 5});
24086     SignExt = DAG.getBitcast(VT, SignExt);
24087   }
24088 
24089   return SignExt;
24090 }
24091 
24092 static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
24093                                 SelectionDAG &DAG) {
24094   MVT VT = Op->getSimpleValueType(0);
24095   SDValue In = Op->getOperand(0);
24096   MVT InVT = In.getSimpleValueType();
24097   SDLoc dl(Op);
24098 
24099   if (InVT.getVectorElementType() == MVT::i1)
24100     return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
24101 
24102   assert(VT.isVector() && InVT.isVector() && "Expected vector type");
24103   assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
24104          "Expected same number of elements");
24105   assert((VT.getVectorElementType() == MVT::i16 ||
24106           VT.getVectorElementType() == MVT::i32 ||
24107           VT.getVectorElementType() == MVT::i64) &&
24108          "Unexpected element type");
24109   assert((InVT.getVectorElementType() == MVT::i8 ||
24110           InVT.getVectorElementType() == MVT::i16 ||
24111           InVT.getVectorElementType() == MVT::i32) &&
24112          "Unexpected element type");
24113 
24114   if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
24115     assert(InVT == MVT::v32i8 && "Unexpected VT!");
24116     return splitVectorIntUnary(Op, DAG);
24117   }
24118 
24119   if (Subtarget.hasInt256())
24120     return Op;
24121 
24122   // Optimize vectors in AVX mode:
24123   // sign extend  v8i16 to v8i32 and
24124   //              v4i32 to v4i64.
24125   //
24126   // Divide the input vector into two parts;
24127   // for v4i32 the high shuffle mask will be {2, 3, -1, -1}.
24128   // Use the vpmovsx instruction to extend v4i32 -> v2i64 and v8i16 -> v4i32,
24129   // then concat the vectors back to the original VT.
24130   MVT HalfVT = VT.getHalfNumVectorElementsVT();
24131   SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In);
24132 
24133   unsigned NumElems = InVT.getVectorNumElements();
24134   SmallVector<int,8> ShufMask(NumElems, -1);
24135   for (unsigned i = 0; i != NumElems/2; ++i)
24136     ShufMask[i] = i + NumElems/2;
24137 
24138   SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
24139   OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi);
24140 
24141   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
24142 }
24143 
24144 /// Change a vector store into a pair of half-size vector stores.
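/// e.g. (illustrative) a 256-bit store to %p becomes a 128-bit store of the
/// low half to %p and a 128-bit store of the high half to %p + 16, joined by
/// a TokenFactor.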
24145 static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
24146   SDValue StoredVal = Store->getValue();
24147   assert((StoredVal.getValueType().is256BitVector() ||
24148           StoredVal.getValueType().is512BitVector()) &&
24149          "Expecting 256/512-bit op");
24150 
24151   // Splitting volatile memory ops is not allowed unless the operation was not
24152   // legal to begin with. Assume the input store is legal (this transform is
24153   // only used for targets with AVX). Note: It is possible that we have an
24154   // illegal type like v2i128, and so we could allow splitting a volatile store
24155   // in that case if that is important.
24156   if (!Store->isSimple())
24157     return SDValue();
24158 
24159   SDLoc DL(Store);
24160   SDValue Value0, Value1;
24161   std::tie(Value0, Value1) = splitVector(StoredVal, DAG, DL);
24162   unsigned HalfOffset = Value0.getValueType().getStoreSize();
24163   SDValue Ptr0 = Store->getBasePtr();
24164   SDValue Ptr1 =
24165       DAG.getMemBasePlusOffset(Ptr0, TypeSize::Fixed(HalfOffset), DL);
24166   SDValue Ch0 =
24167       DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),
24168                    Store->getOriginalAlign(),
24169                    Store->getMemOperand()->getFlags());
24170   SDValue Ch1 = DAG.getStore(Store->getChain(), DL, Value1, Ptr1,
24171                              Store->getPointerInfo().getWithOffset(HalfOffset),
24172                              Store->getOriginalAlign(),
24173                              Store->getMemOperand()->getFlags());
24174   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1);
24175 }
24176 
24177 /// Scalarize a vector store, bitcasting to TargetVT to determine the scalar
24178 /// type.
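/// e.g. (illustrative) with StoreVT = v4i32 this emits four i32 element
/// stores at offsets 0, 4, 8 and 12 from the base pointer, joined by a
/// TokenFactor.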
24179 static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
24180                                     SelectionDAG &DAG) {
24181   SDValue StoredVal = Store->getValue();
24182   assert(StoreVT.is128BitVector() &&
24183          StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op");
24184   StoredVal = DAG.getBitcast(StoreVT, StoredVal);
24185 
24186   // Splitting volatile memory ops is not allowed unless the operation was not
24187   // legal to begin with. We are assuming the input op is legal (this transform
24188   // is only used for targets with AVX).
24189   if (!Store->isSimple())
24190     return SDValue();
24191 
24192   MVT StoreSVT = StoreVT.getScalarType();
24193   unsigned NumElems = StoreVT.getVectorNumElements();
24194   unsigned ScalarSize = StoreSVT.getStoreSize();
24195 
24196   SDLoc DL(Store);
24197   SmallVector<SDValue, 4> Stores;
24198   for (unsigned i = 0; i != NumElems; ++i) {
24199     unsigned Offset = i * ScalarSize;
24200     SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(),
24201                                            TypeSize::Fixed(Offset), DL);
24202     SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal,
24203                               DAG.getIntPtrConstant(i, DL));
24204     SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr,
24205                               Store->getPointerInfo().getWithOffset(Offset),
24206                               Store->getOriginalAlign(),
24207                               Store->getMemOperand()->getFlags());
24208     Stores.push_back(Ch);
24209   }
24210   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
24211 }
24212 
24213 static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
24214                           SelectionDAG &DAG) {
24215   StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
24216   SDLoc dl(St);
24217   SDValue StoredVal = St->getValue();
24218 
24219   // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
24220   if (StoredVal.getValueType().isVector() &&
24221       StoredVal.getValueType().getVectorElementType() == MVT::i1) {
24222     unsigned NumElts = StoredVal.getValueType().getVectorNumElements();
24223     assert(NumElts <= 8 && "Unexpected VT");
24224     assert(!St->isTruncatingStore() && "Expected non-truncating store");
24225     assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
24226            "Expected AVX512F without AVX512DQI");
24227 
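    // Illustrative flow (sketch) for a v4i1 store: widen to v16i1, bitcast to
    // i16, truncate to i8, then zero-extend-in-reg from i4 so the unused high
    // bits of the byte are stored as zeroes.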
24228     // We must pad with zeros to ensure we store zeroes to any unused bits.
24229     StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
24230                             DAG.getUNDEF(MVT::v16i1), StoredVal,
24231                             DAG.getIntPtrConstant(0, dl));
24232     StoredVal = DAG.getBitcast(MVT::i16, StoredVal);
24233     StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal);
24234     // Make sure we store zeros in the extra bits.
24235     if (NumElts < 8)
24236       StoredVal = DAG.getZeroExtendInReg(
24237           StoredVal, dl, EVT::getIntegerVT(*DAG.getContext(), NumElts));
24238 
24239     return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
24240                         St->getPointerInfo(), St->getOriginalAlign(),
24241                         St->getMemOperand()->getFlags());
24242   }
24243 
24244   if (St->isTruncatingStore())
24245     return SDValue();
24246 
24247   // If this is a 256-bit store of concatenated ops, we are better off splitting
24248   // that store into two 128-bit stores. This avoids spurious use of 256-bit ops
24249   // and lets each half execute independently. Some cores would split the op into
24250   // halves anyway, so the concat (vinsertf128) is purely an extra op.
24251   MVT StoreVT = StoredVal.getSimpleValueType();
24252   if (StoreVT.is256BitVector() ||
24253       ((StoreVT == MVT::v32i16 || StoreVT == MVT::v64i8) &&
24254        !Subtarget.hasBWI())) {
24255     SmallVector<SDValue, 4> CatOps;
24256     if (StoredVal.hasOneUse() && collectConcatOps(StoredVal.getNode(), CatOps))
24257       return splitVectorStore(St, DAG);
24258     return SDValue();
24259   }
24260 
24261   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24262   assert(StoreVT.isVector() && StoreVT.getSizeInBits() == 64 &&
24263          "Unexpected VT");
24264   assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) ==
24265              TargetLowering::TypeWidenVector && "Unexpected type action!");
24266 
24267   EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT);
24268   StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
24269                           DAG.getUNDEF(StoreVT));
24270 
24271   if (Subtarget.hasSSE2()) {
24272     // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
24273     // and store it.
24274     MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
24275     MVT CastVT = MVT::getVectorVT(StVT, 2);
24276     StoredVal = DAG.getBitcast(CastVT, StoredVal);
24277     StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
24278                             DAG.getIntPtrConstant(0, dl));
24279 
24280     return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
24281                         St->getPointerInfo(), St->getOriginalAlign(),
24282                         St->getMemOperand()->getFlags());
24283   }
24284   assert(Subtarget.hasSSE1() && "Expected SSE");
24285   SDVTList Tys = DAG.getVTList(MVT::Other);
24286   SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()};
24287   return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64,
24288                                  St->getMemOperand());
24289 }
24290 
24291 // Lower vector extended loads using a shuffle. If SSSE3 is not available we
24292 // may emit an illegal shuffle but the expansion is still better than scalar
24293 // code. We generate sext/sext_invec for SEXTLOADs if it's available, otherwise
24294 // we'll emit a shuffle and an arithmetic shift.
24295 // FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
24296 // TODO: It is possible to support ZExt by zeroing the undef values during
24297 // the shuffle phase or after the shuffle.
24298 static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
24299                          SelectionDAG &DAG) {
24300   MVT RegVT = Op.getSimpleValueType();
24301   assert(RegVT.isVector() && "We only custom lower vector loads.");
24302   assert(RegVT.isInteger() &&
24303          "We only custom lower integer vector loads.");
24304 
24305   LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
24306   SDLoc dl(Ld);
24307 
24308   // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
24309   if (RegVT.getVectorElementType() == MVT::i1) {
24310     assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load");
24311     assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
24312     assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
24313            "Expected AVX512F without AVX512DQI");
24314 
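    // Illustrative flow (sketch) for a v8i1 load: load an i8, any-extend it to
    // i16, bitcast to v16i1 and extract the low v8i1 subvector.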
24315     SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
24316                                 Ld->getPointerInfo(), Ld->getOriginalAlign(),
24317                                 Ld->getMemOperand()->getFlags());
24318 
24319     // Replace chain users with the new chain.
24320     assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
24321 
24322     SDValue Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, NewLd);
24323     Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
24324                       DAG.getBitcast(MVT::v16i1, Val),
24325                       DAG.getIntPtrConstant(0, dl));
24326     return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl);
24327   }
24328 
24329   return SDValue();
24330 }
24331 
24332 /// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
24333 /// each of which has no other use apart from the AND / OR.
24334 static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
24335   Opc = Op.getOpcode();
24336   if (Opc != ISD::OR && Opc != ISD::AND)
24337     return false;
24338   return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
24339           Op.getOperand(0).hasOneUse() &&
24340           Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
24341           Op.getOperand(1).hasOneUse());
24342 }
24343 
24344 SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
24345   SDValue Chain = Op.getOperand(0);
24346   SDValue Cond  = Op.getOperand(1);
24347   SDValue Dest  = Op.getOperand(2);
24348   SDLoc dl(Op);
24349 
24350   if (Cond.getOpcode() == ISD::SETCC &&
24351       Cond.getOperand(0).getValueType() != MVT::f128) {
24352     SDValue LHS = Cond.getOperand(0);
24353     SDValue RHS = Cond.getOperand(1);
24354     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
24355 
24356     // Special case for
24357     // setcc([su]{add,sub,mul}o == 0)
24358     // setcc([su]{add,sub,mul}o != 1)
24359     if (ISD::isOverflowIntrOpRes(LHS) &&
24360         (CC == ISD::SETEQ || CC == ISD::SETNE) &&
24361         (isNullConstant(RHS) || isOneConstant(RHS))) {
24362       SDValue Value, Overflow;
24363       X86::CondCode X86Cond;
24364       std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, LHS.getValue(0), DAG);
24365 
24366       if ((CC == ISD::SETEQ) == isNullConstant(RHS))
24367         X86Cond = X86::GetOppositeBranchCondition(X86Cond);
24368 
24369       SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24370       return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24371                          Overflow);
24372     }
24373 
24374     if (LHS.getSimpleValueType().isInteger()) {
24375       SDValue CCVal;
24376       SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, CC, SDLoc(Cond), DAG, CCVal);
24377       return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24378                          EFLAGS);
24379     }
24380 
24381     if (CC == ISD::SETOEQ) {
24382       // For FCMP_OEQ, we can emit
24383       // two branches instead of an explicit AND instruction with a
24384       // separate test. However, we only do this if this block doesn't
24385       // have a fall-through edge, because this requires an explicit
24386       // jmp when the condition is false.
24387       if (Op.getNode()->hasOneUse()) {
24388         SDNode *User = *Op.getNode()->use_begin();
24389         // Look for an unconditional branch following this conditional branch.
24390         // We need this because we need to reverse the successors in order
24391         // to implement FCMP_OEQ.
24392         if (User->getOpcode() == ISD::BR) {
24393           SDValue FalseBB = User->getOperand(1);
24394           SDNode *NewBR =
24395             DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
24396           assert(NewBR == User);
24397           (void)NewBR;
24398           Dest = FalseBB;
24399 
24400           SDValue Cmp =
24401               DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
24402           SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
24403           Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest,
24404                               CCVal, Cmp);
24405           CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
24406           return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24407                              Cmp);
24408         }
24409       }
24410     } else if (CC == ISD::SETUNE) {
24411       // For FCMP_UNE, we can emit
24412       // two branches instead of an explicit OR instruction with a
24413       // separate test.
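      // Illustrative result (sketch): branch to the destination on JNE (not
      // equal) or on JP (unordered), which together implement FCMP_UNE.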
24414       SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
24415       SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
24416       Chain =
24417           DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, Cmp);
24418       CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
24419       return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24420                          Cmp);
24421     } else {
24422       X86::CondCode X86Cond =
24423           TranslateX86CC(CC, dl, /*IsFP*/ true, LHS, RHS, DAG);
24424       SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
24425       SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24426       return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24427                          Cmp);
24428     }
24429   }
24430 
24431   if (ISD::isOverflowIntrOpRes(Cond)) {
24432     SDValue Value, Overflow;
24433     X86::CondCode X86Cond;
24434     std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
24435 
24436     SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24437     return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24438                        Overflow);
24439   }
24440 
24441   // Look past the truncate if the high bits are known zero.
24442   if (isTruncWithZeroHighBitsInput(Cond, DAG))
24443     Cond = Cond.getOperand(0);
24444 
24445   EVT CondVT = Cond.getValueType();
24446 
24447   // Add an AND with 1 if we don't already have one.
24448   if (!(Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1))))
24449     Cond =
24450         DAG.getNode(ISD::AND, dl, CondVT, Cond, DAG.getConstant(1, dl, CondVT));
24451 
24452   SDValue LHS = Cond;
24453   SDValue RHS = DAG.getConstant(0, dl, CondVT);
24454 
24455   SDValue CCVal;
24456   SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, ISD::SETNE, dl, DAG, CCVal);
24457   return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24458                      EFLAGS);
24459 }
24460 
24461 // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
24462 // Calls to _alloca are needed to probe the stack when allocating more than 4k
24463 // bytes in one go. Touching the stack at 4K increments is necessary to ensure
24464 // that the guard pages used by the OS virtual memory manager are allocated in
24465 // correct sequence.
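// Illustrative shape of the generic (non-probing) path below: copy the stack
// pointer, subtract the allocation size, optionally AND the result down to the
// requested alignment, then copy it back to the stack pointer register.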
24466 SDValue
24467 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
24468                                            SelectionDAG &DAG) const {
24469   MachineFunction &MF = DAG.getMachineFunction();
24470   bool SplitStack = MF.shouldSplitStack();
24471   bool EmitStackProbeCall = hasStackProbeSymbol(MF);
24472   bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
24473                SplitStack || EmitStackProbeCall;
24474   SDLoc dl(Op);
24475 
24476   // Get the inputs.
24477   SDNode *Node = Op.getNode();
24478   SDValue Chain = Op.getOperand(0);
24479   SDValue Size  = Op.getOperand(1);
24480   MaybeAlign Alignment(Op.getConstantOperandVal(2));
24481   EVT VT = Node->getValueType(0);
24482 
24483   // Chain the dynamic stack allocation so that it doesn't modify the stack
24484   // pointer when other instructions are using the stack.
24485   Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
24486 
24487   bool Is64Bit = Subtarget.is64Bit();
24488   MVT SPTy = getPointerTy(DAG.getDataLayout());
24489 
24490   SDValue Result;
24491   if (!Lower) {
24492     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24493     Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
24494     assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
24495                     " not tell us which reg is the stack pointer!");
24496 
24497     const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
24498     const Align StackAlign = TFI.getStackAlign();
24499     if (hasInlineStackProbe(MF)) {
24500       MachineRegisterInfo &MRI = MF.getRegInfo();
24501 
24502       const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
24503       Register Vreg = MRI.createVirtualRegister(AddrRegClass);
24504       Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
24505       Result = DAG.getNode(X86ISD::PROBED_ALLOCA, dl, SPTy, Chain,
24506                            DAG.getRegister(Vreg, SPTy));
24507     } else {
24508       SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
24509       Chain = SP.getValue(1);
24510       Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
24511     }
24512     if (Alignment && *Alignment > StackAlign)
24513       Result =
24514           DAG.getNode(ISD::AND, dl, VT, Result,
24515                       DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
24516     Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
24517   } else if (SplitStack) {
24518     MachineRegisterInfo &MRI = MF.getRegInfo();
24519 
24520     if (Is64Bit) {
24521       // The 64-bit implementation of segmented stacks needs to clobber both r10
24522       // and r11. This makes it impossible to use it along with nested parameters.
24523       const Function &F = MF.getFunction();
24524       for (const auto &A : F.args()) {
24525         if (A.hasNestAttr())
24526           report_fatal_error("Cannot use segmented stacks with functions that "
24527                              "have nested arguments.");
24528       }
24529     }
24530 
24531     const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
24532     Register Vreg = MRI.createVirtualRegister(AddrRegClass);
24533     Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
24534     Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
24535                                 DAG.getRegister(Vreg, SPTy));
24536   } else {
24537     SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
24538     Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size);
24539     MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);
24540 
24541     const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
24542     Register SPReg = RegInfo->getStackRegister();
24543     SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
24544     Chain = SP.getValue(1);
24545 
24546     if (Alignment) {
24547       SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
24548                        DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
24549       Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
24550     }
24551 
24552     Result = SP;
24553   }
24554 
24555   Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
24556                              DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
24557 
24558   SDValue Ops[2] = {Result, Chain};
24559   return DAG.getMergeValues(Ops, dl);
24560 }
24561 
24562 SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
24563   MachineFunction &MF = DAG.getMachineFunction();
24564   auto PtrVT = getPointerTy(MF.getDataLayout());
24565   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
24566 
24567   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
24568   SDLoc DL(Op);
24569 
24570   if (!Subtarget.is64Bit() ||
24571       Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
24572     // vastart just stores the address of the VarArgsFrameIndex slot into the
24573     // memory location argument.
24574     SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
24575     return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
24576                         MachinePointerInfo(SV));
24577   }
24578 
24579   // __va_list_tag:
24580   //   gp_offset         (0 - 6 * 8)
24581   //   fp_offset         (48 - 48 + 8 * 16)
24582   //   overflow_arg_area (points to parameters coming in memory).
24583   //   reg_save_area
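  // Illustrative LP64 layout of the stores below: gp_offset at +0, fp_offset
  // at +4, overflow_arg_area at +8 and reg_save_area at +16.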
24584   SmallVector<SDValue, 8> MemOps;
24585   SDValue FIN = Op.getOperand(1);
24586   // Store gp_offset
24587   SDValue Store = DAG.getStore(
24588       Op.getOperand(0), DL,
24589       DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
24590       MachinePointerInfo(SV));
24591   MemOps.push_back(Store);
24592 
24593   // Store fp_offset
24594   FIN = DAG.getMemBasePlusOffset(FIN, TypeSize::Fixed(4), DL);
24595   Store = DAG.getStore(
24596       Op.getOperand(0), DL,
24597       DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
24598       MachinePointerInfo(SV, 4));
24599   MemOps.push_back(Store);
24600 
24601   // Store ptr to overflow_arg_area
24602   FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
24603   SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
24604   Store =
24605       DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
24606   MemOps.push_back(Store);
24607 
24608   // Store ptr to reg_save_area.
24609   FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
24610       Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
24611   SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
24612   Store = DAG.getStore(
24613       Op.getOperand(0), DL, RSFIN, FIN,
24614       MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
24615   MemOps.push_back(Store);
24616   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
24617 }
24618 
24619 SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
24620   assert(Subtarget.is64Bit() &&
24621          "LowerVAARG only handles 64-bit va_arg!");
24622   assert(Op.getNumOperands() == 4);
24623 
24624   MachineFunction &MF = DAG.getMachineFunction();
24625   if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
24626     // The Win64 ABI uses char* instead of a structure.
24627     return DAG.expandVAArg(Op.getNode());
24628 
24629   SDValue Chain = Op.getOperand(0);
24630   SDValue SrcPtr = Op.getOperand(1);
24631   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
24632   unsigned Align = Op.getConstantOperandVal(3);
24633   SDLoc dl(Op);
24634 
24635   EVT ArgVT = Op.getNode()->getValueType(0);
24636   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
24637   uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
24638   uint8_t ArgMode;
24639 
24640   // Decide which area this value should be read from.
24641   // TODO: Implement the AMD64 ABI in its entirety. This simple
24642   // selection mechanism works only for the basic types.
24643   assert(ArgVT != MVT::f80 && "va_arg for f80 not yet implemented");
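  // Illustrative mapping (sketch): double (FP, 8 bytes) selects ArgMode 2 and
  // reads from fp_offset, while i32/i64 select ArgMode 1 and read gp_offset.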
24644   if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
24645     ArgMode = 2;  // Argument passed in XMM register. Use fp_offset.
24646   } else {
24647     assert(ArgVT.isInteger() && ArgSize <= 32 /*bytes*/ &&
24648            "Unhandled argument type in LowerVAARG");
24649     ArgMode = 1;  // Argument passed in GPR64 register(s). Use gp_offset.
24650   }
24651 
24652   if (ArgMode == 2) {
24653     // Sanity Check: Make sure using fp_offset makes sense.
24654     assert(!Subtarget.useSoftFloat() &&
24655            !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
24656            Subtarget.hasSSE1());
24657   }
24658 
24659   // Insert VAARG node into the DAG
24660   // VAARG returns two values: Variable Argument Address, Chain
24661   SDValue InstOps[] = {Chain, SrcPtr,
24662                        DAG.getTargetConstant(ArgSize, dl, MVT::i32),
24663                        DAG.getTargetConstant(ArgMode, dl, MVT::i8),
24664                        DAG.getTargetConstant(Align, dl, MVT::i32)};
24665   SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
24666   SDValue VAARG = DAG.getMemIntrinsicNode(
24667       Subtarget.isTarget64BitLP64() ? X86ISD::VAARG_64 : X86ISD::VAARG_X32, dl,
24668       VTs, InstOps, MVT::i64, MachinePointerInfo(SV),
24669       /*Alignment=*/None,
24670       MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
24671   Chain = VAARG.getValue(1);
24672 
24673   // Load the next argument and return it
24674   return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
24675 }
24676 
24677 static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
24678                            SelectionDAG &DAG) {
24679   // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
24680   // where a va_list is still an i8*.
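  // Illustrative: on LP64 targets this lowers to a 24-byte, 8-byte-aligned
  // memcpy of that struct; a Win64 va_copy is expanded generically instead.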
24681   assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
24682   if (Subtarget.isCallingConvWin64(
24683         DAG.getMachineFunction().getFunction().getCallingConv()))
24684     // Probably a Win64 va_copy.
24685     return DAG.expandVACopy(Op.getNode());
24686 
24687   SDValue Chain = Op.getOperand(0);
24688   SDValue DstPtr = Op.getOperand(1);
24689   SDValue SrcPtr = Op.getOperand(2);
24690   const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
24691   const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
24692   SDLoc DL(Op);
24693 
24694   return DAG.getMemcpy(
24695       Chain, DL, DstPtr, SrcPtr,
24696       DAG.getIntPtrConstant(Subtarget.isTarget64BitLP64() ? 24 : 16, DL),
24697       Align(Subtarget.isTarget64BitLP64() ? 8 : 4), /*isVolatile*/ false, false,
24698       false, MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
24699 }
24700 
24701 // Helper to get immediate/variable SSE shift opcode from other shift opcodes.
24702 static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable) {
24703   switch (Opc) {
24704   case ISD::SHL:
24705   case X86ISD::VSHL:
24706   case X86ISD::VSHLI:
24707     return IsVariable ? X86ISD::VSHL : X86ISD::VSHLI;
24708   case ISD::SRL:
24709   case X86ISD::VSRL:
24710   case X86ISD::VSRLI:
24711     return IsVariable ? X86ISD::VSRL : X86ISD::VSRLI;
24712   case ISD::SRA:
24713   case X86ISD::VSRA:
24714   case X86ISD::VSRAI:
24715     return IsVariable ? X86ISD::VSRA : X86ISD::VSRAI;
24716   }
24717   llvm_unreachable("Unknown target vector shift node");
24718 }
24719 
24720 /// Handle vector element shifts where the shift amount is a constant.
24721 /// Takes immediate version of shift as input.
24722 static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
24723                                           SDValue SrcOp, uint64_t ShiftAmt,
24724                                           SelectionDAG &DAG) {
24725   MVT ElementType = VT.getVectorElementType();
24726 
24727   // Bitcast the source vector to the output type; this is mainly necessary for
24728   // vXi8/vXi64 shifts.
24729   if (VT != SrcOp.getSimpleValueType())
24730     SrcOp = DAG.getBitcast(VT, SrcOp);
24731 
24732   // Fold this packed shift into its first operand if ShiftAmt is 0.
24733   if (ShiftAmt == 0)
24734     return SrcOp;
24735 
24736   // Check for ShiftAmt >= element width
24737   if (ShiftAmt >= ElementType.getSizeInBits()) {
24738     if (Opc == X86ISD::VSRAI)
24739       ShiftAmt = ElementType.getSizeInBits() - 1;
24740     else
24741       return DAG.getConstant(0, dl, VT);
24742   }
24743 
24744   assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
24745          && "Unknown target vector shift-by-constant node");
24746 
24747   // Fold this packed vector shift into a build vector if SrcOp is a
24748   // vector of Constants or UNDEFs.
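  // e.g. (illustrative) VSHLI <1, 2, undef, 4>, 3 folds to the build_vector
  // <8, 16, 0, 32>; undef lanes are folded to 0 so the low ShiftAmt bits of
  // each result element are still zero.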
24749   if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
24750     SmallVector<SDValue, 8> Elts;
24751     unsigned NumElts = SrcOp->getNumOperands();
24752 
24753     switch (Opc) {
24754     default: llvm_unreachable("Unknown opcode!");
24755     case X86ISD::VSHLI:
24756       for (unsigned i = 0; i != NumElts; ++i) {
24757         SDValue CurrentOp = SrcOp->getOperand(i);
24758         if (CurrentOp->isUndef()) {
24759           // Must produce 0s in the correct bits.
24760           Elts.push_back(DAG.getConstant(0, dl, ElementType));
24761           continue;
24762         }
24763         auto *ND = cast<ConstantSDNode>(CurrentOp);
24764         const APInt &C = ND->getAPIntValue();
24765         Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
24766       }
24767       break;
24768     case X86ISD::VSRLI:
24769       for (unsigned i = 0; i != NumElts; ++i) {
24770         SDValue CurrentOp = SrcOp->getOperand(i);
24771         if (CurrentOp->isUndef()) {
24772           // Must produce 0s in the correct bits.
24773           Elts.push_back(DAG.getConstant(0, dl, ElementType));
24774           continue;
24775         }
24776         auto *ND = cast<ConstantSDNode>(CurrentOp);
24777         const APInt &C = ND->getAPIntValue();
24778         Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
24779       }
24780       break;
24781     case X86ISD::VSRAI:
24782       for (unsigned i = 0; i != NumElts; ++i) {
24783         SDValue CurrentOp = SrcOp->getOperand(i);
24784         if (CurrentOp->isUndef()) {
24785           // All shifted in bits must be the same so use 0.
24786           Elts.push_back(DAG.getConstant(0, dl, ElementType));
24787           continue;
24788         }
24789         auto *ND = cast<ConstantSDNode>(CurrentOp);
24790         const APInt &C = ND->getAPIntValue();
24791         Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
24792       }
24793       break;
24794     }
24795 
24796     return DAG.getBuildVector(VT, dl, Elts);
24797   }
24798 
24799   return DAG.getNode(Opc, dl, VT, SrcOp,
24800                      DAG.getTargetConstant(ShiftAmt, dl, MVT::i8));
24801 }
24802 
24803 /// Handle vector element shifts where the shift amount may or may not be a
24804 /// constant. Takes immediate version of shift as input.
24805 static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
24806                                    SDValue SrcOp, SDValue ShAmt,
24807                                    const X86Subtarget &Subtarget,
24808                                    SelectionDAG &DAG) {
24809   MVT SVT = ShAmt.getSimpleValueType();
24810   assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
24811 
24812   // Catch shift-by-constant.
24813   if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
24814     return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
24815                                       CShAmt->getZExtValue(), DAG);
24816 
24817   // Change opcode to non-immediate version.
24818   Opc = getTargetVShiftUniformOpcode(Opc, true);
24819 
24820   // Need to build a vector containing shift amount.
24821   // SSE/AVX packed shifts only use the lower 64 bits of the shift count.
24822   // +====================+============+=======================================+
24823   // | ShAmt is           | HasSSE4.1? | Construct ShAmt vector as             |
24824   // +====================+============+=======================================+
24825   // | i64                | Yes, No    | Use ShAmt as lowest elt               |
24826   // | i32                | Yes        | zero-extend in-reg                    |
24827   // | (i32 zext(i16/i8)) | Yes        | zero-extend in-reg                    |
24828   // | (i32 zext(i16/i8)) | No         | byte-shift-in-reg                     |
24829   // | i16/i32            | No         | v4i32 build_vector(ShAmt, 0, ud, ud)  |
24830   // +====================+============+=======================================+
24831 
24832   if (SVT == MVT::i64)
24833     ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt);
24834   else if (ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
24835            ShAmt.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
24836            (ShAmt.getOperand(0).getSimpleValueType() == MVT::i16 ||
24837             ShAmt.getOperand(0).getSimpleValueType() == MVT::i8)) {
24838     ShAmt = ShAmt.getOperand(0);
24839     MVT AmtTy = ShAmt.getSimpleValueType() == MVT::i8 ? MVT::v16i8 : MVT::v8i16;
24840     ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), AmtTy, ShAmt);
24841     if (Subtarget.hasSSE41())
24842       ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
24843                           MVT::v2i64, ShAmt);
24844     else {
24845       SDValue ByteShift = DAG.getTargetConstant(
24846           (128 - AmtTy.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
24847       ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt);
24848       ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
24849                           ByteShift);
24850       ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
24851                           ByteShift);
24852     }
24853   } else if (Subtarget.hasSSE41() &&
24854              ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
24855     ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);
24856     ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
24857                         MVT::v2i64, ShAmt);
24858   } else {
24859     SDValue ShOps[4] = {ShAmt, DAG.getConstant(0, dl, SVT), DAG.getUNDEF(SVT),
24860                         DAG.getUNDEF(SVT)};
24861     ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps);
24862   }
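  // Whichever path was taken, ShAmt is now a 128-bit vector whose low 64 bits
  // hold the shift amount, which is all the packed shift instruction reads.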
24863 
24864   // The return type has to be a 128-bit type with the same element
24865   // type as the input type.
24866   MVT EltVT = VT.getVectorElementType();
24867   MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());
24868 
24869   ShAmt = DAG.getBitcast(ShVT, ShAmt);
24870   return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
24871 }
24872 
24873 /// Return Mask with the necessary casting or extending
24874 /// for \p Mask according to \p MaskVT when lowering masking intrinsics
24875 static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
24876                            const X86Subtarget &Subtarget, SelectionDAG &DAG,
24877                            const SDLoc &dl) {
24878 
24879   if (isAllOnesConstant(Mask))
24880     return DAG.getConstant(1, dl, MaskVT);
24881   if (X86::isZeroNode(Mask))
24882     return DAG.getConstant(0, dl, MaskVT);
24883 
24884   assert(MaskVT.bitsLE(Mask.getSimpleValueType()) && "Unexpected mask size!");
24885 
24886   if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
24887     assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
24888     assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
24889     // In 32-bit mode bitcasting i64 is illegal; split it into two i32 halves.
24890     SDValue Lo, Hi;
24891     Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
24892                         DAG.getConstant(0, dl, MVT::i32));
24893     Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
24894                         DAG.getConstant(1, dl, MVT::i32));
24895 
24896     Lo = DAG.getBitcast(MVT::v32i1, Lo);
24897     Hi = DAG.getBitcast(MVT::v32i1, Hi);
24898 
24899     return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
24900   } else {
24901     MVT BitcastVT = MVT::getVectorVT(MVT::i1,
24902                                      Mask.getSimpleValueType().getSizeInBits());
24903     // When MaskVT equals v2i1 or v4i1, the low 2 or 4 elements are
24904     // extracted by EXTRACT_SUBVECTOR.
24905     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
24906                        DAG.getBitcast(BitcastVT, Mask),
24907                        DAG.getIntPtrConstant(0, dl));
24908   }
24909 }
24910 
24911 /// Return (and \p Op, \p Mask) for compare instructions or
24912 /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
24913 /// necessary casting or extending for \p Mask when lowering masking intrinsics
24914 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
24915                   SDValue PreservedSrc,
24916                   const X86Subtarget &Subtarget,
24917                   SelectionDAG &DAG) {
24918   MVT VT = Op.getSimpleValueType();
24919   MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
24920   unsigned OpcodeSelect = ISD::VSELECT;
24921   SDLoc dl(Op);
24922 
24923   if (isAllOnesConstant(Mask))
24924     return Op;
24925 
24926   SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
24927 
24928   if (PreservedSrc.isUndef())
24929     PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
24930   return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
24931 }
24932 
24933 /// Creates an SDNode for a predicated scalar operation.
24934 /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
24935 /// The mask comes in as MVT::i8 and should be transformed
24936 /// to MVT::v1i1 when lowering masking intrinsics.
24937 /// The main difference between ScalarMaskingNode and VectorMaskingNode is using
24938 /// "X86select" instead of "vselect". We just can't create the "vselect" node
24939 /// for a scalar instruction.
24940 static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
24941                                     SDValue PreservedSrc,
24942                                     const X86Subtarget &Subtarget,
24943                                     SelectionDAG &DAG) {
24944 
24945   if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
24946     if (MaskConst->getZExtValue() & 0x1)
24947       return Op;
24948 
24949   MVT VT = Op.getSimpleValueType();
24950   SDLoc dl(Op);
24951 
24952   assert(Mask.getValueType() == MVT::i8 && "Unexpected type");
24953   SDValue IMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i1,
24954                               DAG.getBitcast(MVT::v8i1, Mask),
24955                               DAG.getIntPtrConstant(0, dl));
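  // Compare and fpclass nodes already produce a v1i1 mask result, so applying
  // the mask is just an AND with IMask; there is nothing to select over.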
24956   if (Op.getOpcode() == X86ISD::FSETCCM ||
24957       Op.getOpcode() == X86ISD::FSETCCM_SAE ||
24958       Op.getOpcode() == X86ISD::VFPCLASSS)
24959     return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
24960 
24961   if (PreservedSrc.isUndef())
24962     PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
24963   return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
24964 }
24965 
24966 static int getSEHRegistrationNodeSize(const Function *Fn) {
24967   if (!Fn->hasPersonalityFn())
24968     report_fatal_error(
24969         "querying registration node size for function without personality");
24970   // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
24971   // WinEHStatePass for the full struct definition.
24972   switch (classifyEHPersonality(Fn->getPersonalityFn())) {
24973   case EHPersonality::MSVC_X86SEH: return 24;
24974   case EHPersonality::MSVC_CXX: return 16;
24975   default: break;
24976   }
24977   report_fatal_error(
24978       "can only recover FP for 32-bit MSVC EH personality functions");
24979 }
24980 
24981 /// When the MSVC runtime transfers control to us, either to an outlined
24982 /// function or when returning to a parent frame after catching an exception, we
24983 /// recover the parent frame pointer by doing arithmetic on the incoming EBP.
24984 /// Here's the math:
24985 ///   RegNodeBase = EntryEBP - RegNodeSize
24986 ///   ParentFP = RegNodeBase - ParentFrameOffset
24987 /// Subtracting RegNodeSize takes us to the offset of the registration node, and
24988 /// subtracting the offset (negative on x86) takes us back to the parent FP.
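/// For illustration only (hypothetical values): with EntryEBP = 0x1000, an MSVC
/// C++ personality (RegNodeSize = 16) and ParentFrameOffset = -32, this yields
/// RegNodeBase = 0xff0 and ParentFP = 0xff0 - (-32) = 0x1010.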
24989 static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
24990                                    SDValue EntryEBP) {
24991   MachineFunction &MF = DAG.getMachineFunction();
24992   SDLoc dl;
24993 
24994   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24995   MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
24996 
24997   // It's possible that the parent function no longer has a personality function
24998   // if the exceptional code was optimized away, in which case we just return
24999   // the incoming EBP.
25000   if (!Fn->hasPersonalityFn())
25001     return EntryEBP;
25002 
25003   // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
25004   // registration, or the .set_setframe offset.
25005   MCSymbol *OffsetSym =
25006       MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
25007           GlobalValue::dropLLVMManglingEscape(Fn->getName()));
25008   SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
25009   SDValue ParentFrameOffset =
25010       DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
25011 
25012   // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
25013   // prologue to RBP in the parent function.
25014   const X86Subtarget &Subtarget =
25015       static_cast<const X86Subtarget &>(DAG.getSubtarget());
25016   if (Subtarget.is64Bit())
25017     return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
25018 
25019   int RegNodeSize = getSEHRegistrationNodeSize(Fn);
25020   // RegNodeBase = EntryEBP - RegNodeSize
25021   // ParentFP = RegNodeBase - ParentFrameOffset
25022   SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
25023                                     DAG.getConstant(RegNodeSize, dl, PtrVT));
25024   return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
25025 }
25026 
25027 SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
25028                                                    SelectionDAG &DAG) const {
25029   // Helper to detect if the operand is CUR_DIRECTION rounding mode.
25030   auto isRoundModeCurDirection = [](SDValue Rnd) {
25031     if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
25032       return C->getAPIntValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;
25033 
25034     return false;
25035   };
25036   auto isRoundModeSAE = [](SDValue Rnd) {
25037     if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
25038       unsigned RC = C->getZExtValue();
25039       if (RC & X86::STATIC_ROUNDING::NO_EXC) {
25040         // Clear the NO_EXC bit and check remaining bits.
25041         RC ^= X86::STATIC_ROUNDING::NO_EXC;
25042         // As a convenience, we accept either no other bits or an explicit
25043         // current-direction rounding mode.
25044         return RC == 0 || RC == X86::STATIC_ROUNDING::CUR_DIRECTION;
25045       }
25046     }
25047 
25048     return false;
25049   };
25050   auto isRoundModeSAEToX = [](SDValue Rnd, unsigned &RC) {
25051     if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
25052       RC = C->getZExtValue();
25053       if (RC & X86::STATIC_ROUNDING::NO_EXC) {
25054         // Clear the NO_EXC bit and check remaining bits.
25055         RC ^= X86::STATIC_ROUNDING::NO_EXC;
25056         return RC == X86::STATIC_ROUNDING::TO_NEAREST_INT ||
25057                RC == X86::STATIC_ROUNDING::TO_NEG_INF ||
25058                RC == X86::STATIC_ROUNDING::TO_POS_INF ||
25059                RC == X86::STATIC_ROUNDING::TO_ZERO;
25060       }
25061     }
25062 
25063     return false;
25064   };
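  // The three helpers above classify the immediate rounding-mode operand:
  // plain current-direction, SAE with no explicit rounding, or SAE combined
  // with one of the explicit rounding modes (returned through RC).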
25065 
25066   SDLoc dl(Op);
25067   unsigned IntNo = Op.getConstantOperandVal(0);
25068   MVT VT = Op.getSimpleValueType();
25069   const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
25070 
25071   // Propagate flags from original node to transformed node(s).
25072   SelectionDAG::FlagInserter FlagsInserter(DAG, Op->getFlags());
25073 
25074   if (IntrData) {
25075     switch(IntrData->Type) {
25076     case INTR_TYPE_1OP: {
25077       // We specify 2 possible opcodes for intrinsics with rounding modes.
25078       // First, we check whether the intrinsic may have a non-default rounding
25079       // mode (IntrData->Opc1 != 0), then we check the rounding mode operand.
25080       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25081       if (IntrWithRoundingModeOpcode != 0) {
25082         SDValue Rnd = Op.getOperand(2);
25083         unsigned RC = 0;
25084         if (isRoundModeSAEToX(Rnd, RC))
25085           return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
25086                              Op.getOperand(1),
25087                              DAG.getTargetConstant(RC, dl, MVT::i32));
25088         if (!isRoundModeCurDirection(Rnd))
25089           return SDValue();
25090       }
25091       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25092                          Op.getOperand(1));
25093     }
25094     case INTR_TYPE_1OP_SAE: {
25095       SDValue Sae = Op.getOperand(2);
25096 
25097       unsigned Opc;
25098       if (isRoundModeCurDirection(Sae))
25099         Opc = IntrData->Opc0;
25100       else if (isRoundModeSAE(Sae))
25101         Opc = IntrData->Opc1;
25102       else
25103         return SDValue();
25104 
25105       return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1));
25106     }
25107     case INTR_TYPE_2OP: {
25108       SDValue Src2 = Op.getOperand(2);
25109 
25110       // We specify 2 possible opcodes for intrinsics with rounding modes.
25111       // First, we check whether the intrinsic may have a non-default rounding
25112       // mode (IntrData->Opc1 != 0), then we check the rounding mode operand.
25113       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25114       if (IntrWithRoundingModeOpcode != 0) {
25115         SDValue Rnd = Op.getOperand(3);
25116         unsigned RC = 0;
25117         if (isRoundModeSAEToX(Rnd, RC))
25118           return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
25119                              Op.getOperand(1), Src2,
25120                              DAG.getTargetConstant(RC, dl, MVT::i32));
25121         if (!isRoundModeCurDirection(Rnd))
25122           return SDValue();
25123       }
25124 
25125       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25126                          Op.getOperand(1), Src2);
25127     }
25128     case INTR_TYPE_2OP_SAE: {
25129       SDValue Sae = Op.getOperand(3);
25130 
25131       unsigned Opc;
25132       if (isRoundModeCurDirection(Sae))
25133         Opc = IntrData->Opc0;
25134       else if (isRoundModeSAE(Sae))
25135         Opc = IntrData->Opc1;
25136       else
25137         return SDValue();
25138 
25139       return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
25140                          Op.getOperand(2));
25141     }
25142     case INTR_TYPE_3OP:
25143     case INTR_TYPE_3OP_IMM8: {
25144       SDValue Src1 = Op.getOperand(1);
25145       SDValue Src2 = Op.getOperand(2);
25146       SDValue Src3 = Op.getOperand(3);
25147 
25148       if (IntrData->Type == INTR_TYPE_3OP_IMM8 &&
25149           Src3.getValueType() != MVT::i8) {
25150         Src3 = DAG.getTargetConstant(
25151             cast<ConstantSDNode>(Src3)->getZExtValue() & 0xff, dl, MVT::i8);
25152       }
25153 
25154       // We specify 2 possible opcodes for intrinsics with rounding modes.
25155       // First, we check whether the intrinsic may have a non-default rounding
25156       // mode (IntrData->Opc1 != 0), then we check the rounding mode operand.
25157       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25158       if (IntrWithRoundingModeOpcode != 0) {
25159         SDValue Rnd = Op.getOperand(4);
25160         unsigned RC = 0;
25161         if (isRoundModeSAEToX(Rnd, RC))
25162           return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
25163                              Src1, Src2, Src3,
25164                              DAG.getTargetConstant(RC, dl, MVT::i32));
25165         if (!isRoundModeCurDirection(Rnd))
25166           return SDValue();
25167       }
25168 
25169       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25170                          {Src1, Src2, Src3});
25171     }
25172     case INTR_TYPE_4OP_IMM8: {
25173       assert(Op.getOperand(4)->getOpcode() == ISD::TargetConstant);
25174       SDValue Src4 = Op.getOperand(4);
25175       if (Src4.getValueType() != MVT::i8) {
25176         Src4 = DAG.getTargetConstant(
25177             cast<ConstantSDNode>(Src4)->getZExtValue() & 0xff, dl, MVT::i8);
25178       }
25179 
25180       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25181                          Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
25182                          Src4);
25183     }
25184     case INTR_TYPE_1OP_MASK: {
25185       SDValue Src = Op.getOperand(1);
25186       SDValue PassThru = Op.getOperand(2);
25187       SDValue Mask = Op.getOperand(3);
25188       // We add rounding mode to the Node when
25189       //   - RC Opcode is specified and
25190       //   - RC is not "current direction".
25191       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25192       if (IntrWithRoundingModeOpcode != 0) {
25193         SDValue Rnd = Op.getOperand(4);
25194         unsigned RC = 0;
25195         if (isRoundModeSAEToX(Rnd, RC))
25196           return getVectorMaskingNode(
25197               DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
25198                           Src, DAG.getTargetConstant(RC, dl, MVT::i32)),
25199               Mask, PassThru, Subtarget, DAG);
25200         if (!isRoundModeCurDirection(Rnd))
25201           return SDValue();
25202       }
25203       return getVectorMaskingNode(
25204           DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru,
25205           Subtarget, DAG);
25206     }
25207     case INTR_TYPE_1OP_MASK_SAE: {
25208       SDValue Src = Op.getOperand(1);
25209       SDValue PassThru = Op.getOperand(2);
25210       SDValue Mask = Op.getOperand(3);
25211       SDValue Rnd = Op.getOperand(4);
25212 
25213       unsigned Opc;
25214       if (isRoundModeCurDirection(Rnd))
25215         Opc = IntrData->Opc0;
25216       else if (isRoundModeSAE(Rnd))
25217         Opc = IntrData->Opc1;
25218       else
25219         return SDValue();
25220 
25221       return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), Mask, PassThru,
25222                                   Subtarget, DAG);
25223     }
25224     case INTR_TYPE_SCALAR_MASK: {
25225       SDValue Src1 = Op.getOperand(1);
25226       SDValue Src2 = Op.getOperand(2);
25227       SDValue passThru = Op.getOperand(3);
25228       SDValue Mask = Op.getOperand(4);
25229       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25230       // There are 2 kinds of intrinsics in this group:
25231       // (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
25232       // (2) With rounding mode and sae - 7 operands.
25233       bool HasRounding = IntrWithRoundingModeOpcode != 0;
25234       if (Op.getNumOperands() == (5U + HasRounding)) {
25235         if (HasRounding) {
25236           SDValue Rnd = Op.getOperand(5);
25237           unsigned RC = 0;
25238           if (isRoundModeSAEToX(Rnd, RC))
25239             return getScalarMaskingNode(
25240                 DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2,
25241                             DAG.getTargetConstant(RC, dl, MVT::i32)),
25242                 Mask, passThru, Subtarget, DAG);
25243           if (!isRoundModeCurDirection(Rnd))
25244             return SDValue();
25245         }
25246         return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
25247                                                 Src2),
25248                                     Mask, passThru, Subtarget, DAG);
25249       }
25250 
25251       assert(Op.getNumOperands() == (6U + HasRounding) &&
25252              "Unexpected intrinsic form");
25253       SDValue RoundingMode = Op.getOperand(5);
25254       unsigned Opc = IntrData->Opc0;
25255       if (HasRounding) {
25256         SDValue Sae = Op.getOperand(6);
25257         if (isRoundModeSAE(Sae))
25258           Opc = IntrWithRoundingModeOpcode;
25259         else if (!isRoundModeCurDirection(Sae))
25260           return SDValue();
25261       }
25262       return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1,
25263                                               Src2, RoundingMode),
25264                                   Mask, passThru, Subtarget, DAG);
25265     }
25266     case INTR_TYPE_SCALAR_MASK_RND: {
25267       SDValue Src1 = Op.getOperand(1);
25268       SDValue Src2 = Op.getOperand(2);
25269       SDValue passThru = Op.getOperand(3);
25270       SDValue Mask = Op.getOperand(4);
25271       SDValue Rnd = Op.getOperand(5);
25272 
25273       SDValue NewOp;
25274       unsigned RC = 0;
25275       if (isRoundModeCurDirection(Rnd))
25276         NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
25277       else if (isRoundModeSAEToX(Rnd, RC))
25278         NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
25279                             DAG.getTargetConstant(RC, dl, MVT::i32));
25280       else
25281         return SDValue();
25282 
25283       return getScalarMaskingNode(NewOp, Mask, passThru, Subtarget, DAG);
25284     }
25285     case INTR_TYPE_SCALAR_MASK_SAE: {
25286       SDValue Src1 = Op.getOperand(1);
25287       SDValue Src2 = Op.getOperand(2);
25288       SDValue passThru = Op.getOperand(3);
25289       SDValue Mask = Op.getOperand(4);
25290       SDValue Sae = Op.getOperand(5);
25291       unsigned Opc;
25292       if (isRoundModeCurDirection(Sae))
25293         Opc = IntrData->Opc0;
25294       else if (isRoundModeSAE(Sae))
25295         Opc = IntrData->Opc1;
25296       else
25297         return SDValue();
25298 
25299       return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
25300                                   Mask, passThru, Subtarget, DAG);
25301     }
25302     case INTR_TYPE_2OP_MASK: {
25303       SDValue Src1 = Op.getOperand(1);
25304       SDValue Src2 = Op.getOperand(2);
25305       SDValue PassThru = Op.getOperand(3);
25306       SDValue Mask = Op.getOperand(4);
25307       SDValue NewOp;
25308       if (IntrData->Opc1 != 0) {
25309         SDValue Rnd = Op.getOperand(5);
25310         unsigned RC = 0;
25311         if (isRoundModeSAEToX(Rnd, RC))
25312           NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
25313                               DAG.getTargetConstant(RC, dl, MVT::i32));
25314         else if (!isRoundModeCurDirection(Rnd))
25315           return SDValue();
25316       }
25317       if (!NewOp)
25318         NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
25319       return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
25320     }
25321     case INTR_TYPE_2OP_MASK_SAE: {
25322       SDValue Src1 = Op.getOperand(1);
25323       SDValue Src2 = Op.getOperand(2);
25324       SDValue PassThru = Op.getOperand(3);
25325       SDValue Mask = Op.getOperand(4);
25326 
25327       unsigned Opc = IntrData->Opc0;
25328       if (IntrData->Opc1 != 0) {
25329         SDValue Sae = Op.getOperand(5);
25330         if (isRoundModeSAE(Sae))
25331           Opc = IntrData->Opc1;
25332         else if (!isRoundModeCurDirection(Sae))
25333           return SDValue();
25334       }
25335 
25336       return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
25337                                   Mask, PassThru, Subtarget, DAG);
25338     }
25339     case INTR_TYPE_3OP_SCALAR_MASK_SAE: {
25340       SDValue Src1 = Op.getOperand(1);
25341       SDValue Src2 = Op.getOperand(2);
25342       SDValue Src3 = Op.getOperand(3);
25343       SDValue PassThru = Op.getOperand(4);
25344       SDValue Mask = Op.getOperand(5);
25345       SDValue Sae = Op.getOperand(6);
25346       unsigned Opc;
25347       if (isRoundModeCurDirection(Sae))
25348         Opc = IntrData->Opc0;
25349       else if (isRoundModeSAE(Sae))
25350         Opc = IntrData->Opc1;
25351       else
25352         return SDValue();
25353 
25354       return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
25355                                   Mask, PassThru, Subtarget, DAG);
25356     }
25357     case INTR_TYPE_3OP_MASK_SAE: {
25358       SDValue Src1 = Op.getOperand(1);
25359       SDValue Src2 = Op.getOperand(2);
25360       SDValue Src3 = Op.getOperand(3);
25361       SDValue PassThru = Op.getOperand(4);
25362       SDValue Mask = Op.getOperand(5);
25363 
25364       unsigned Opc = IntrData->Opc0;
25365       if (IntrData->Opc1 != 0) {
25366         SDValue Sae = Op.getOperand(6);
25367         if (isRoundModeSAE(Sae))
25368           Opc = IntrData->Opc1;
25369         else if (!isRoundModeCurDirection(Sae))
25370           return SDValue();
25371       }
25372       return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
25373                                   Mask, PassThru, Subtarget, DAG);
25374     }
25375     case BLENDV: {
25376       SDValue Src1 = Op.getOperand(1);
25377       SDValue Src2 = Op.getOperand(2);
25378       SDValue Src3 = Op.getOperand(3);
25379 
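      // The condition operand may come in as a floating-point vector
      // (blendvps/blendvpd); reinterpret it as an integer vector so it can
      // serve as the select mask.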
25380       EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger();
25381       Src3 = DAG.getBitcast(MaskVT, Src3);
25382 
25383       // Reverse the operands to match VSELECT order.
25384       return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
25385     }
25386     case VPERM_2OP : {
25387       SDValue Src1 = Op.getOperand(1);
25388       SDValue Src2 = Op.getOperand(2);
25389 
25390       // Swap Src1 and Src2 in the node creation
25391       return DAG.getNode(IntrData->Opc0, dl, VT, Src2, Src1);
25392     }
25393     case IFMA_OP:
25394       // NOTE: We need to swizzle the operands to pass the multiply operands
25395       // first.
25396       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25397                          Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
25398     case FPCLASSS: {
25399       SDValue Src1 = Op.getOperand(1);
25400       SDValue Imm = Op.getOperand(2);
25401       SDValue Mask = Op.getOperand(3);
25402       SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
25403       SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
25404                                                  Subtarget, DAG);
25405       // Need to fill with zeros to ensure the bitcast will produce zeroes
25406       // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
25407       SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
25408                                 DAG.getConstant(0, dl, MVT::v8i1),
25409                                 FPclassMask, DAG.getIntPtrConstant(0, dl));
25410       return DAG.getBitcast(MVT::i8, Ins);
25411     }
25412 
25413     case CMP_MASK_CC: {
25414       MVT MaskVT = Op.getSimpleValueType();
25415       SDValue CC = Op.getOperand(3);
25416       SDValue Mask = Op.getOperand(4);
25417       // We specify 2 possible opcodes for intrinsics with rounding modes.
25418       // First, we check whether the intrinsic may have a non-default rounding
25419       // mode (IntrData->Opc1 != 0), then we check the rounding mode operand.
25420       if (IntrData->Opc1 != 0) {
25421         SDValue Sae = Op.getOperand(5);
25422         if (isRoundModeSAE(Sae))
25423           return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
25424                              Op.getOperand(2), CC, Mask, Sae);
25425         if (!isRoundModeCurDirection(Sae))
25426           return SDValue();
25427       }
25428       // Default rounding mode.
25429       return DAG.getNode(IntrData->Opc0, dl, MaskVT,
25430                          {Op.getOperand(1), Op.getOperand(2), CC, Mask});
25431     }
25432     case CMP_MASK_SCALAR_CC: {
25433       SDValue Src1 = Op.getOperand(1);
25434       SDValue Src2 = Op.getOperand(2);
25435       SDValue CC = Op.getOperand(3);
25436       SDValue Mask = Op.getOperand(4);
25437 
25438       SDValue Cmp;
25439       if (IntrData->Opc1 != 0) {
25440         SDValue Sae = Op.getOperand(5);
25441         if (isRoundModeSAE(Sae))
25442           Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae);
25443         else if (!isRoundModeCurDirection(Sae))
25444           return SDValue();
25445       }
25446       // Default rounding mode.
25447       if (!Cmp.getNode())
25448         Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
25449 
25450       SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
25451                                              Subtarget, DAG);
25452       // Need to fill with zeros to ensure the bitcast will produce zeroes
25453       // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
25454       SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
25455                                 DAG.getConstant(0, dl, MVT::v8i1),
25456                                 CmpMask, DAG.getIntPtrConstant(0, dl));
25457       return DAG.getBitcast(MVT::i8, Ins);
25458     }
25459     case COMI: { // Comparison intrinsics
25460       ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
25461       SDValue LHS = Op.getOperand(1);
25462       SDValue RHS = Op.getOperand(2);
25463       // Some conditions require the operands to be swapped.
25464       if (CC == ISD::SETLT || CC == ISD::SETLE)
25465         std::swap(LHS, RHS);
25466 
25467       SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
25468       SDValue SetCC;
25469       switch (CC) {
25470       case ISD::SETEQ: { // (ZF = 0 and PF = 0)
25471         SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
25472         SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
25473         SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
25474         break;
25475       }
25476       case ISD::SETNE: { // (ZF = 1 or PF = 1)
25477         SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
25478         SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
25479         SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
25480         break;
25481       }
25482       case ISD::SETGT: // (CF = 0 and ZF = 0)
25483       case ISD::SETLT: { // Condition opposite to GT. Operands swapped above.
25484         SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
25485         break;
25486       }
25487       case ISD::SETGE: // CF = 0
25488       case ISD::SETLE: // Condition opposite to GE. Operands swapped above.
25489         SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
25490         break;
25491       default:
25492         llvm_unreachable("Unexpected illegal condition!");
25493       }
25494       return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
25495     }
25496     case COMI_RM: { // Comparison intrinsics with Sae
25497       SDValue LHS = Op.getOperand(1);
25498       SDValue RHS = Op.getOperand(2);
25499       unsigned CondVal = Op.getConstantOperandVal(3);
25500       SDValue Sae = Op.getOperand(4);
25501 
25502       SDValue FCmp;
25503       if (isRoundModeCurDirection(Sae))
25504         FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
25505                            DAG.getTargetConstant(CondVal, dl, MVT::i8));
25506       else if (isRoundModeSAE(Sae))
25507         FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS,
25508                            DAG.getTargetConstant(CondVal, dl, MVT::i8), Sae);
25509       else
25510         return SDValue();
25511       // Need to fill with zeros to ensure the bitcast will produce zeroes
25512       // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
25513       SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
25514                                 DAG.getConstant(0, dl, MVT::v16i1),
25515                                 FCmp, DAG.getIntPtrConstant(0, dl));
25516       return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
25517                          DAG.getBitcast(MVT::i16, Ins));
25518     }
25519     case VSHIFT:
25520       return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
25521                                  Op.getOperand(1), Op.getOperand(2), Subtarget,
25522                                  DAG);
25523     case COMPRESS_EXPAND_IN_REG: {
25524       SDValue Mask = Op.getOperand(3);
25525       SDValue DataToCompress = Op.getOperand(1);
25526       SDValue PassThru = Op.getOperand(2);
25527       if (ISD::isBuildVectorAllOnes(Mask.getNode())) // return data as is
25528         return Op.getOperand(1);
25529 
25530       // Avoid false dependency.
25531       if (PassThru.isUndef())
25532         PassThru = DAG.getConstant(0, dl, VT);
25533 
25534       return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru,
25535                          Mask);
25536     }
25537     case FIXUPIMM:
25538     case FIXUPIMM_MASKZ: {
25539       SDValue Src1 = Op.getOperand(1);
25540       SDValue Src2 = Op.getOperand(2);
25541       SDValue Src3 = Op.getOperand(3);
25542       SDValue Imm = Op.getOperand(4);
25543       SDValue Mask = Op.getOperand(5);
25544       SDValue Passthru = (IntrData->Type == FIXUPIMM)
25545                              ? Src1
25546                              : getZeroVector(VT, Subtarget, DAG, dl);
25547 
25548       unsigned Opc = IntrData->Opc0;
25549       if (IntrData->Opc1 != 0) {
25550         SDValue Sae = Op.getOperand(6);
25551         if (isRoundModeSAE(Sae))
25552           Opc = IntrData->Opc1;
25553         else if (!isRoundModeCurDirection(Sae))
25554           return SDValue();
25555       }
25556 
25557       SDValue FixupImm = DAG.getNode(Opc, dl, VT, Src1, Src2, Src3, Imm);
25558 
25559       if (Opc == X86ISD::VFIXUPIMM || Opc == X86ISD::VFIXUPIMM_SAE)
25560         return getVectorMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
25561 
25562       return getScalarMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
25563     }
25564     case ROUNDP: {
25565       assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
25566       // Clear the upper bits of the rounding immediate so that the legacy
25567       // intrinsic can't trigger the scaling behavior of VRNDSCALE.
25568       auto Round = cast<ConstantSDNode>(Op.getOperand(2));
25569       SDValue RoundingMode =
25570           DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
25571       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25572                          Op.getOperand(1), RoundingMode);
25573     }
25574     case ROUNDS: {
25575       assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
25576       // Clear the upper bits of the rounding immediate so that the legacy
25577       // intrinsic can't trigger the scaling behavior of VRNDSCALE.
25578       auto Round = cast<ConstantSDNode>(Op.getOperand(3));
25579       SDValue RoundingMode =
25580           DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
25581       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25582                          Op.getOperand(1), Op.getOperand(2), RoundingMode);
25583     }
25584     case BEXTRI: {
25585       assert(IntrData->Opc0 == X86ISD::BEXTRI && "Unexpected opcode");
25586 
25587       uint64_t Imm = Op.getConstantOperandVal(2);
25588       SDValue Control = DAG.getTargetConstant(Imm & 0xffff, dl,
25589                                               Op.getValueType());
25590       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25591                          Op.getOperand(1), Control);
25592     }
25593     // ADC/ADCX/SBB
25594     case ADX: {
25595       SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
25596       SDVTList VTs = DAG.getVTList(Op.getOperand(2).getValueType(), MVT::i32);
25597 
25598       SDValue Res;
25599       // If the carry in is zero, then we should just use ADD/SUB instead of
25600       // ADC/SBB.
25601       if (isNullConstant(Op.getOperand(1))) {
25602         Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2),
25603                           Op.getOperand(3));
25604       } else {
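        // Materialize the carry flag: adding -1 to the incoming carry value
        // sets CF exactly when that value is non-zero.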
25605         SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(1),
25606                                     DAG.getConstant(-1, dl, MVT::i8));
25607         Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2),
25608                           Op.getOperand(3), GenCF.getValue(1));
25609       }
25610       SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
25611       SDValue Results[] = { SetCC, Res };
25612       return DAG.getMergeValues(Results, dl);
25613     }
25614     case CVTPD2PS_MASK:
25615     case CVTPD2DQ_MASK:
25616     case CVTQQ2PS_MASK:
25617     case TRUNCATE_TO_REG: {
25618       SDValue Src = Op.getOperand(1);
25619       SDValue PassThru = Op.getOperand(2);
25620       SDValue Mask = Op.getOperand(3);
25621 
25622       if (isAllOnesConstant(Mask))
25623         return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
25624 
25625       MVT SrcVT = Src.getSimpleValueType();
25626       MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
25627       Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
25628       return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
25629                          {Src, PassThru, Mask});
25630     }
25631     case CVTPS2PH_MASK: {
25632       SDValue Src = Op.getOperand(1);
25633       SDValue Rnd = Op.getOperand(2);
25634       SDValue PassThru = Op.getOperand(3);
25635       SDValue Mask = Op.getOperand(4);
25636 
25637       if (isAllOnesConstant(Mask))
25638         return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src, Rnd);
25639 
25640       MVT SrcVT = Src.getSimpleValueType();
25641       MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
25642       Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
25643       return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, Rnd,
25644                          PassThru, Mask);
25645 
25646     }
25647     case CVTNEPS2BF16_MASK: {
25648       SDValue Src = Op.getOperand(1);
25649       SDValue PassThru = Op.getOperand(2);
25650       SDValue Mask = Op.getOperand(3);
25651 
25652       if (ISD::isBuildVectorAllOnes(Mask.getNode()))
25653         return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
25654 
25655       // Break false dependency.
25656       if (PassThru.isUndef())
25657         PassThru = DAG.getConstant(0, dl, PassThru.getValueType());
25658 
25659       return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
25660                          Mask);
25661     }
25662     default:
25663       break;
25664     }
25665   }
25666 
25667   switch (IntNo) {
25668   default: return SDValue();    // Don't custom lower most intrinsics.
25669 
25670   // ptest and testp intrinsics. The intrinsics these come from are designed to
25671   // return an integer value, not just an instruction, so lower them to the
25672   // ptest or testp pattern and a setcc for the result.
25673   case Intrinsic::x86_avx512_ktestc_b:
25674   case Intrinsic::x86_avx512_ktestc_w:
25675   case Intrinsic::x86_avx512_ktestc_d:
25676   case Intrinsic::x86_avx512_ktestc_q:
25677   case Intrinsic::x86_avx512_ktestz_b:
25678   case Intrinsic::x86_avx512_ktestz_w:
25679   case Intrinsic::x86_avx512_ktestz_d:
25680   case Intrinsic::x86_avx512_ktestz_q:
25681   case Intrinsic::x86_sse41_ptestz:
25682   case Intrinsic::x86_sse41_ptestc:
25683   case Intrinsic::x86_sse41_ptestnzc:
25684   case Intrinsic::x86_avx_ptestz_256:
25685   case Intrinsic::x86_avx_ptestc_256:
25686   case Intrinsic::x86_avx_ptestnzc_256:
25687   case Intrinsic::x86_avx_vtestz_ps:
25688   case Intrinsic::x86_avx_vtestc_ps:
25689   case Intrinsic::x86_avx_vtestnzc_ps:
25690   case Intrinsic::x86_avx_vtestz_pd:
25691   case Intrinsic::x86_avx_vtestc_pd:
25692   case Intrinsic::x86_avx_vtestnzc_pd:
25693   case Intrinsic::x86_avx_vtestz_ps_256:
25694   case Intrinsic::x86_avx_vtestc_ps_256:
25695   case Intrinsic::x86_avx_vtestnzc_ps_256:
25696   case Intrinsic::x86_avx_vtestz_pd_256:
25697   case Intrinsic::x86_avx_vtestc_pd_256:
25698   case Intrinsic::x86_avx_vtestnzc_pd_256: {
25699     unsigned TestOpc = X86ISD::PTEST;
25700     X86::CondCode X86CC;
25701     switch (IntNo) {
25702     default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
25703     case Intrinsic::x86_avx512_ktestc_b:
25704     case Intrinsic::x86_avx512_ktestc_w:
25705     case Intrinsic::x86_avx512_ktestc_d:
25706     case Intrinsic::x86_avx512_ktestc_q:
25707       // CF = 1
25708       TestOpc = X86ISD::KTEST;
25709       X86CC = X86::COND_B;
25710       break;
25711     case Intrinsic::x86_avx512_ktestz_b:
25712     case Intrinsic::x86_avx512_ktestz_w:
25713     case Intrinsic::x86_avx512_ktestz_d:
25714     case Intrinsic::x86_avx512_ktestz_q:
25715       TestOpc = X86ISD::KTEST;
25716       X86CC = X86::COND_E;
25717       break;
25718     case Intrinsic::x86_avx_vtestz_ps:
25719     case Intrinsic::x86_avx_vtestz_pd:
25720     case Intrinsic::x86_avx_vtestz_ps_256:
25721     case Intrinsic::x86_avx_vtestz_pd_256:
25722       TestOpc = X86ISD::TESTP;
25723       LLVM_FALLTHROUGH;
25724     case Intrinsic::x86_sse41_ptestz:
25725     case Intrinsic::x86_avx_ptestz_256:
25726       // ZF = 1
25727       X86CC = X86::COND_E;
25728       break;
25729     case Intrinsic::x86_avx_vtestc_ps:
25730     case Intrinsic::x86_avx_vtestc_pd:
25731     case Intrinsic::x86_avx_vtestc_ps_256:
25732     case Intrinsic::x86_avx_vtestc_pd_256:
25733       TestOpc = X86ISD::TESTP;
25734       LLVM_FALLTHROUGH;
25735     case Intrinsic::x86_sse41_ptestc:
25736     case Intrinsic::x86_avx_ptestc_256:
25737       // CF = 1
25738       X86CC = X86::COND_B;
25739       break;
25740     case Intrinsic::x86_avx_vtestnzc_ps:
25741     case Intrinsic::x86_avx_vtestnzc_pd:
25742     case Intrinsic::x86_avx_vtestnzc_ps_256:
25743     case Intrinsic::x86_avx_vtestnzc_pd_256:
25744       TestOpc = X86ISD::TESTP;
25745       LLVM_FALLTHROUGH;
25746     case Intrinsic::x86_sse41_ptestnzc:
25747     case Intrinsic::x86_avx_ptestnzc_256:
25748       // ZF and CF = 0
25749       X86CC = X86::COND_A;
25750       break;
25751     }
25752 
25753     SDValue LHS = Op.getOperand(1);
25754     SDValue RHS = Op.getOperand(2);
25755     SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
25756     SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
25757     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
25758   }
25759 
25760   case Intrinsic::x86_sse42_pcmpistria128:
25761   case Intrinsic::x86_sse42_pcmpestria128:
25762   case Intrinsic::x86_sse42_pcmpistric128:
25763   case Intrinsic::x86_sse42_pcmpestric128:
25764   case Intrinsic::x86_sse42_pcmpistrio128:
25765   case Intrinsic::x86_sse42_pcmpestrio128:
25766   case Intrinsic::x86_sse42_pcmpistris128:
25767   case Intrinsic::x86_sse42_pcmpestris128:
25768   case Intrinsic::x86_sse42_pcmpistriz128:
25769   case Intrinsic::x86_sse42_pcmpestriz128: {
25770     unsigned Opcode;
25771     X86::CondCode X86CC;
25772     switch (IntNo) {
25773     default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
25774     case Intrinsic::x86_sse42_pcmpistria128:
25775       Opcode = X86ISD::PCMPISTR;
25776       X86CC = X86::COND_A;
25777       break;
25778     case Intrinsic::x86_sse42_pcmpestria128:
25779       Opcode = X86ISD::PCMPESTR;
25780       X86CC = X86::COND_A;
25781       break;
25782     case Intrinsic::x86_sse42_pcmpistric128:
25783       Opcode = X86ISD::PCMPISTR;
25784       X86CC = X86::COND_B;
25785       break;
25786     case Intrinsic::x86_sse42_pcmpestric128:
25787       Opcode = X86ISD::PCMPESTR;
25788       X86CC = X86::COND_B;
25789       break;
25790     case Intrinsic::x86_sse42_pcmpistrio128:
25791       Opcode = X86ISD::PCMPISTR;
25792       X86CC = X86::COND_O;
25793       break;
25794     case Intrinsic::x86_sse42_pcmpestrio128:
25795       Opcode = X86ISD::PCMPESTR;
25796       X86CC = X86::COND_O;
25797       break;
25798     case Intrinsic::x86_sse42_pcmpistris128:
25799       Opcode = X86ISD::PCMPISTR;
25800       X86CC = X86::COND_S;
25801       break;
25802     case Intrinsic::x86_sse42_pcmpestris128:
25803       Opcode = X86ISD::PCMPESTR;
25804       X86CC = X86::COND_S;
25805       break;
25806     case Intrinsic::x86_sse42_pcmpistriz128:
25807       Opcode = X86ISD::PCMPISTR;
25808       X86CC = X86::COND_E;
25809       break;
25810     case Intrinsic::x86_sse42_pcmpestriz128:
25811       Opcode = X86ISD::PCMPESTR;
25812       X86CC = X86::COND_E;
25813       break;
25814     }
25815     SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
25816     SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
25817     SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
25818     SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
25819     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
25820   }
25821 
25822   case Intrinsic::x86_sse42_pcmpistri128:
25823   case Intrinsic::x86_sse42_pcmpestri128: {
25824     unsigned Opcode;
25825     if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
25826       Opcode = X86ISD::PCMPISTR;
25827     else
25828       Opcode = X86ISD::PCMPESTR;
25829 
25830     SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
25831     SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
25832     return DAG.getNode(Opcode, dl, VTs, NewOps);
25833   }
25834 
25835   case Intrinsic::x86_sse42_pcmpistrm128:
25836   case Intrinsic::x86_sse42_pcmpestrm128: {
25837     unsigned Opcode;
25838     if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
25839       Opcode = X86ISD::PCMPISTR;
25840     else
25841       Opcode = X86ISD::PCMPESTR;
25842 
25843     SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
25844     SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
25845     return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
25846   }
25847 
25848   case Intrinsic::eh_sjlj_lsda: {
25849     MachineFunction &MF = DAG.getMachineFunction();
25850     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25851     MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
25852     auto &Context = MF.getMMI().getContext();
25853     MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
25854                                             Twine(MF.getFunctionNumber()));
25855     return DAG.getNode(getGlobalWrapperKind(), dl, VT,
25856                        DAG.getMCSymbol(S, PtrVT));
25857   }
25858 
25859   case Intrinsic::x86_seh_lsda: {
25860     // Compute the symbol for the LSDA. We know it'll get emitted later.
25861     MachineFunction &MF = DAG.getMachineFunction();
25862     SDValue Op1 = Op.getOperand(1);
25863     auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
25864     MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
25865         GlobalValue::dropLLVMManglingEscape(Fn->getName()));
25866 
25867     // Generate a simple absolute symbol reference. This intrinsic is only
25868     // supported on 32-bit Windows, which isn't PIC.
25869     SDValue Result = DAG.getMCSymbol(LSDASym, VT);
25870     return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
25871   }
25872 
25873   case Intrinsic::eh_recoverfp: {
25874     SDValue FnOp = Op.getOperand(1);
25875     SDValue IncomingFPOp = Op.getOperand(2);
25876     GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
25877     auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
25878     if (!Fn)
25879       report_fatal_error(
25880           "llvm.eh.recoverfp must take a function as the first argument");
25881     return recoverFramePointer(DAG, Fn, IncomingFPOp);
25882   }
25883 
25884   case Intrinsic::localaddress: {
25885     // Returns one of the stack, base, or frame pointer registers, depending on
25886     // which is used to reference local variables.
25887     MachineFunction &MF = DAG.getMachineFunction();
25888     const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
25889     unsigned Reg;
25890     if (RegInfo->hasBasePointer(MF))
25891       Reg = RegInfo->getBaseRegister();
25892     else { // Handles the SP or FP case.
25893       bool CantUseFP = RegInfo->hasStackRealignment(MF);
25894       if (CantUseFP)
25895         Reg = RegInfo->getPtrSizedStackRegister(MF);
25896       else
25897         Reg = RegInfo->getPtrSizedFrameRegister(MF);
25898     }
25899     return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
25900   }
25901   case Intrinsic::swift_async_context_addr: {
25902     auto &MF = DAG.getMachineFunction();
25903     auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
25904     if (Subtarget.is64Bit()) {
25905       MF.getFrameInfo().setFrameAddressIsTaken(true);
25906       X86FI->setHasSwiftAsyncContext(true);
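      // With the Swift extended frame, the async context is stored immediately
      // below the saved frame pointer, so its address is RBP - 8.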
25907       return SDValue(
25908           DAG.getMachineNode(
25909               X86::SUB64ri8, dl, MVT::i64,
25910               DAG.getCopyFromReg(DAG.getEntryNode(), dl, X86::RBP, MVT::i64),
25911               DAG.getTargetConstant(8, dl, MVT::i32)),
25912           0);
25913     } else {
25914       // 32-bit mode has no special extended frame; create or reuse an existing
25915       // stack slot.
25916       if (!X86FI->getSwiftAsyncContextFrameIdx())
25917         X86FI->setSwiftAsyncContextFrameIdx(
25918             MF.getFrameInfo().CreateStackObject(4, Align(4), false));
25919       return DAG.getFrameIndex(*X86FI->getSwiftAsyncContextFrameIdx(), MVT::i32);
25920     }
25921   }
25922   case Intrinsic::x86_avx512_vp2intersect_q_512:
25923   case Intrinsic::x86_avx512_vp2intersect_q_256:
25924   case Intrinsic::x86_avx512_vp2intersect_q_128:
25925   case Intrinsic::x86_avx512_vp2intersect_d_512:
25926   case Intrinsic::x86_avx512_vp2intersect_d_256:
25927   case Intrinsic::x86_avx512_vp2intersect_d_128: {
25928     MVT MaskVT = Op.getSimpleValueType();
25929 
25930     SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
25931     SDLoc DL(Op);
25932 
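    // VP2INTERSECT writes a pair of mask registers; the node returns them as
    // an untyped register pair from which both results are extracted below.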
25933     SDValue Operation =
25934         DAG.getNode(X86ISD::VP2INTERSECT, DL, VTs,
25935                     Op->getOperand(1), Op->getOperand(2));
25936 
25937     SDValue Result0 = DAG.getTargetExtractSubreg(X86::sub_mask_0, DL,
25938                                                  MaskVT, Operation);
25939     SDValue Result1 = DAG.getTargetExtractSubreg(X86::sub_mask_1, DL,
25940                                                  MaskVT, Operation);
25941     return DAG.getMergeValues({Result0, Result1}, DL);
25942   }
25943   case Intrinsic::x86_mmx_pslli_w:
25944   case Intrinsic::x86_mmx_pslli_d:
25945   case Intrinsic::x86_mmx_pslli_q:
25946   case Intrinsic::x86_mmx_psrli_w:
25947   case Intrinsic::x86_mmx_psrli_d:
25948   case Intrinsic::x86_mmx_psrli_q:
25949   case Intrinsic::x86_mmx_psrai_w:
25950   case Intrinsic::x86_mmx_psrai_d: {
25951     SDLoc DL(Op);
25952     SDValue ShAmt = Op.getOperand(2);
25953     // If the argument is a constant, convert it to a target constant.
25954     if (auto *C = dyn_cast<ConstantSDNode>(ShAmt)) {
25955       // Clamp out-of-bounds shift amounts, since they would otherwise be masked
25956       // to 8 bits, which may make them no longer out of bounds.
25957       unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255);
25958       if (ShiftAmount == 0)
25959         return Op.getOperand(1);
25960 
25961       return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
25962                          Op.getOperand(0), Op.getOperand(1),
25963                          DAG.getTargetConstant(ShiftAmount, DL, MVT::i32));
25964     }
25965 
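    // The shift amount is not a constant; switch to the non-immediate form of
    // the intrinsic, which takes the count in an MMX register.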
25966     unsigned NewIntrinsic;
25967     switch (IntNo) {
25968     default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
25969     case Intrinsic::x86_mmx_pslli_w:
25970       NewIntrinsic = Intrinsic::x86_mmx_psll_w;
25971       break;
25972     case Intrinsic::x86_mmx_pslli_d:
25973       NewIntrinsic = Intrinsic::x86_mmx_psll_d;
25974       break;
25975     case Intrinsic::x86_mmx_pslli_q:
25976       NewIntrinsic = Intrinsic::x86_mmx_psll_q;
25977       break;
25978     case Intrinsic::x86_mmx_psrli_w:
25979       NewIntrinsic = Intrinsic::x86_mmx_psrl_w;
25980       break;
25981     case Intrinsic::x86_mmx_psrli_d:
25982       NewIntrinsic = Intrinsic::x86_mmx_psrl_d;
25983       break;
25984     case Intrinsic::x86_mmx_psrli_q:
25985       NewIntrinsic = Intrinsic::x86_mmx_psrl_q;
25986       break;
25987     case Intrinsic::x86_mmx_psrai_w:
25988       NewIntrinsic = Intrinsic::x86_mmx_psra_w;
25989       break;
25990     case Intrinsic::x86_mmx_psrai_d:
25991       NewIntrinsic = Intrinsic::x86_mmx_psra_d;
25992       break;
25993     }
25994 
25995     // The vector shift intrinsics with scalars use 32-bit shift amounts, but
25996     // the SSE2/MMX shift instructions read 64 bits. Copy the 32 bits to an
25997     // MMX register.
25998     ShAmt = DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, ShAmt);
25999     return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
26000                        DAG.getTargetConstant(NewIntrinsic, DL,
26001                                              getPointerTy(DAG.getDataLayout())),
26002                        Op.getOperand(1), ShAmt);
26003   }
26004   }
26005 }
26006 
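/// Lower an AVX2 gather intrinsic to an X86ISD::MGATHER memory intrinsic node.
/// Returns an empty SDValue if the scale operand is not a constant.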
26007 static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
26008                                  SDValue Src, SDValue Mask, SDValue Base,
26009                                  SDValue Index, SDValue ScaleOp, SDValue Chain,
26010                                  const X86Subtarget &Subtarget) {
26011   SDLoc dl(Op);
26012   auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
26013   // Scale must be constant.
26014   if (!C)
26015     return SDValue();
26016   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26017   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26018                                         TLI.getPointerTy(DAG.getDataLayout()));
26019   EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger();
26020   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
26021   // If source is undef or we know it won't be used, use a zero vector
26022   // to break register dependency.
26023   // TODO: use undef instead and let BreakFalseDeps deal with it?
26024   if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
26025     Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
26026 
26027   // Cast mask to an integer type.
26028   Mask = DAG.getBitcast(MaskVT, Mask);
26029 
26030   MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26031 
26032   SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
26033   SDValue Res =
26034       DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
26035                               MemIntr->getMemoryVT(), MemIntr->getMemOperand());
26036   return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
26037 }
26038 
26039 static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
26040                              SDValue Src, SDValue Mask, SDValue Base,
26041                              SDValue Index, SDValue ScaleOp, SDValue Chain,
26042                              const X86Subtarget &Subtarget) {
26043   MVT VT = Op.getSimpleValueType();
26044   SDLoc dl(Op);
26045   auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
26046   // Scale must be constant.
26047   if (!C)
26048     return SDValue();
26049   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26050   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26051                                         TLI.getPointerTy(DAG.getDataLayout()));
26052   unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
26053                               VT.getVectorNumElements());
26054   MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
26055 
26056   // We support two versions of the gather intrinsics. One with scalar mask and
26057   // one with vXi1 mask. Convert scalar to vXi1 if necessary.
26058   if (Mask.getValueType() != MaskVT)
26059     Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26060 
26061   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
26062   // If source is undef or we know it won't be used, use a zero vector
26063   // to break register dependency.
26064   // TODO: use undef instead and let BreakFalseDeps deal with it?
26065   if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
26066     Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
26067 
26068   MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26069 
26070   SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
26071   SDValue Res =
26072       DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
26073                               MemIntr->getMemoryVT(), MemIntr->getMemOperand());
26074   return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
26075 }
26076 
26077 static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
26078                                SDValue Src, SDValue Mask, SDValue Base,
26079                                SDValue Index, SDValue ScaleOp, SDValue Chain,
26080                                const X86Subtarget &Subtarget) {
26081   SDLoc dl(Op);
26082   auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
26083   // Scale must be constant.
26084   if (!C)
26085     return SDValue();
26086   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26087   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26088                                         TLI.getPointerTy(DAG.getDataLayout()));
26089   unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
26090                               Src.getSimpleValueType().getVectorNumElements());
26091   MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
26092 
26093   // We support two versions of the scatter intrinsics. One with scalar mask and
26094   // one with vXi1 mask. Convert scalar to vXi1 if necessary.
26095   if (Mask.getValueType() != MaskVT)
26096     Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26097 
26098   MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26099 
26100   SDVTList VTs = DAG.getVTList(MVT::Other);
26101   SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
26102   SDValue Res =
26103       DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
26104                               MemIntr->getMemoryVT(), MemIntr->getMemOperand());
26105   return Res;
26106 }
26107 
26108 static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
26109                                SDValue Mask, SDValue Base, SDValue Index,
26110                                SDValue ScaleOp, SDValue Chain,
26111                                const X86Subtarget &Subtarget) {
26112   SDLoc dl(Op);
26113   auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
26114   // Scale must be constant.
26115   if (!C)
26116     return SDValue();
26117   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26118   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26119                                         TLI.getPointerTy(DAG.getDataLayout()));
26120   SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
26121   SDValue Segment = DAG.getRegister(0, MVT::i32);
26122   MVT MaskVT =
26123     MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
26124   SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26125   SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
26126   SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
26127   return SDValue(Res, 0);
26128 }
26129 
26130 /// Handles the lowering of builtin intrinsics with chain that return their
26131 /// value into registers EDX:EAX.
26132 /// If operand SrcReg is a valid register identifier, then operand 2 of N is
26133 /// copied to SrcReg. The assumption is that SrcReg is an implicit input to
26134 /// TargetOpcode.
26135 /// Returns a Glue value which can be used to add extra copy-from-reg if the
26136 /// expanded intrinsic implicitly defines extra registers (i.e. not just
26137 /// EDX:EAX).
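/// For example (see the callers below): RDTSC/RDTSCP are expanded with
/// SrcReg == 0 (no extra input), while RDPMC and XGETBV pass the counter/XCR
/// index to ECX by supplying SrcReg == X86::ECX.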
26138 static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL,
26139                                         SelectionDAG &DAG,
26140                                         unsigned TargetOpcode,
26141                                         unsigned SrcReg,
26142                                         const X86Subtarget &Subtarget,
26143                                         SmallVectorImpl<SDValue> &Results) {
26144   SDValue Chain = N->getOperand(0);
26145   SDValue Glue;
26146 
26147   if (SrcReg) {
26148     assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
26149     Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue);
26150     Glue = Chain.getValue(1);
26151   }
26152 
26153   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
26154   SDValue N1Ops[] = {Chain, Glue};
26155   SDNode *N1 = DAG.getMachineNode(
26156       TargetOpcode, DL, Tys, ArrayRef<SDValue>(N1Ops, Glue.getNode() ? 2 : 1));
26157   Chain = SDValue(N1, 0);
26158 
26159   // Reads the content of XCR and returns it in registers EDX:EAX.
26160   SDValue LO, HI;
26161   if (Subtarget.is64Bit()) {
26162     LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
26163     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
26164                             LO.getValue(2));
26165   } else {
26166     LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
26167     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
26168                             LO.getValue(2));
26169   }
26170   Chain = HI.getValue(1);
26171   Glue = HI.getValue(2);
26172 
26173   if (Subtarget.is64Bit()) {
26174     // Merge the two 32-bit values into a 64-bit one.
26175     SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
26176                               DAG.getConstant(32, DL, MVT::i8));
26177     Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
26178     Results.push_back(Chain);
26179     return Glue;
26180   }
26181 
26182   // Use a buildpair to merge the two 32-bit values into a 64-bit one.
26183   SDValue Ops[] = { LO, HI };
26184   SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
26185   Results.push_back(Pair);
26186   Results.push_back(Chain);
26187   return Glue;
26188 }
26189 
26190 /// Handles the lowering of builtin intrinsics that read the time stamp counter
26191 /// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
26192 /// READCYCLECOUNTER nodes.
26193 static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
26194                                     SelectionDAG &DAG,
26195                                     const X86Subtarget &Subtarget,
26196                                     SmallVectorImpl<SDValue> &Results) {
26197   // The processor's time-stamp counter (a 64-bit MSR) is stored into the
26198   // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
26199   // and the EAX register is loaded with the low-order 32 bits.
26200   SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode,
26201                                              /* NoRegister */0, Subtarget,
26202                                              Results);
26203   if (Opcode != X86::RDTSCP)
26204     return;
26205 
26206   SDValue Chain = Results[1];
26207   // Instruction RDTSCP loads the IA32_TSC_AUX MSR (address C000_0103H) into
26208   // the ECX register. Add 'ecx' explicitly to the chain.
26209   SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, Glue);
26210   Results[1] = ecx;
26211   Results.push_back(ecx.getValue(1));
26212 }
26213 
26214 static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
26215                                      SelectionDAG &DAG) {
26216   SmallVector<SDValue, 3> Results;
26217   SDLoc DL(Op);
26218   getReadTimeStampCounter(Op.getNode(), DL, X86::RDTSC, DAG, Subtarget,
26219                           Results);
26220   return DAG.getMergeValues(Results, DL);
26221 }
26222 
26223 static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
26224   MachineFunction &MF = DAG.getMachineFunction();
26225   SDValue Chain = Op.getOperand(0);
26226   SDValue RegNode = Op.getOperand(2);
26227   WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
26228   if (!EHInfo)
26229     report_fatal_error("EH registrations only live in functions using WinEH");
26230 
26231   // Cast the operand to an alloca, and remember the frame index.
26232   auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
26233   if (!FINode)
26234     report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
26235   EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
26236 
26237   // Return the chain operand without making any DAG nodes.
26238   return Chain;
26239 }
26240 
26241 static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
26242   MachineFunction &MF = DAG.getMachineFunction();
26243   SDValue Chain = Op.getOperand(0);
26244   SDValue EHGuard = Op.getOperand(2);
26245   WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
26246   if (!EHInfo)
26247     report_fatal_error("EHGuard is only live in functions using WinEH");
26248 
26249   // Cast the operand to an alloca, and remember the frame index.
26250   auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
26251   if (!FINode)
26252     report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
26253   EHInfo->EHGuardFrameIndex = FINode->getIndex();
26254 
26255   // Return the chain operand without making any DAG nodes.
26256   return Chain;
26257 }
26258 
26259 /// Emit Truncating Store with signed or unsigned saturation.
26260 static SDValue
26261 EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
26262                 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
26263                 SelectionDAG &DAG) {
26264   SDVTList VTs = DAG.getVTList(MVT::Other);
26265   SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
26266   SDValue Ops[] = { Chain, Val, Ptr, Undef };
26267   unsigned Opc = SignedSat ? X86ISD::VTRUNCSTORES : X86ISD::VTRUNCSTOREUS;
26268   return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO);
26269 }
26270 
26271 /// Emit Masked Truncating Store with signed or unsigned saturation.
26272 static SDValue
26273 EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
26274                       SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
26275                       MachineMemOperand *MMO, SelectionDAG &DAG) {
26276   SDVTList VTs = DAG.getVTList(MVT::Other);
26277   SDValue Ops[] = { Chain, Val, Ptr, Mask };
26278   unsigned Opc = SignedSat ? X86ISD::VMTRUNCSTORES : X86ISD::VMTRUNCSTOREUS;
26279   return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO);
26280 }
26281 
26282 static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
26283                                       SelectionDAG &DAG) {
26284   unsigned IntNo = Op.getConstantOperandVal(1);
26285   const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
26286   if (!IntrData) {
26287     switch (IntNo) {
26288     case llvm::Intrinsic::x86_seh_ehregnode:
26289       return MarkEHRegistrationNode(Op, DAG);
26290     case llvm::Intrinsic::x86_seh_ehguard:
26291       return MarkEHGuard(Op, DAG);
26292     case llvm::Intrinsic::x86_rdpkru: {
26293       SDLoc dl(Op);
26294       SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
26295       // Create a RDPKRU node and pass 0 to the ECX parameter.
26296       return DAG.getNode(X86ISD::RDPKRU, dl, VTs, Op.getOperand(0),
26297                          DAG.getConstant(0, dl, MVT::i32));
26298     }
26299     case llvm::Intrinsic::x86_wrpkru: {
26300       SDLoc dl(Op);
26301       // Create a WRPKRU node, pass the input to the EAX parameter,  and pass 0
26302       // to the EDX and ECX parameters.
26303       return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other,
26304                          Op.getOperand(0), Op.getOperand(2),
26305                          DAG.getConstant(0, dl, MVT::i32),
26306                          DAG.getConstant(0, dl, MVT::i32));
26307     }
26308     case llvm::Intrinsic::x86_flags_read_u32:
26309     case llvm::Intrinsic::x86_flags_read_u64:
26310     case llvm::Intrinsic::x86_flags_write_u32:
26311     case llvm::Intrinsic::x86_flags_write_u64: {
26312       // We need a frame pointer because this will get lowered to a PUSH/POP
26313       // sequence.
26314       MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
26315       MFI.setHasCopyImplyingStackAdjustment(true);
26316       // Don't do anything here, we will expand these intrinsics out later
26317       // during FinalizeISel in EmitInstrWithCustomInserter.
26318       return Op;
26319     }
26320     case Intrinsic::x86_lwpins32:
26321     case Intrinsic::x86_lwpins64:
26322     case Intrinsic::x86_umwait:
26323     case Intrinsic::x86_tpause: {
26324       SDLoc dl(Op);
26325       SDValue Chain = Op->getOperand(0);
26326       SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
26327       unsigned Opcode;
26328 
26329       switch (IntNo) {
26330       default: llvm_unreachable("Impossible intrinsic");
26331       case Intrinsic::x86_umwait:
26332         Opcode = X86ISD::UMWAIT;
26333         break;
26334       case Intrinsic::x86_tpause:
26335         Opcode = X86ISD::TPAUSE;
26336         break;
26337       case Intrinsic::x86_lwpins32:
26338       case Intrinsic::x86_lwpins64:
26339         Opcode = X86ISD::LWPINS;
26340         break;
26341       }
26342 
26343       SDValue Operation =
26344           DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
26345                       Op->getOperand(3), Op->getOperand(4));
26346       SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
26347       return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
26348                          Operation.getValue(1));
26349     }
26350     case Intrinsic::x86_enqcmd:
26351     case Intrinsic::x86_enqcmds: {
26352       SDLoc dl(Op);
26353       SDValue Chain = Op.getOperand(0);
26354       SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
26355       unsigned Opcode;
26356       switch (IntNo) {
26357       default: llvm_unreachable("Impossible intrinsic!");
26358       case Intrinsic::x86_enqcmd:
26359         Opcode = X86ISD::ENQCMD;
26360         break;
26361       case Intrinsic::x86_enqcmds:
26362         Opcode = X86ISD::ENQCMDS;
26363         break;
26364       }
26365       SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op.getOperand(2),
26366                                       Op.getOperand(3));
26367       SDValue SetCC = getSETCC(X86::COND_E, Operation.getValue(0), dl, DAG);
26368       return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
26369                          Operation.getValue(1));
26370     }
26371     case Intrinsic::x86_aesenc128kl:
26372     case Intrinsic::x86_aesdec128kl:
26373     case Intrinsic::x86_aesenc256kl:
26374     case Intrinsic::x86_aesdec256kl: {
26375       SDLoc DL(Op);
26376       SDVTList VTs = DAG.getVTList(MVT::v2i64, MVT::i32, MVT::Other);
26377       SDValue Chain = Op.getOperand(0);
26378       unsigned Opcode;
26379 
26380       switch (IntNo) {
26381       default: llvm_unreachable("Impossible intrinsic");
26382       case Intrinsic::x86_aesenc128kl:
26383         Opcode = X86ISD::AESENC128KL;
26384         break;
26385       case Intrinsic::x86_aesdec128kl:
26386         Opcode = X86ISD::AESDEC128KL;
26387         break;
26388       case Intrinsic::x86_aesenc256kl:
26389         Opcode = X86ISD::AESENC256KL;
26390         break;
26391       case Intrinsic::x86_aesdec256kl:
26392         Opcode = X86ISD::AESDEC256KL;
26393         break;
26394       }
26395 
26396       MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26397       MachineMemOperand *MMO = MemIntr->getMemOperand();
26398       EVT MemVT = MemIntr->getMemoryVT();
26399       SDValue Operation = DAG.getMemIntrinsicNode(
26400           Opcode, DL, VTs, {Chain, Op.getOperand(2), Op.getOperand(3)}, MemVT,
26401           MMO);
26402       SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(1), DL, DAG);
26403 
26404       return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
26405                          {ZF, Operation.getValue(0), Operation.getValue(2)});
26406     }
26407     case Intrinsic::x86_aesencwide128kl:
26408     case Intrinsic::x86_aesdecwide128kl:
26409     case Intrinsic::x86_aesencwide256kl:
26410     case Intrinsic::x86_aesdecwide256kl: {
26411       SDLoc DL(Op);
26412       SDVTList VTs = DAG.getVTList(
26413           {MVT::i32, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64,
26414            MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::Other});
26415       SDValue Chain = Op.getOperand(0);
26416       unsigned Opcode;
26417 
26418       switch (IntNo) {
26419       default: llvm_unreachable("Impossible intrinsic");
26420       case Intrinsic::x86_aesencwide128kl:
26421         Opcode = X86ISD::AESENCWIDE128KL;
26422         break;
26423       case Intrinsic::x86_aesdecwide128kl:
26424         Opcode = X86ISD::AESDECWIDE128KL;
26425         break;
26426       case Intrinsic::x86_aesencwide256kl:
26427         Opcode = X86ISD::AESENCWIDE256KL;
26428         break;
26429       case Intrinsic::x86_aesdecwide256kl:
26430         Opcode = X86ISD::AESDECWIDE256KL;
26431         break;
26432       }
26433 
26434       MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26435       MachineMemOperand *MMO = MemIntr->getMemOperand();
26436       EVT MemVT = MemIntr->getMemoryVT();
26437       SDValue Operation = DAG.getMemIntrinsicNode(
26438           Opcode, DL, VTs,
26439           {Chain, Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
26440            Op.getOperand(5), Op.getOperand(6), Op.getOperand(7),
26441            Op.getOperand(8), Op.getOperand(9), Op.getOperand(10)},
26442           MemVT, MMO);
26443       SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(0), DL, DAG);
26444 
26445       return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
26446                          {ZF, Operation.getValue(1), Operation.getValue(2),
26447                           Operation.getValue(3), Operation.getValue(4),
26448                           Operation.getValue(5), Operation.getValue(6),
26449                           Operation.getValue(7), Operation.getValue(8),
26450                           Operation.getValue(9)});
26451     }
26452     case Intrinsic::x86_testui: {
26453       SDLoc dl(Op);
26454       SDValue Chain = Op.getOperand(0);
26455       SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
26456       SDValue Operation = DAG.getNode(X86ISD::TESTUI, dl, VTs, Chain);
26457       SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
26458       return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
26459                          Operation.getValue(1));
26460     }
26461     }
26462     return SDValue();
26463   }
26464 
26465   SDLoc dl(Op);
26466   switch(IntrData->Type) {
26467   default: llvm_unreachable("Unknown Intrinsic Type");
26468   case RDSEED:
26469   case RDRAND: {
26470     // Emit the node with the right value type.
26471     SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
26472     SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
26473 
26474     // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
26475     // Otherwise return the value from Rand, which is always 0, cast to i32.
26476     SDValue Ops[] = {DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
26477                      DAG.getConstant(1, dl, Op->getValueType(1)),
26478                      DAG.getTargetConstant(X86::COND_B, dl, MVT::i8),
26479                      SDValue(Result.getNode(), 1)};
26480     SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
26481 
26482     // Return { result, isValid, chain }.
26483     return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
26484                        SDValue(Result.getNode(), 2));
26485   }
26486   case GATHER_AVX2: {
26487     SDValue Chain = Op.getOperand(0);
26488     SDValue Src   = Op.getOperand(2);
26489     SDValue Base  = Op.getOperand(3);
26490     SDValue Index = Op.getOperand(4);
26491     SDValue Mask  = Op.getOperand(5);
26492     SDValue Scale = Op.getOperand(6);
26493     return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
26494                              Scale, Chain, Subtarget);
26495   }
26496   case GATHER: {
26497     // gather(v1, mask, index, base, scale);
26498     SDValue Chain = Op.getOperand(0);
26499     SDValue Src   = Op.getOperand(2);
26500     SDValue Base  = Op.getOperand(3);
26501     SDValue Index = Op.getOperand(4);
26502     SDValue Mask  = Op.getOperand(5);
26503     SDValue Scale = Op.getOperand(6);
26504     return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale,
26505                          Chain, Subtarget);
26506   }
26507   case SCATTER: {
26508     // scatter(base, mask, index, v1, scale);
26509     SDValue Chain = Op.getOperand(0);
26510     SDValue Base  = Op.getOperand(2);
26511     SDValue Mask  = Op.getOperand(3);
26512     SDValue Index = Op.getOperand(4);
26513     SDValue Src   = Op.getOperand(5);
26514     SDValue Scale = Op.getOperand(6);
26515     return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
26516                           Scale, Chain, Subtarget);
26517   }
26518   case PREFETCH: {
26519     const APInt &HintVal = Op.getConstantOperandAPInt(6);
26520     assert((HintVal == 2 || HintVal == 3) &&
26521            "Wrong prefetch hint in intrinsic: should be 2 or 3");
26522     unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
26523     SDValue Chain = Op.getOperand(0);
26524     SDValue Mask  = Op.getOperand(2);
26525     SDValue Index = Op.getOperand(3);
26526     SDValue Base  = Op.getOperand(4);
26527     SDValue Scale = Op.getOperand(5);
26528     return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
26529                            Subtarget);
26530   }
26531   // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
26532   case RDTSC: {
26533     SmallVector<SDValue, 2> Results;
26534     getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
26535                             Results);
26536     return DAG.getMergeValues(Results, dl);
26537   }
26538   // Read Performance Monitoring Counters.
26539   case RDPMC:
26540   // GetExtended Control Register.
26541   case XGETBV: {
26542     SmallVector<SDValue, 2> Results;
26543 
26544     // RDPMC uses ECX to select the index of the performance counter to read.
26545     // XGETBV uses ECX to select the index of the XCR register to return.
26546     // The result is stored into registers EDX:EAX.
26547     expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX,
26548                                 Subtarget, Results);
26549     return DAG.getMergeValues(Results, dl);
26550   }
26551   // XTEST intrinsics.
26552   case XTEST: {
26553     SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
26554     SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
26555 
26556     SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
26557     SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
26558     return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
26559                        Ret, SDValue(InTrans.getNode(), 1));
26560   }
26561   case TRUNCATE_TO_MEM_VI8:
26562   case TRUNCATE_TO_MEM_VI16:
26563   case TRUNCATE_TO_MEM_VI32: {
26564     SDValue Mask = Op.getOperand(4);
26565     SDValue DataToTruncate = Op.getOperand(3);
26566     SDValue Addr = Op.getOperand(2);
26567     SDValue Chain = Op.getOperand(0);
26568 
26569     MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
26570     assert(MemIntr && "Expected MemIntrinsicSDNode!");
26571 
26572     EVT MemVT  = MemIntr->getMemoryVT();
26573 
26574     uint16_t TruncationOp = IntrData->Opc0;
26575     switch (TruncationOp) {
26576     case X86ISD::VTRUNC: {
26577       if (isAllOnesConstant(Mask)) // return just a truncate store
26578         return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
26579                                  MemIntr->getMemOperand());
26580 
26581       MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
26582       SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26583       SDValue Offset = DAG.getUNDEF(VMask.getValueType());
26584 
26585       return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, Offset, VMask,
26586                                 MemVT, MemIntr->getMemOperand(), ISD::UNINDEXED,
26587                                 true /* truncating */);
26588     }
26589     case X86ISD::VTRUNCUS:
26590     case X86ISD::VTRUNCS: {
26591       bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
26592       if (isAllOnesConstant(Mask))
26593         return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
26594                                MemIntr->getMemOperand(), DAG);
26595 
26596       MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
26597       SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26598 
26599       return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
26600                                    VMask, MemVT, MemIntr->getMemOperand(), DAG);
26601     }
26602     default:
26603       llvm_unreachable("Unsupported truncstore intrinsic");
26604     }
26605   }
26606   }
26607 }
26608 
26609 SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
26610                                            SelectionDAG &DAG) const {
26611   MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
26612   MFI.setReturnAddressIsTaken(true);
26613 
26614   if (verifyReturnAddressArgumentIsConstant(Op, DAG))
26615     return SDValue();
26616 
26617   unsigned Depth = Op.getConstantOperandVal(0);
26618   SDLoc dl(Op);
26619   EVT PtrVT = getPointerTy(DAG.getDataLayout());
26620 
26621   if (Depth > 0) {
26622     SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
26623     const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26624     SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
26625     return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
26626                        DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
26627                        MachinePointerInfo());
26628   }
26629 
26630   // Just load the return address.
26631   SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
26632   return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
26633                      MachinePointerInfo());
26634 }
26635 
26636 SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
26637                                                  SelectionDAG &DAG) const {
26638   DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
26639   return getReturnAddressFrameIndex(DAG);
26640 }
26641 
26642 SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
26643   MachineFunction &MF = DAG.getMachineFunction();
26644   MachineFrameInfo &MFI = MF.getFrameInfo();
26645   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
26646   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26647   EVT VT = Op.getValueType();
26648 
26649   MFI.setFrameAddressIsTaken(true);
26650 
26651   if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
26652     // Depth > 0 makes no sense on targets which use Windows unwind codes.  It
26653     // is not possible to crawl up the stack without looking at the unwind codes
26654     // simultaneously.
26655     int FrameAddrIndex = FuncInfo->getFAIndex();
26656     if (!FrameAddrIndex) {
26657       // Set up a frame object for the return address.
26658       unsigned SlotSize = RegInfo->getSlotSize();
26659       FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
26660           SlotSize, /*SPOffset=*/0, /*IsImmutable=*/false);
26661       FuncInfo->setFAIndex(FrameAddrIndex);
26662     }
26663     return DAG.getFrameIndex(FrameAddrIndex, VT);
26664   }
26665 
26666   unsigned FrameReg =
26667       RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
26668   SDLoc dl(Op);  // FIXME probably not meaningful
26669   unsigned Depth = Op.getConstantOperandVal(0);
26670   assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
26671           (FrameReg == X86::EBP && VT == MVT::i32)) &&
26672          "Invalid Frame Register!");
26673   SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
26674   while (Depth--)
26675     FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
26676                             MachinePointerInfo());
26677   return FrameAddr;
26678 }
26679 
26680 // FIXME? Maybe this could be a TableGen attribute on some registers and
26681 // this table could be generated automatically from RegInfo.
26682 Register X86TargetLowering::getRegisterByName(const char* RegName, LLT VT,
26683                                               const MachineFunction &MF) const {
26684   const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
26685 
26686   Register Reg = StringSwitch<unsigned>(RegName)
26687                        .Case("esp", X86::ESP)
26688                        .Case("rsp", X86::RSP)
26689                        .Case("ebp", X86::EBP)
26690                        .Case("rbp", X86::RBP)
26691                        .Default(0);
26692 
26693   if (Reg == X86::EBP || Reg == X86::RBP) {
26694     if (!TFI.hasFP(MF))
26695       report_fatal_error("register " + StringRef(RegName) +
26696                          " is allocatable: function has no frame pointer");
26697 #ifndef NDEBUG
26698     else {
26699       const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26700       Register FrameReg = RegInfo->getPtrSizedFrameRegister(MF);
26701       assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
26702              "Invalid Frame Register!");
26703     }
26704 #endif
26705   }
26706 
26707   if (Reg)
26708     return Reg;
26709 
26710   report_fatal_error("Invalid register name global variable");
26711 }
26712 
26713 SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
26714                                                      SelectionDAG &DAG) const {
26715   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26716   return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
26717 }
26718 
26719 Register X86TargetLowering::getExceptionPointerRegister(
26720     const Constant *PersonalityFn) const {
26721   if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
26722     return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
26723 
26724   return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
26725 }
26726 
26727 Register X86TargetLowering::getExceptionSelectorRegister(
26728     const Constant *PersonalityFn) const {
26729   // Funclet personalities don't use selectors (the runtime does the selection).
26730   if (isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)))
26731     return X86::NoRegister;
26732   return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
26733 }
26734 
26735 bool X86TargetLowering::needsFixedCatchObjects() const {
26736   return Subtarget.isTargetWin64();
26737 }
26738 
26739 SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
26740   SDValue Chain     = Op.getOperand(0);
26741   SDValue Offset    = Op.getOperand(1);
26742   SDValue Handler   = Op.getOperand(2);
26743   SDLoc dl      (Op);
26744 
26745   EVT PtrVT = getPointerTy(DAG.getDataLayout());
26746   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26747   Register FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
26748   assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
26749           (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
26750          "Invalid Frame Register!");
26751   SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
26752   Register StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
26753 
26754   SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
26755                                  DAG.getIntPtrConstant(RegInfo->getSlotSize(),
26756                                                        dl));
26757   StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
26758   Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
26759   Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
26760 
26761   return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
26762                      DAG.getRegister(StoreAddrReg, PtrVT));
26763 }
26764 
26765 SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
26766                                                SelectionDAG &DAG) const {
26767   SDLoc DL(Op);
26768   // If the subtarget is not 64-bit, we may need the global base reg
26769   // after isel expands pseudos, i.e., after the CGBR pass has run.
26770   // Therefore, ask for the GlobalBaseReg now, so that the pass
26771   // inserts the code for us in case we need it.
26772   // Otherwise, we would end up referencing a virtual register
26773   // that is not defined!
26774   if (!Subtarget.is64Bit()) {
26775     const X86InstrInfo *TII = Subtarget.getInstrInfo();
26776     (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
26777   }
26778   return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
26779                      DAG.getVTList(MVT::i32, MVT::Other),
26780                      Op.getOperand(0), Op.getOperand(1));
26781 }
26782 
26783 SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
26784                                                 SelectionDAG &DAG) const {
26785   SDLoc DL(Op);
26786   return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
26787                      Op.getOperand(0), Op.getOperand(1));
26788 }
26789 
26790 SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
26791                                                        SelectionDAG &DAG) const {
26792   SDLoc DL(Op);
26793   return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
26794                      Op.getOperand(0));
26795 }
26796 
26797 static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
26798   return Op.getOperand(0);
26799 }
26800 
26801 SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
26802                                                 SelectionDAG &DAG) const {
26803   SDValue Root = Op.getOperand(0);
26804   SDValue Trmp = Op.getOperand(1); // trampoline
26805   SDValue FPtr = Op.getOperand(2); // nested function
26806   SDValue Nest = Op.getOperand(3); // 'nest' parameter value
26807   SDLoc dl (Op);
26808 
26809   const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
26810   const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
26811 
26812   if (Subtarget.is64Bit()) {
26813     SDValue OutChains[6];
26814 
26815     // Large code-model.
26816     const unsigned char JMP64r  = 0xFF; // 64-bit jmp through register opcode.
26817     const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
26818 
26819     const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
26820     const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
26821 
26822     const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
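    // Trampoline byte layout produced by the stores below (sketch; offsets are
    // relative to Trmp):
    //   0..1   REX.WB, 0xB8|r11   movabsq imm64, %r11
    //   2..9   imm64 = FPtr       address of the nested function
    //   10..11 REX.WB, 0xB8|r10   movabsq imm64, %r10
    //   12..19 imm64 = Nest       the 'nest' parameter value
    //   20..21 REX.WB, 0xFF       jmpq opcode
    //   22     ModRM              selects *%r11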
26823 
26824     // Load the pointer to the nested function into R11.
26825     unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
26826     SDValue Addr = Trmp;
26827     OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
26828                                 Addr, MachinePointerInfo(TrmpAddr));
26829 
26830     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
26831                        DAG.getConstant(2, dl, MVT::i64));
26832     OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
26833                                 MachinePointerInfo(TrmpAddr, 2), Align(2));
26834 
26835     // Load the 'nest' parameter value into R10.
26836     // R10 is specified in X86CallingConv.td
26837     OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
26838     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
26839                        DAG.getConstant(10, dl, MVT::i64));
26840     OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
26841                                 Addr, MachinePointerInfo(TrmpAddr, 10));
26842 
26843     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
26844                        DAG.getConstant(12, dl, MVT::i64));
26845     OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
26846                                 MachinePointerInfo(TrmpAddr, 12), Align(2));
26847 
26848     // Jump to the nested function.
26849     OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
26850     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
26851                        DAG.getConstant(20, dl, MVT::i64));
26852     OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
26853                                 Addr, MachinePointerInfo(TrmpAddr, 20));
26854 
26855     unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
26856     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
26857                        DAG.getConstant(22, dl, MVT::i64));
26858     OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
26859                                 Addr, MachinePointerInfo(TrmpAddr, 22));
26860 
26861     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
26862   } else {
26863     const Function *Func =
26864       cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
26865     CallingConv::ID CC = Func->getCallingConv();
26866     unsigned NestReg;
26867 
26868     switch (CC) {
26869     default:
26870       llvm_unreachable("Unsupported calling convention");
26871     case CallingConv::C:
26872     case CallingConv::X86_StdCall: {
26873       // Pass 'nest' parameter in ECX.
26874       // Must be kept in sync with X86CallingConv.td
26875       NestReg = X86::ECX;
26876 
26877       // Check that ECX wasn't needed by an 'inreg' parameter.
26878       FunctionType *FTy = Func->getFunctionType();
26879       const AttributeList &Attrs = Func->getAttributes();
26880 
26881       if (!Attrs.isEmpty() && !Func->isVarArg()) {
26882         unsigned InRegCount = 0;
26883         unsigned Idx = 1;
26884 
26885         for (FunctionType::param_iterator I = FTy->param_begin(),
26886              E = FTy->param_end(); I != E; ++I, ++Idx)
26887           if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
26888             const DataLayout &DL = DAG.getDataLayout();
26889             // FIXME: should only count parameters that are lowered to integers.
26890             InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
26891           }
26892 
26893         if (InRegCount > 2) {
26894           report_fatal_error("Nest register in use - reduce number of inreg"
26895                              " parameters!");
26896         }
26897       }
26898       break;
26899     }
26900     case CallingConv::X86_FastCall:
26901     case CallingConv::X86_ThisCall:
26902     case CallingConv::Fast:
26903     case CallingConv::Tail:
26904     case CallingConv::SwiftTail:
26905       // Pass 'nest' parameter in EAX.
26906       // Must be kept in sync with X86CallingConv.td
26907       NestReg = X86::EAX;
26908       break;
26909     }
26910 
26911     SDValue OutChains[4];
26912     SDValue Addr, Disp;
26913 
26914     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
26915                        DAG.getConstant(10, dl, MVT::i32));
26916     Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
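    // Trampoline byte layout produced by the stores below (sketch; offsets are
    // relative to Trmp):
    //   0     0xB8 | NestReg   mov imm32 into ECX/EAX
    //   1..4  imm32 = Nest     the 'nest' parameter value
    //   5     0xE9             jmp rel32
    //   6..9  rel32 = Disp     = FPtr - (Trmp + 10)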
26917 
26918     // This is storing the opcode for MOV32ri.
26919     const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
26920     const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
26921     OutChains[0] =
26922         DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
26923                      Trmp, MachinePointerInfo(TrmpAddr));
26924 
26925     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
26926                        DAG.getConstant(1, dl, MVT::i32));
26927     OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
26928                                 MachinePointerInfo(TrmpAddr, 1), Align(1));
26929 
26930     const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
26931     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
26932                        DAG.getConstant(5, dl, MVT::i32));
26933     OutChains[2] =
26934         DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), Addr,
26935                      MachinePointerInfo(TrmpAddr, 5), Align(1));
26936 
26937     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
26938                        DAG.getConstant(6, dl, MVT::i32));
26939     OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
26940                                 MachinePointerInfo(TrmpAddr, 6), Align(1));
26941 
26942     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
26943   }
26944 }
26945 
26946 SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
26947                                             SelectionDAG &DAG) const {
26948   /*
26949    The rounding mode is in bits 11:10 of the FP control word (FPCW), and has the following
26950    settings:
26951      00 Round to nearest
26952      01 Round to -inf
26953      10 Round to +inf
26954      11 Round to 0
26955 
26956   FLT_ROUNDS, on the other hand, expects the following:
26957     -1 Undefined
26958      0 Round to 0
26959      1 Round to nearest
26960      2 Round to +inf
26961      3 Round to -inf
26962 
26963   To perform the conversion, we use a packed lookup table of the four 2-bit
26964   values that we can index by FPCW[11:10]
26965     0x2d --> (0b00,10,11,01) --> (0,2,3,1) >> FPCW[11:10]
26966 
26967     (0x2d >> ((FPCW & 0xc00) >> 9)) & 3
26968   */
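  // Worked values for the lookup above (illustrative):
  //   FPCW[11:10] = 00 (nearest) -> (0x2d >> 0) & 3 = 1
  //   FPCW[11:10] = 01 (-inf)    -> (0x2d >> 2) & 3 = 3
  //   FPCW[11:10] = 10 (+inf)    -> (0x2d >> 4) & 3 = 2
  //   FPCW[11:10] = 11 (zero)    -> (0x2d >> 6) & 3 = 0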
26969 
26970   MachineFunction &MF = DAG.getMachineFunction();
26971   MVT VT = Op.getSimpleValueType();
26972   SDLoc DL(Op);
26973 
26974   // Save FP Control Word to stack slot
26975   int SSFI = MF.getFrameInfo().CreateStackObject(2, Align(2), false);
26976   SDValue StackSlot =
26977       DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
26978 
26979   MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
26980 
26981   SDValue Chain = Op.getOperand(0);
26982   SDValue Ops[] = {Chain, StackSlot};
26983   Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
26984                                   DAG.getVTList(MVT::Other), Ops, MVT::i16, MPI,
26985                                   Align(2), MachineMemOperand::MOStore);
26986 
26987   // Load FP Control Word from stack slot
26988   SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI, Align(2));
26989   Chain = CWD.getValue(1);
26990 
26991   // Mask and turn the control bits into a shift for the lookup table.
26992   SDValue Shift =
26993     DAG.getNode(ISD::SRL, DL, MVT::i16,
26994                 DAG.getNode(ISD::AND, DL, MVT::i16,
26995                             CWD, DAG.getConstant(0xc00, DL, MVT::i16)),
26996                 DAG.getConstant(9, DL, MVT::i8));
26997   Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Shift);
26998 
26999   SDValue LUT = DAG.getConstant(0x2d, DL, MVT::i32);
27000   SDValue RetVal =
27001     DAG.getNode(ISD::AND, DL, MVT::i32,
27002                 DAG.getNode(ISD::SRL, DL, MVT::i32, LUT, Shift),
27003                 DAG.getConstant(3, DL, MVT::i32));
27004 
27005   RetVal = DAG.getZExtOrTrunc(RetVal, DL, VT);
27006 
27007   return DAG.getMergeValues({RetVal, Chain}, DL);
27008 }
27009 
27010 SDValue X86TargetLowering::LowerSET_ROUNDING(SDValue Op,
27011                                              SelectionDAG &DAG) const {
27012   MachineFunction &MF = DAG.getMachineFunction();
27013   SDLoc DL(Op);
27014   SDValue Chain = Op.getNode()->getOperand(0);
27015 
27016   // The FP control word may be set only from data in memory, so we need to
27017   // allocate stack space to save/load the FP control word.
27018   int OldCWFrameIdx = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
27019   SDValue StackSlot =
27020       DAG.getFrameIndex(OldCWFrameIdx, getPointerTy(DAG.getDataLayout()));
27021   MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, OldCWFrameIdx);
27022   MachineMemOperand *MMO =
27023       MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 2, Align(2));
27024 
27025   // Store FP control word into memory.
27026   SDValue Ops[] = {Chain, StackSlot};
27027   Chain = DAG.getMemIntrinsicNode(
27028       X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other), Ops, MVT::i16, MMO);
27029 
27030   // Load FP Control Word from stack slot and clear RM field (bits 11:10).
27031   SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI);
27032   Chain = CWD.getValue(1);
27033   CWD = DAG.getNode(ISD::AND, DL, MVT::i16, CWD.getValue(0),
27034                     DAG.getConstant(0xf3ff, DL, MVT::i16));
27035 
27036   // Calculate new rounding mode.
27037   SDValue NewRM = Op.getNode()->getOperand(1);
27038   SDValue RMBits;
27039   if (auto *CVal = dyn_cast<ConstantSDNode>(NewRM)) {
27040     uint64_t RM = CVal->getZExtValue();
27041     int FieldVal;
27042     switch (static_cast<RoundingMode>(RM)) {
27043     case RoundingMode::NearestTiesToEven: FieldVal = X86::rmToNearest; break;
27044     case RoundingMode::TowardNegative:    FieldVal = X86::rmDownward; break;
27045     case RoundingMode::TowardPositive:    FieldVal = X86::rmUpward; break;
27046     case RoundingMode::TowardZero:        FieldVal = X86::rmTowardZero; break;
27047     default:
27048       llvm_unreachable("rounding mode is not supported by X86 hardware");
27049     }
27050     RMBits = DAG.getConstant(FieldVal, DL, MVT::i16);
27051   } else {
27052     // Need to convert argument into bits of control word:
27053     //    0 Round to 0       -> 11
27054     //    1 Round to nearest -> 00
27055     //    2 Round to +inf    -> 10
27056     //    3 Round to -inf    -> 01
27057     // The 2-bit value then needs to be shifted so that it occupies bits 11:10.
27058     // To make the conversion, put all these values into a value 0xc9 and shift
27059     // it left depending on the rounding mode:
27060     //    (0xc9 << 4) & 0xc00 = X86::rmTowardZero
27061     //    (0xc9 << 6) & 0xc00 = X86::rmToNearest
27062     //    ...
27063     // (0xc9 << (2 * NewRM + 4)) & 0xc00
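    // Worked values (illustrative):
    //   NewRM = 0 (to zero)    : (0xc9 << 4)  & 0xc00 = 0xc00 -> bits 11:10 = 11
    //   NewRM = 1 (to nearest) : (0xc9 << 6)  & 0xc00 = 0x000 -> bits 11:10 = 00
    //   NewRM = 2 (to +inf)    : (0xc9 << 8)  & 0xc00 = 0x800 -> bits 11:10 = 10
    //   NewRM = 3 (to -inf)    : (0xc9 << 10) & 0xc00 = 0x400 -> bits 11:10 = 01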
27064     SDValue ShiftValue =
27065         DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
27066                     DAG.getNode(ISD::ADD, DL, MVT::i32,
27067                                 DAG.getNode(ISD::SHL, DL, MVT::i32, NewRM,
27068                                             DAG.getConstant(1, DL, MVT::i8)),
27069                                 DAG.getConstant(4, DL, MVT::i32)));
27070     SDValue Shifted =
27071         DAG.getNode(ISD::SHL, DL, MVT::i16, DAG.getConstant(0xc9, DL, MVT::i16),
27072                     ShiftValue);
27073     RMBits = DAG.getNode(ISD::AND, DL, MVT::i16, Shifted,
27074                          DAG.getConstant(0xc00, DL, MVT::i16));
27075   }
27076 
27077   // Update rounding mode bits and store the new FP Control Word into stack.
27078   CWD = DAG.getNode(ISD::OR, DL, MVT::i16, CWD, RMBits);
27079   Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, /* Alignment = */ 2);
27080 
27081   // Load FP control word from the slot.
27082   SDValue OpsLD[] = {Chain, StackSlot};
27083   MachineMemOperand *MMOL =
27084       MF.getMachineMemOperand(MPI, MachineMemOperand::MOLoad, 2, Align(2));
27085   Chain = DAG.getMemIntrinsicNode(
27086       X86ISD::FLDCW16m, DL, DAG.getVTList(MVT::Other), OpsLD, MVT::i16, MMOL);
27087 
27088   // If the target supports SSE, set MXCSR as well. The rounding mode is encoded
27089   // in the same way but in bits 14:13.
27090   if (Subtarget.hasSSE1()) {
27091     // Store MXCSR into memory.
27092     Chain = DAG.getNode(
27093         ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
27094         DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
27095         StackSlot);
27096 
27097     // Load MXCSR from stack slot and clear RM field (bits 14:13).
27098     SDValue CWD = DAG.getLoad(MVT::i32, DL, Chain, StackSlot, MPI);
27099     Chain = CWD.getValue(1);
27100     CWD = DAG.getNode(ISD::AND, DL, MVT::i32, CWD.getValue(0),
27101                       DAG.getConstant(0xffff9fff, DL, MVT::i32));
27102 
27103     // Shift X87 RM bits from 11:10 to 14:13.
27104     RMBits = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, RMBits);
27105     RMBits = DAG.getNode(ISD::SHL, DL, MVT::i32, RMBits,
27106                          DAG.getConstant(3, DL, MVT::i8));
27107 
27108     // Update rounding mode bits and store the new FP Control Word into stack.
27109     CWD = DAG.getNode(ISD::OR, DL, MVT::i32, CWD, RMBits);
27110     Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, /* Alignment = */ 4);
27111 
27112     // Load MXCSR from the slot.
27113     Chain = DAG.getNode(
27114         ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
27115         DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
27116         StackSlot);
27117   }
27118 
27119   return Chain;
27120 }
27121 
27122 /// Lower a vector CTLZ using native supported vector CTLZ instruction.
27123 //
27124 // i8/i16 vector implemented using dword LZCNT vector instruction
27125 // ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
27126 // split the vector, perform the operation on its Lo and Hi parts and
27127 // concatenate the results.
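// Worked example (illustrative): for an i8 element x = 0x10, zext32(x) is
// 0x00000010, lzcnt32 gives 27, and 27 - (32 - 8) = 3 == ctlz(i8 0x10).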
27128 static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
27129                                          const X86Subtarget &Subtarget) {
27130   assert(Op.getOpcode() == ISD::CTLZ);
27131   SDLoc dl(Op);
27132   MVT VT = Op.getSimpleValueType();
27133   MVT EltVT = VT.getVectorElementType();
27134   unsigned NumElems = VT.getVectorNumElements();
27135 
27136   assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
27137           "Unsupported element type");
27138 
27139   // Split the vector; its Lo and Hi parts will be handled in the next iteration.
27140   if (NumElems > 16 ||
27141       (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
27142     return splitVectorIntUnary(Op, DAG);
27143 
27144   MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
27145   assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
27146           "Unsupported value type for operation");
27147 
27148   // Use native supported vector instruction vplzcntd.
27149   Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
27150   SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
27151   SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
27152   SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
27153 
27154   return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
27155 }
27156 
27157 // Lower CTLZ using a PSHUFB lookup table implementation.
27158 static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
27159                                        const X86Subtarget &Subtarget,
27160                                        SelectionDAG &DAG) {
27161   MVT VT = Op.getSimpleValueType();
27162   int NumElts = VT.getVectorNumElements();
27163   int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
27164   MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
27165 
27166   // Per-nibble leading zero PSHUFB lookup table.
27167   const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
27168                        /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
27169                        /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
27170                        /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
27171 
27172   SmallVector<SDValue, 64> LUTVec;
27173   for (int i = 0; i < NumBytes; ++i)
27174     LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
27175   SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
27176 
27177   // Begin by bitcasting the input to a byte vector, then split those bytes
27178   // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
27179   // If the hi input nibble is zero then we add both results together, otherwise
27180   // we just take the hi result (by masking the lo result to zero before the
27181   // add).
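  //
  // Worked example for one byte (illustrative only): x = 0x1A has hi nibble
  // 0x1 and lo nibble 0xA. LUT[0x1] = 3 and LUT[0xA] = 0; since the hi nibble
  // is non-zero the lo result is masked to 0, giving 3 + 0 = 3, i.e.
  // ctlz8(0x1A) = 3.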
27182   SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
27183   SDValue Zero = DAG.getConstant(0, DL, CurrVT);
27184 
27185   SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
27186   SDValue Lo = Op0;
27187   SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
27188   SDValue HiZ;
27189   if (CurrVT.is512BitVector()) {
27190     MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
27191     HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
27192     HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
27193   } else {
27194     HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
27195   }
27196 
27197   Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
27198   Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
27199   Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
27200   SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
27201 
27202   // Merge the result from vXi8 back to VT, working on the lo/hi halves
27203   // of the current vector width in the same way we did for the nibbles.
27204   // If the upper half of the input element is zero then add the halves'
27205   // leading zero counts together, otherwise just use the upper half's.
27206   // Double the width of the result until we reach the target width.
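  //
  // For example (illustrative, i8 -> i16 step): if the per-byte counts of an
  // i16 element are CHi (for the byte holding the MSB) and CLo, the merged
  // count is CHi when that high byte is non-zero (CHi < 8), and
  // CHi + CLo = 8 + CLo when it is zero, which is exactly ctlz16 of the
  // combined element.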
27207   while (CurrVT != VT) {
27208     int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
27209     int CurrNumElts = CurrVT.getVectorNumElements();
27210     MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
27211     MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
27212     SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
27213 
27214     // Check if the upper half of the input element is zero.
27215     if (CurrVT.is512BitVector()) {
27216       MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
27217       HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
27218                          DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
27219       HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
27220     } else {
27221       HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
27222                          DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
27223     }
27224     HiZ = DAG.getBitcast(NextVT, HiZ);
27225 
27226     // Move the upper/lower halves to the lower bits as we'll be extending to
27227     // NextVT. Mask the lower result to zero if HiZ is true and add the results
27228     // together.
27229     SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
27230     SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
27231     SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
27232     R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
27233     Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
27234     CurrVT = NextVT;
27235   }
27236 
27237   return Res;
27238 }
27239 
27240 static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
27241                                const X86Subtarget &Subtarget,
27242                                SelectionDAG &DAG) {
27243   MVT VT = Op.getSimpleValueType();
27244 
27245   if (Subtarget.hasCDI() &&
27246       // vXi8 vectors need to be promoted to 512-bits for vXi32.
27247       (Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
27248     return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);
27249 
27250   // Decompose 256-bit ops into smaller 128-bit ops.
27251   if (VT.is256BitVector() && !Subtarget.hasInt256())
27252     return splitVectorIntUnary(Op, DAG);
27253 
27254   // Decompose 512-bit ops into smaller 256-bit ops.
27255   if (VT.is512BitVector() && !Subtarget.hasBWI())
27256     return splitVectorIntUnary(Op, DAG);
27257 
27258   assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
27259   return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
27260 }
27261 
27262 static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
27263                          SelectionDAG &DAG) {
27264   MVT VT = Op.getSimpleValueType();
27265   MVT OpVT = VT;
27266   unsigned NumBits = VT.getSizeInBits();
27267   SDLoc dl(Op);
27268   unsigned Opc = Op.getOpcode();
27269 
27270   if (VT.isVector())
27271     return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
27272 
27273   Op = Op.getOperand(0);
27274   if (VT == MVT::i8) {
27275     // Zero extend to i32 since there is no i8 bsr.
27276     OpVT = MVT::i32;
27277     Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
27278   }
27279 
27280   // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
27281   SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
27282   Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
27283 
27284   if (Opc == ISD::CTLZ) {
27285     // If src is zero (i.e. bsr sets ZF), returns NumBits.
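    // (Illustrative, i32: for x = 0x00008000 BSR yields 15 and 15 ^ 31 = 16 =
    // ctlz32(x); for x = 0 the CMOV selects 2*NumBits-1 = 63, and 63 ^ 31
    // gives the defined CTLZ(0) result of 32.)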
27286     SDValue Ops[] = {Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
27287                      DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
27288                      Op.getValue(1)};
27289     Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
27290   }
27291 
27292   // Finally xor with NumBits-1.
27293   Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
27294                    DAG.getConstant(NumBits - 1, dl, OpVT));
27295 
27296   if (VT == MVT::i8)
27297     Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
27298   return Op;
27299 }
27300 
27301 static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
27302                          SelectionDAG &DAG) {
27303   MVT VT = Op.getSimpleValueType();
27304   unsigned NumBits = VT.getScalarSizeInBits();
27305   SDValue N0 = Op.getOperand(0);
27306   SDLoc dl(Op);
27307 
27308   assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
27309          "Only scalar CTTZ requires custom lowering");
27310 
27311   // Issue a bsf (scan bits forward) which also sets EFLAGS.
27312   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
27313   Op = DAG.getNode(X86ISD::BSF, dl, VTs, N0);
27314 
27315   // If src is zero (i.e. bsf sets ZF), returns NumBits.
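  // (Illustrative: for an i32 input 0x00000018, BSF returns 3 = cttz32; for a
  // zero input the CMOV below selects NumBits = 32 directly, so unlike the
  // CTLZ case no final XOR is needed.)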
27316   SDValue Ops[] = {Op, DAG.getConstant(NumBits, dl, VT),
27317                    DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
27318                    Op.getValue(1)};
27319   return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
27320 }
27321 
27322 static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG,
27323                            const X86Subtarget &Subtarget) {
27324   MVT VT = Op.getSimpleValueType();
27325   if (VT == MVT::i16 || VT == MVT::i32)
27326     return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
27327 
27328   if (VT == MVT::v32i16 || VT == MVT::v64i8)
27329     return splitVectorIntBinary(Op, DAG);
27330 
27331   assert(Op.getSimpleValueType().is256BitVector() &&
27332          Op.getSimpleValueType().isInteger() &&
27333          "Only handle AVX 256-bit vector integer operation");
27334   return splitVectorIntBinary(Op, DAG);
27335 }
27336 
27337 static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG,
27338                                   const X86Subtarget &Subtarget) {
27339   MVT VT = Op.getSimpleValueType();
27340   SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
27341   unsigned Opcode = Op.getOpcode();
27342   SDLoc DL(Op);
27343 
27344   if (VT == MVT::v32i16 || VT == MVT::v64i8 ||
27345       (VT.is256BitVector() && !Subtarget.hasInt256())) {
27346     assert(Op.getSimpleValueType().isInteger() &&
27347            "Only handle AVX vector integer operation");
27348     return splitVectorIntBinary(Op, DAG);
27349   }
27350 
27351   // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*.
27352   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27353   EVT SetCCResultType =
27354       TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
27355 
27356   if (Opcode == ISD::USUBSAT && !TLI.isOperationLegal(ISD::UMAX, VT)) {
27357     // usubsat X, Y --> (X >u Y) ? X - Y : 0
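    // (Illustrative, i8 lanes: X = 10, Y = 30 gives a false compare, so the
    // lane becomes 0 rather than the wrapped 10 - 30 = 236; when the setcc
    // type matches VT the all-ones/all-zeros mask lets the select fold to a
    // single AND below.)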
27358     SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
27359     SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT);
27360     // TODO: Move this to DAGCombiner?
27361     if (SetCCResultType == VT &&
27362         DAG.ComputeNumSignBits(Cmp) == VT.getScalarSizeInBits())
27363       return DAG.getNode(ISD::AND, DL, VT, Cmp, Sub);
27364     return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT));
27365   }
27366 
27367   // Use default expansion.
27368   return SDValue();
27369 }
27370 
27371 static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
27372                         SelectionDAG &DAG) {
27373   MVT VT = Op.getSimpleValueType();
27374   if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
27375     // Since X86 does not have CMOV for 8-bit integer, we don't convert
27376     // 8-bit integer abs to NEG and CMOV.
27377     SDLoc DL(Op);
27378     SDValue N0 = Op.getOperand(0);
27379     SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
27380                               DAG.getConstant(0, DL, VT), N0);
27381     SDValue Ops[] = {N0, Neg, DAG.getTargetConstant(X86::COND_GE, DL, MVT::i8),
27382                      SDValue(Neg.getNode(), 1)};
27383     return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
27384   }
27385 
27386   // ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X).
27387   if ((VT == MVT::v2i64 || VT == MVT::v4i64) && Subtarget.hasSSE41()) {
27388     SDLoc DL(Op);
27389     SDValue Src = Op.getOperand(0);
27390     SDValue Sub =
27391         DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Src);
27392     return DAG.getNode(X86ISD::BLENDV, DL, VT, Src, Sub, Src);
27393   }
27394 
27395   if (VT.is256BitVector() && !Subtarget.hasInt256()) {
27396     assert(VT.isInteger() &&
27397            "Only handle AVX 256-bit vector integer operation");
27398     return splitVectorIntUnary(Op, DAG);
27399   }
27400 
27401   if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
27402     return splitVectorIntUnary(Op, DAG);
27403 
27404   // Default to expand.
27405   return SDValue();
27406 }
27407 
27408 static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
27409   MVT VT = Op.getSimpleValueType();
27410 
27411   // For AVX1 cases, split to use legal ops (everything but v4i64).
27412   if (VT.getScalarType() != MVT::i64 && VT.is256BitVector())
27413     return splitVectorIntBinary(Op, DAG);
27414 
27415   if (VT == MVT::v32i16 || VT == MVT::v64i8)
27416     return splitVectorIntBinary(Op, DAG);
27417 
27418   // Default to expand.
27419   return SDValue();
27420 }
27421 
27422 static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
27423                         SelectionDAG &DAG) {
27424   SDLoc dl(Op);
27425   MVT VT = Op.getSimpleValueType();
27426 
27427   // Decompose 256-bit ops into 128-bit ops.
27428   if (VT.is256BitVector() && !Subtarget.hasInt256())
27429     return splitVectorIntBinary(Op, DAG);
27430 
27431   if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
27432     return splitVectorIntBinary(Op, DAG);
27433 
27434   SDValue A = Op.getOperand(0);
27435   SDValue B = Op.getOperand(1);
27436 
27437   // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
27438   // vector pairs, multiply and truncate.
27439   if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
27440     unsigned NumElts = VT.getVectorNumElements();
27441 
27442     if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
27443         (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
27444       MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
27445       return DAG.getNode(
27446           ISD::TRUNCATE, dl, VT,
27447           DAG.getNode(ISD::MUL, dl, ExVT,
27448                       DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, A),
27449                       DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, B)));
27450     }
27451 
27452     MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
27453 
27454     // Extract the lo/hi parts and any-extend them to i16.
27455     // We're going to mask off the low byte of each result element of the
27456     // pmullw, so it doesn't matter what's in the high byte of each 16-bit
27457     // element.
27458     SDValue Undef = DAG.getUNDEF(VT);
27459     SDValue ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Undef));
27460     SDValue AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Undef));
27461 
27462     SDValue BLo, BHi;
27463     if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
27464       // If the RHS is a constant, manually unpackl/unpackh.
27465       SmallVector<SDValue, 16> LoOps, HiOps;
27466       for (unsigned i = 0; i != NumElts; i += 16) {
27467         for (unsigned j = 0; j != 8; ++j) {
27468           LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl,
27469                                                MVT::i16));
27470           HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl,
27471                                                MVT::i16));
27472         }
27473       }
27474 
27475       BLo = DAG.getBuildVector(ExVT, dl, LoOps);
27476       BHi = DAG.getBuildVector(ExVT, dl, HiOps);
27477     } else {
27478       BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef));
27479       BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef));
27480     }
27481 
27482     // Multiply, mask the lower 8 bits of the lo/hi results and pack.
27483     SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
27484     SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
27485     RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
27486     RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
27487     return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
27488   }
27489 
27490   // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
27491   if (VT == MVT::v4i32) {
27492     assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
27493            "Should not custom lower when pmulld is available!");
27494 
27495     // Extract the odd parts.
27496     static const int UnpackMask[] = { 1, -1, 3, -1 };
27497     SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
27498     SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
27499 
27500     // Multiply the even parts.
27501     SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
27502                                 DAG.getBitcast(MVT::v2i64, A),
27503                                 DAG.getBitcast(MVT::v2i64, B));
27504     // Now multiply odd parts.
27505     SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
27506                                DAG.getBitcast(MVT::v2i64, Aodds),
27507                                DAG.getBitcast(MVT::v2i64, Bodds));
27508 
27509     Evens = DAG.getBitcast(VT, Evens);
27510     Odds = DAG.getBitcast(VT, Odds);
27511 
27512     // Merge the two vectors back together with a shuffle. This expands into 2
27513     // shuffles.
27514     static const int ShufMask[] = { 0, 4, 2, 6 };
27515     return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
27516   }
27517 
27518   assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
27519          "Only know how to lower V2I64/V4I64/V8I64 multiply");
27520   assert(!Subtarget.hasDQI() && "DQI should use MULLQ");
27521 
27522   //  Ahi = psrlqi(a, 32);
27523   //  Bhi = psrlqi(b, 32);
27524   //
27525   //  AloBlo = pmuludq(a, b);
27526   //  AloBhi = pmuludq(a, Bhi);
27527   //  AhiBlo = pmuludq(Ahi, b);
27528   //
27529   //  Hi = psllqi(AloBhi + AhiBlo, 32);
27530   //  return AloBlo + Hi;
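  //
  //  This follows (illustratively) from writing a = Ahi*2^32 + Alo and
  //  b = Bhi*2^32 + Blo, so that modulo 2^64:
  //    a*b = Alo*Blo + ((Alo*Bhi + Ahi*Blo) << 32)
  //  (the Ahi*Bhi term is shifted out entirely), with each partial product
  //  computed by pmuludq on the 32-bit halves.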
27531   KnownBits AKnown = DAG.computeKnownBits(A);
27532   KnownBits BKnown = DAG.computeKnownBits(B);
27533 
27534   APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
27535   bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero);
27536   bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero);
27537 
27538   APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
27539   bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
27540   bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);
27541 
27542   SDValue Zero = DAG.getConstant(0, dl, VT);
27543 
27544   // Only multiply lo/hi halves that aren't known to be zero.
27545   SDValue AloBlo = Zero;
27546   if (!ALoIsZero && !BLoIsZero)
27547     AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
27548 
27549   SDValue AloBhi = Zero;
27550   if (!ALoIsZero && !BHiIsZero) {
27551     SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
27552     AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
27553   }
27554 
27555   SDValue AhiBlo = Zero;
27556   if (!AHiIsZero && !BLoIsZero) {
27557     SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
27558     AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
27559   }
27560 
27561   SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
27562   Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
27563 
27564   return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
27565 }
27566 
27567 static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl,
27568                                      MVT VT, bool IsSigned,
27569                                      const X86Subtarget &Subtarget,
27570                                      SelectionDAG &DAG,
27571                                      SDValue *Low = nullptr) {
27572   unsigned NumElts = VT.getVectorNumElements();
27573 
27574   // For vXi8 we will unpack the low and high half of each 128 bit lane to widen
27575   // to a vXi16 type. Do the multiplies, shift the results and pack the half
27576   // lane results back together.
27577 
27578   // We'll take different approaches for signed and unsigned.
27579   // For unsigned we'll use punpcklbw/punpckhbw to zero extend the bytes
27580   // and use pmullw to calculate the full 16-bit product.
27581   // For signed we'll use punpcklbw/punpckhbw to extend the bytes to words and
27582   // shift them left into the upper byte of each word. This allows us to use
27583   // pmulhw to calculate the full 16-bit product. This trick means we don't
27584   // need to sign extend the bytes to use pmullw.
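  //
  // To see why the signed trick works (illustrative): each unpacked word
  // holds a*2^8 and b*2^8 with the byte's sign bit in the word's sign bit,
  // and pmulhw returns bits [31:16] of the signed 32-bit product, i.e.
  // (a*2^8 * b*2^8) >> 16 = a*b, the full signed 16-bit product.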
27585 
27586   MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
27587   SDValue Zero = DAG.getConstant(0, dl, VT);
27588 
27589   SDValue ALo, AHi;
27590   if (IsSigned) {
27591     ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, A));
27592     AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A));
27593   } else {
27594     ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero));
27595     AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero));
27596   }
27597 
27598   SDValue BLo, BHi;
27599   if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
27600     // If the RHS is a constant, manually unpackl/unpackh and extend.
27601     SmallVector<SDValue, 16> LoOps, HiOps;
27602     for (unsigned i = 0; i != NumElts; i += 16) {
27603       for (unsigned j = 0; j != 8; ++j) {
27604         SDValue LoOp = B.getOperand(i + j);
27605         SDValue HiOp = B.getOperand(i + j + 8);
27606 
27607         if (IsSigned) {
27608           LoOp = DAG.getAnyExtOrTrunc(LoOp, dl, MVT::i16);
27609           HiOp = DAG.getAnyExtOrTrunc(HiOp, dl, MVT::i16);
27610           LoOp = DAG.getNode(ISD::SHL, dl, MVT::i16, LoOp,
27611                              DAG.getConstant(8, dl, MVT::i16));
27612           HiOp = DAG.getNode(ISD::SHL, dl, MVT::i16, HiOp,
27613                              DAG.getConstant(8, dl, MVT::i16));
27614         } else {
27615           LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
27616           HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
27617         }
27618 
27619         LoOps.push_back(LoOp);
27620         HiOps.push_back(HiOp);
27621       }
27622     }
27623 
27624     BLo = DAG.getBuildVector(ExVT, dl, LoOps);
27625     BHi = DAG.getBuildVector(ExVT, dl, HiOps);
27626   } else if (IsSigned) {
27627     BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, B));
27628     BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, B));
27629   } else {
27630     BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Zero));
27631     BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Zero));
27632   }
27633 
27634   // Multiply, lshr the upper 8 bits to the lower 8 bits of the lo/hi results and
27635   // pack back to vXi8.
27636   unsigned MulOpc = IsSigned ? ISD::MULHS : ISD::MUL;
27637   SDValue RLo = DAG.getNode(MulOpc, dl, ExVT, ALo, BLo);
27638   SDValue RHi = DAG.getNode(MulOpc, dl, ExVT, AHi, BHi);
27639 
27640   if (Low) {
27641     // Mask the lower bits and pack the results to rejoin the halves.
27642     SDValue Mask = DAG.getConstant(255, dl, ExVT);
27643     SDValue LLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, Mask);
27644     SDValue LHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, Mask);
27645     *Low = DAG.getNode(X86ISD::PACKUS, dl, VT, LLo, LHi);
27646   }
27647 
27648   RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, RLo, 8, DAG);
27649   RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, RHi, 8, DAG);
27650 
27651   // Bitcast back to VT and then pack all the even elements from Lo and Hi.
27652   return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
27653 }
27654 
27655 static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
27656                          SelectionDAG &DAG) {
27657   SDLoc dl(Op);
27658   MVT VT = Op.getSimpleValueType();
27659   bool IsSigned = Op->getOpcode() == ISD::MULHS;
27660   unsigned NumElts = VT.getVectorNumElements();
27661   SDValue A = Op.getOperand(0);
27662   SDValue B = Op.getOperand(1);
27663 
27664   // Decompose 256-bit ops into 128-bit ops.
27665   if (VT.is256BitVector() && !Subtarget.hasInt256())
27666     return splitVectorIntBinary(Op, DAG);
27667 
27668   if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
27669     return splitVectorIntBinary(Op, DAG);
27670 
27671   if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
27672     assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
27673            (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
27674            (VT == MVT::v16i32 && Subtarget.hasAVX512()));
27675 
27676     // PMULxD operations multiply each even value (starting at 0) of LHS with
27677     // the related value of RHS and produce a widened result.
27678     // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
27679     // => <2 x i64> <ae|cg>
27680     //
27681     // In other words, to have all the results, we need to perform two PMULxD:
27682     // 1. one with the even values.
27683     // 2. one with the odd values.
27684     // To achieve #2, we need to place the odd values at an even position.
27685     //
27686     // Place the odd value at an even position (basically, shift all values 1
27687     // step to the left):
27688     const int Mask[] = {1, -1,  3, -1,  5, -1,  7, -1,
27689                         9, -1, 11, -1, 13, -1, 15, -1};
27690     // <a|b|c|d> => <b|undef|d|undef>
27691     SDValue Odd0 = DAG.getVectorShuffle(VT, dl, A, A,
27692                                         makeArrayRef(&Mask[0], NumElts));
27693     // <e|f|g|h> => <f|undef|h|undef>
27694     SDValue Odd1 = DAG.getVectorShuffle(VT, dl, B, B,
27695                                         makeArrayRef(&Mask[0], NumElts));
27696 
27697     // Emit two multiplies, one for the lower 2 ints and one for the higher 2
27698     // ints.
27699     MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
27700     unsigned Opcode =
27701         (IsSigned && Subtarget.hasSSE41()) ? X86ISD::PMULDQ : X86ISD::PMULUDQ;
27702     // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
27703     // => <2 x i64> <ae|cg>
27704     SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
27705                                                   DAG.getBitcast(MulVT, A),
27706                                                   DAG.getBitcast(MulVT, B)));
27707     // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
27708     // => <2 x i64> <bf|dh>
27709     SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
27710                                                   DAG.getBitcast(MulVT, Odd0),
27711                                                   DAG.getBitcast(MulVT, Odd1)));
27712 
27713     // Shuffle it back into the right order.
27714     SmallVector<int, 16> ShufMask(NumElts);
27715     for (int i = 0; i != (int)NumElts; ++i)
27716       ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
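    // E.g. (illustrative) for v4i32 this formula yields the mask {1, 5, 3, 7},
    // picking the high i32 halves <ae_hi, bf_hi, cg_hi, dh_hi> out of
    // Mul1 = <ae_lo, ae_hi, cg_lo, cg_hi> and Mul2 = <bf_lo, bf_hi, dh_lo, dh_hi>.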
27717 
27718     SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask);
27719 
27720     // If we have a signed multiply but no PMULDQ, fix up the result of an
27721     // unsigned multiply.
27722     if (IsSigned && !Subtarget.hasSSE41()) {
27723       SDValue Zero = DAG.getConstant(0, dl, VT);
27724       SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
27725                                DAG.getSetCC(dl, VT, Zero, A, ISD::SETGT), B);
27726       SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
27727                                DAG.getSetCC(dl, VT, Zero, B, ISD::SETGT), A);
27728 
27729       SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
27730       Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup);
27731     }
27732 
27733     return Res;
27734   }
27735 
27736   // Only i8 vectors should need custom lowering after this.
27737   assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
27738          (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
27739          "Unsupported vector type");
27740 
27741   // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
27742   // logical shift down the upper half and pack back to i8.
27743 
27744   // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
27745   // and then ashr/lshr the upper bits down to the lower bits before multiply.
27746 
27747   if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
27748       (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
27749     MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
27750     unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
27751     SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
27752     SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
27753     SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
27754     Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
27755     return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
27756   }
27757 
27758   return LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG);
27759 }
27760 
27761 // Custom lowering for SMULO/UMULO.
27762 static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget,
27763                          SelectionDAG &DAG) {
27764   MVT VT = Op.getSimpleValueType();
27765 
27766   // Scalars defer to LowerXALUO.
27767   if (!VT.isVector())
27768     return LowerXALUO(Op, DAG);
27769 
27770   SDLoc dl(Op);
27771   bool IsSigned = Op->getOpcode() == ISD::SMULO;
27772   SDValue A = Op.getOperand(0);
27773   SDValue B = Op.getOperand(1);
27774   EVT OvfVT = Op->getValueType(1);
27775 
27776   if ((VT == MVT::v32i8 && !Subtarget.hasInt256()) ||
27777       (VT == MVT::v64i8 && !Subtarget.hasBWI())) {
27778     // Extract the LHS Lo/Hi vectors
27779     SDValue LHSLo, LHSHi;
27780     std::tie(LHSLo, LHSHi) = splitVector(A, DAG, dl);
27781 
27782     // Extract the RHS Lo/Hi vectors
27783     SDValue RHSLo, RHSHi;
27784     std::tie(RHSLo, RHSHi) = splitVector(B, DAG, dl);
27785 
27786     EVT LoOvfVT, HiOvfVT;
27787     std::tie(LoOvfVT, HiOvfVT) = DAG.GetSplitDestVTs(OvfVT);
27788     SDVTList LoVTs = DAG.getVTList(LHSLo.getValueType(), LoOvfVT);
27789     SDVTList HiVTs = DAG.getVTList(LHSHi.getValueType(), HiOvfVT);
27790 
27791     // Issue the split operations.
27792     SDValue Lo = DAG.getNode(Op.getOpcode(), dl, LoVTs, LHSLo, RHSLo);
27793     SDValue Hi = DAG.getNode(Op.getOpcode(), dl, HiVTs, LHSHi, RHSHi);
27794 
27795     // Join the separate data results and the overflow results.
27796     SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
27797     SDValue Ovf = DAG.getNode(ISD::CONCAT_VECTORS, dl, OvfVT, Lo.getValue(1),
27798                               Hi.getValue(1));
27799 
27800     return DAG.getMergeValues({Res, Ovf}, dl);
27801   }
27802 
27803   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27804   EVT SetccVT =
27805       TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
27806 
27807   if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
27808       (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
27809     unsigned NumElts = VT.getVectorNumElements();
27810     MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
27811     unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
27812     SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
27813     SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
27814     SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
27815 
27816     SDValue Low = DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
27817 
27818     SDValue Ovf;
27819     if (IsSigned) {
27820       SDValue High, LowSign;
27821       if (OvfVT.getVectorElementType() == MVT::i1 &&
27822           (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
27823         // Rather than truncating, try to do the compare on vXi16 or vXi32.
27824         // Shift the high down filling with sign bits.
27825         High = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Mul, 8, DAG);
27826         // Fill all 16 bits with the sign bit from the low.
27827         LowSign =
27828             getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExVT, Mul, 8, DAG);
27829         LowSign = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, LowSign,
27830                                              15, DAG);
27831         SetccVT = OvfVT;
27832         if (!Subtarget.hasBWI()) {
27833           // We can't do a vXi16 compare so sign extend to v16i32.
27834           High = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, High);
27835           LowSign = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, LowSign);
27836         }
27837       } else {
27838         // Otherwise do the compare at vXi8.
27839         High = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
27840         High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
27841         LowSign =
27842             DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
27843       }
27844 
27845       Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
27846     } else {
27847       SDValue High =
27848           getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
27849       if (OvfVT.getVectorElementType() == MVT::i1 &&
27850           (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
27851         // Rather than truncating, try to do the compare on vXi16 or vXi32.
27852         SetccVT = OvfVT;
27853         if (!Subtarget.hasBWI()) {
27854           // We can't do a vXi16 compare so sign extend to v16i32.
27855           High = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, High);
27856         }
27857       } else {
27858         // Otherwise do the compare at vXi8.
27859         High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
27860       }
27861 
27862       Ovf =
27863           DAG.getSetCC(dl, SetccVT, High,
27864                        DAG.getConstant(0, dl, High.getValueType()), ISD::SETNE);
27865     }
27866 
27867     Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
27868 
27869     return DAG.getMergeValues({Low, Ovf}, dl);
27870   }
27871 
27872   SDValue Low;
27873   SDValue High =
27874       LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG, &Low);
27875 
27876   SDValue Ovf;
27877   if (IsSigned) {
27878     // SMULO overflows if the high bits don't match the sign of the low.
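    // E.g. (illustrative, i8 elements): 64 * 2 = 128 gives Low = 0x80 and
    // High = 0x00; sign-splatting Low yields 0xFF != 0x00, so the signed
    // 8-bit multiply overflowed even though the high byte is zero.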
27879     SDValue LowSign =
27880         DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
27881     Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
27882   } else {
27883     // UMULO overflows if the high bits are non-zero.
27884     Ovf =
27885         DAG.getSetCC(dl, SetccVT, High, DAG.getConstant(0, dl, VT), ISD::SETNE);
27886   }
27887 
27888   Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
27889 
27890   return DAG.getMergeValues({Low, Ovf}, dl);
27891 }
27892 
27893 SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
27894   assert(Subtarget.isTargetWin64() && "Unexpected target");
27895   EVT VT = Op.getValueType();
27896   assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
27897          "Unexpected return type for lowering");
27898 
27899   RTLIB::Libcall LC;
27900   bool isSigned;
27901   switch (Op->getOpcode()) {
27902   default: llvm_unreachable("Unexpected request for libcall!");
27903   case ISD::SDIV:      isSigned = true;  LC = RTLIB::SDIV_I128;    break;
27904   case ISD::UDIV:      isSigned = false; LC = RTLIB::UDIV_I128;    break;
27905   case ISD::SREM:      isSigned = true;  LC = RTLIB::SREM_I128;    break;
27906   case ISD::UREM:      isSigned = false; LC = RTLIB::UREM_I128;    break;
27907   }
27908 
27909   SDLoc dl(Op);
27910   SDValue InChain = DAG.getEntryNode();
27911 
27912   TargetLowering::ArgListTy Args;
27913   TargetLowering::ArgListEntry Entry;
27914   for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
27915     EVT ArgVT = Op->getOperand(i).getValueType();
27916     assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
27917            "Unexpected argument type for lowering");
27918     SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
27919     int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
27920     MachinePointerInfo MPI =
27921         MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
27922     Entry.Node = StackPtr;
27923     InChain =
27924         DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MPI, Align(16));
27925     Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
27926     Entry.Ty = PointerType::get(ArgTy,0);
27927     Entry.IsSExt = false;
27928     Entry.IsZExt = false;
27929     Args.push_back(Entry);
27930   }
27931 
27932   SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
27933                                          getPointerTy(DAG.getDataLayout()));
27934 
27935   TargetLowering::CallLoweringInfo CLI(DAG);
27936   CLI.setDebugLoc(dl)
27937       .setChain(InChain)
27938       .setLibCallee(
27939           getLibcallCallingConv(LC),
27940           static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
27941           std::move(Args))
27942       .setInRegister()
27943       .setSExtResult(isSigned)
27944       .setZExtResult(!isSigned);
27945 
27946   std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
27947   return DAG.getBitcast(VT, CallInfo.first);
27948 }
27949 
27950 // Return true if the required (according to Opcode) shift-imm form is natively
27951 // supported by the Subtarget
27952 static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
27953                                         unsigned Opcode) {
27954   if (VT.getScalarSizeInBits() < 16)
27955     return false;
27956 
27957   if (VT.is512BitVector() && Subtarget.hasAVX512() &&
27958       (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
27959     return true;
27960 
27961   bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
27962                 (VT.is256BitVector() && Subtarget.hasInt256());
27963 
27964   bool AShift = LShift && (Subtarget.hasAVX512() ||
27965                            (VT != MVT::v2i64 && VT != MVT::v4i64));
27966   return (Opcode == ISD::SRA) ? AShift : LShift;
27967 }
27968 
27969 // The shift amount is a variable, but it is the same for all vector lanes.
27970 // These instructions are defined together with shift-immediate.
27971 static
27972 bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
27973                                       unsigned Opcode) {
27974   return SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
27975 }
27976 
27977 // Return true if the required (according to Opcode) variable-shift form is
27978 // natively supported by the Subtarget
27979 static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
27980                                     unsigned Opcode) {
27981 
27982   if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
27983     return false;
27984 
27985   // vXi16 supported only on AVX-512, BWI
27986   if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
27987     return false;
27988 
27989   if (Subtarget.hasAVX512())
27990     return true;
27991 
27992   bool LShift = VT.is128BitVector() || VT.is256BitVector();
27993   bool AShift = LShift &&  VT != MVT::v2i64 && VT != MVT::v4i64;
27994   return (Opcode == ISD::SRA) ? AShift : LShift;
27995 }
27996 
27997 static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
27998                                          const X86Subtarget &Subtarget) {
27999   MVT VT = Op.getSimpleValueType();
28000   SDLoc dl(Op);
28001   SDValue R = Op.getOperand(0);
28002   SDValue Amt = Op.getOperand(1);
28003   unsigned X86Opc = getTargetVShiftUniformOpcode(Op.getOpcode(), false);
28004 
28005   auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
28006     assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
28007     MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
28008     SDValue Ex = DAG.getBitcast(ExVT, R);
28009 
28010     // ashr(R, 63) === cmp_slt(R, 0)
28011     if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
28012       assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
28013              "Unsupported PCMPGT op");
28014       return DAG.getNode(X86ISD::PCMPGT, dl, VT, DAG.getConstant(0, dl, VT), R);
28015     }
28016 
28017     if (ShiftAmt >= 32) {
28018       // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
28019       SDValue Upper =
28020           getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
28021       SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
28022                                                  ShiftAmt - 32, DAG);
28023       if (VT == MVT::v2i64)
28024         Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
28025       if (VT == MVT::v4i64)
28026         Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
28027                                   {9, 1, 11, 3, 13, 5, 15, 7});
28028     } else {
28029       // SRA upper i32, SRL whole i64 and select lower i32.
28030       SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
28031                                                  ShiftAmt, DAG);
28032       SDValue Lower =
28033           getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
28034       Lower = DAG.getBitcast(ExVT, Lower);
28035       if (VT == MVT::v2i64)
28036         Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
28037       if (VT == MVT::v4i64)
28038         Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
28039                                   {8, 1, 10, 3, 12, 5, 14, 7});
28040     }
28041     return DAG.getBitcast(VT, Ex);
28042   };
28043 
28044   // Optimize shl/srl/sra with constant shift amount.
28045   APInt APIntShiftAmt;
28046   if (!X86::isConstantSplat(Amt, APIntShiftAmt))
28047     return SDValue();
28048 
28049   // If the shift amount is out of range, return undef.
28050   if (APIntShiftAmt.uge(VT.getScalarSizeInBits()))
28051     return DAG.getUNDEF(VT);
28052 
28053   uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
28054 
28055   if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
28056     return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
28057 
28058   // i64 SRA needs to be performed as partial shifts.
28059   if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
28060        (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
28061       Op.getOpcode() == ISD::SRA)
28062     return ArithmeticShiftRight64(ShiftAmt);
28063 
28064   if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) ||
28065       (Subtarget.hasBWI() && VT == MVT::v64i8)) {
28066     unsigned NumElts = VT.getVectorNumElements();
28067     MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
28068 
28069     // Simple i8 add case
28070     if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
28071       return DAG.getNode(ISD::ADD, dl, VT, R, R);
28072 
28073     // ashr(R, 7)  === cmp_slt(R, 0)
28074     if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
28075       SDValue Zeros = DAG.getConstant(0, dl, VT);
28076       if (VT.is512BitVector()) {
28077         assert(VT == MVT::v64i8 && "Unexpected element type!");
28078         SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R, ISD::SETGT);
28079         return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
28080       }
28081       return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
28082     }
28083 
28084     // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
28085     if (VT == MVT::v16i8 && Subtarget.hasXOP())
28086       return SDValue();
28087 
28088     if (Op.getOpcode() == ISD::SHL) {
28089       // Make a large shift.
28090       SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R,
28091                                                ShiftAmt, DAG);
28092       SHL = DAG.getBitcast(VT, SHL);
28093       // Zero out the rightmost bits.
28094       APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt);
28095       return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(Mask, dl, VT));
28096     }
28097     if (Op.getOpcode() == ISD::SRL) {
28098       // Make a large shift.
28099       SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R,
28100                                                ShiftAmt, DAG);
28101       SRL = DAG.getBitcast(VT, SRL);
28102       // Zero out the leftmost bits.
28103       APInt Mask = APInt::getLowBitsSet(8, 8 - ShiftAmt);
28104       return DAG.getNode(ISD::AND, dl, VT, SRL, DAG.getConstant(Mask, dl, VT));
28105     }
28106     if (Op.getOpcode() == ISD::SRA) {
28107       // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
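      // Illustrative check with i8 R = 0xF0 (-16), Amt = 4: lshr gives 0x0F,
      // Mask = 128 >> 4 = 0x08, xor gives 0x07, and subtracting the mask
      // gives 0xFF (-1) = ashr(-16, 4). The xor/sub pair re-propagates the
      // sign bit that the logical shift cleared.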
28108       SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
28109 
28110       SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
28111       Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
28112       Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
28113       return Res;
28114     }
28115     llvm_unreachable("Unknown shift opcode.");
28116   }
28117 
28118   return SDValue();
28119 }
28120 
28121 static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
28122                                         const X86Subtarget &Subtarget) {
28123   MVT VT = Op.getSimpleValueType();
28124   SDLoc dl(Op);
28125   SDValue R = Op.getOperand(0);
28126   SDValue Amt = Op.getOperand(1);
28127   unsigned Opcode = Op.getOpcode();
28128   unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false);
28129   unsigned X86OpcV = getTargetVShiftUniformOpcode(Opcode, true);
28130 
28131   if (SDValue BaseShAmt = DAG.getSplatValue(Amt)) {
28132     if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode)) {
28133       MVT EltVT = VT.getVectorElementType();
28134       assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
28135       if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
28136         BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
28137       else if (EltVT.bitsLT(MVT::i32))
28138         BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
28139 
28140       return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, Subtarget, DAG);
28141     }
28142 
28143     // vXi8 shifts - shift as v8i16 + mask result.
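    // E.g. (illustrative) a uniform lshr of vXi8 by 3 becomes a vXi16 psrlw
    // by 3 followed by an AND with a splat of 0x1F, which clears the bits
    // that leaked in from each element's upper neighbour byte.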
28144     if (((VT == MVT::v16i8 && !Subtarget.canExtendTo512DQ()) ||
28145          (VT == MVT::v32i8 && !Subtarget.canExtendTo512BW()) ||
28146          VT == MVT::v64i8) &&
28147         !Subtarget.hasXOP()) {
28148       unsigned NumElts = VT.getVectorNumElements();
28149       MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
28150       if (SupportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) {
28151         unsigned LogicalOp = (Opcode == ISD::SHL ? ISD::SHL : ISD::SRL);
28152         unsigned LogicalX86Op = getTargetVShiftUniformOpcode(LogicalOp, false);
28153         BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
28154 
28155         // Create the mask using vXi16 shifts. For shift-rights we need to move
28156         // the upper byte down before splatting the vXi8 mask.
28157         SDValue BitMask = DAG.getConstant(-1, dl, ExtVT);
28158         BitMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, BitMask,
28159                                       BaseShAmt, Subtarget, DAG);
28160         if (Opcode != ISD::SHL)
28161           BitMask = getTargetVShiftByConstNode(LogicalX86Op, dl, ExtVT, BitMask,
28162                                                8, DAG);
28163         BitMask = DAG.getBitcast(VT, BitMask);
28164         BitMask = DAG.getVectorShuffle(VT, dl, BitMask, BitMask,
28165                                        SmallVector<int, 64>(NumElts, 0));
28166 
28167         SDValue Res = getTargetVShiftNode(LogicalX86Op, dl, ExtVT,
28168                                           DAG.getBitcast(ExtVT, R), BaseShAmt,
28169                                           Subtarget, DAG);
28170         Res = DAG.getBitcast(VT, Res);
28171         Res = DAG.getNode(ISD::AND, dl, VT, Res, BitMask);
28172 
28173         if (Opcode == ISD::SRA) {
28174           // ashr(R, Amt) === sub(xor(lshr(R, Amt), SignMask), SignMask)
28175           // SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW.
28176           SDValue SignMask = DAG.getConstant(0x8080, dl, ExtVT);
28177           SignMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask,
28178                                          BaseShAmt, Subtarget, DAG);
28179           SignMask = DAG.getBitcast(VT, SignMask);
28180           Res = DAG.getNode(ISD::XOR, dl, VT, Res, SignMask);
28181           Res = DAG.getNode(ISD::SUB, dl, VT, Res, SignMask);
28182         }
28183         return Res;
28184       }
28185     }
28186   }
28187 
28188   // Check cases (mainly 32-bit) where i64 is expanded into high and low parts.
28189   if (VT == MVT::v2i64 && Amt.getOpcode() == ISD::BITCAST &&
28190       Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
28191     Amt = Amt.getOperand(0);
28192     unsigned Ratio = 64 / Amt.getScalarValueSizeInBits();
28193     std::vector<SDValue> Vals(Ratio);
28194     for (unsigned i = 0; i != Ratio; ++i)
28195       Vals[i] = Amt.getOperand(i);
28196     for (unsigned i = Ratio, e = Amt.getNumOperands(); i != e; i += Ratio) {
28197       for (unsigned j = 0; j != Ratio; ++j)
28198         if (Vals[j] != Amt.getOperand(i + j))
28199           return SDValue();
28200     }
28201 
28202     if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
28203       return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
28204   }
28205   return SDValue();
28206 }
28207 
28208 // Convert a shift/rotate left amount to a multiplication scale factor.
28209 static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
28210                                        const X86Subtarget &Subtarget,
28211                                        SelectionDAG &DAG) {
28212   MVT VT = Amt.getSimpleValueType();
28213   if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
28214         (Subtarget.hasInt256() && VT == MVT::v16i16) ||
28215         (Subtarget.hasVBMI2() && VT == MVT::v32i16) ||
28216         (!Subtarget.hasAVX512() && VT == MVT::v16i8)))
28217     return SDValue();
28218 
28219   if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
28220     SmallVector<SDValue, 8> Elts;
28221     MVT SVT = VT.getVectorElementType();
28222     unsigned SVTBits = SVT.getSizeInBits();
28223     APInt One(SVTBits, 1);
28224     unsigned NumElems = VT.getVectorNumElements();
28225 
28226     for (unsigned i = 0; i != NumElems; ++i) {
28227       SDValue Op = Amt->getOperand(i);
28228       if (Op->isUndef()) {
28229         Elts.push_back(Op);
28230         continue;
28231       }
28232 
28233       ConstantSDNode *ND = cast<ConstantSDNode>(Op);
28234       APInt C(SVTBits, ND->getZExtValue());
28235       uint64_t ShAmt = C.getZExtValue();
28236       if (ShAmt >= SVTBits) {
28237         Elts.push_back(DAG.getUNDEF(SVT));
28238         continue;
28239       }
28240       Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
28241     }
28242     return DAG.getBuildVector(VT, dl, Elts);
28243   }
28244 
28245   // If the target doesn't support variable shifts, use either FP conversion
28246   // or integer multiplication to avoid shifting each element individually.
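  //
  // The v4i32 case below relies on IEEE-754 layout (illustrative): 1.0f has
  // bit pattern 0x3f800000, so adding (Amt << 23) bumps the 8-bit exponent
  // field by Amt and the float's value becomes exactly 2^Amt (for shift
  // amounts that keep the scale in range); FP_TO_SINT then recovers the
  // integer scale factor.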
28247   if (VT == MVT::v4i32) {
28248     Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
28249     Amt = DAG.getNode(ISD::ADD, dl, VT, Amt,
28250                       DAG.getConstant(0x3f800000U, dl, VT));
28251     Amt = DAG.getBitcast(MVT::v4f32, Amt);
28252     return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt);
28253   }
28254 
28255   // AVX2 can more effectively perform this as a zext/trunc to/from v8i32.
28256   if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
28257     SDValue Z = DAG.getConstant(0, dl, VT);
28258     SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z));
28259     SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z));
28260     Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG);
28261     Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
28262     if (Subtarget.hasSSE41())
28263       return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
28264 
28265     return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, Lo),
28266                                         DAG.getBitcast(VT, Hi),
28267                                         {0, 2, 4, 6, 8, 10, 12, 14});
28268   }
28269 
28270   return SDValue();
28271 }
28272 
28273 static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
28274                           SelectionDAG &DAG) {
28275   MVT VT = Op.getSimpleValueType();
28276   SDLoc dl(Op);
28277   SDValue R = Op.getOperand(0);
28278   SDValue Amt = Op.getOperand(1);
28279   unsigned EltSizeInBits = VT.getScalarSizeInBits();
28280   bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
28281 
28282   unsigned Opc = Op.getOpcode();
28283   unsigned X86OpcV = getTargetVShiftUniformOpcode(Opc, true);
28284   unsigned X86OpcI = getTargetVShiftUniformOpcode(Opc, false);
28285 
28286   assert(VT.isVector() && "Custom lowering only for vector shifts!");
28287   assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
28288 
28289   if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
28290     return V;
28291 
28292   if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
28293     return V;
28294 
28295   if (SupportedVectorVarShift(VT, Subtarget, Opc))
28296     return Op;
28297 
28298   // XOP has 128-bit variable logical/arithmetic shifts.
28299   // +ve/-ve Amt = shift left/right.
28300   if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||
28301                              VT == MVT::v8i16 || VT == MVT::v16i8)) {
28302     if (Opc == ISD::SRL || Opc == ISD::SRA) {
28303       SDValue Zero = DAG.getConstant(0, dl, VT);
28304       Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
28305     }
28306     if (Opc == ISD::SHL || Opc == ISD::SRL)
28307       return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
28308     if (Opc == ISD::SRA)
28309       return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
28310   }
28311 
28312   // v2i64 vector logical shifts can efficiently avoid scalarization - do the
28313   // shifts per-lane and then shuffle the partial results back together.
28314   if (VT == MVT::v2i64 && Opc != ISD::SRA) {
28315     // Splat the shift amounts so the scalar shifts above will catch it.
28316     SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
28317     SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
28318     SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
28319     SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
28320     return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
28321   }
28322 
28323   // i64 vector arithmetic shift can be emulated with the transform:
28324   // M = lshr(SIGN_MASK, Amt)
28325   // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
28326   if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
28327       Opc == ISD::SRA) {
28328     SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
28329     SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
28330     R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
28331     R = DAG.getNode(ISD::XOR, dl, VT, R, M);
28332     R = DAG.getNode(ISD::SUB, dl, VT, R, M);
28333     return R;
28334   }
28335 
28336   // If possible, lower this shift as a sequence of two shifts by
28337   // constant plus a BLENDing shuffle instead of scalarizing it.
28338   // Example:
28339   //   (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
28340   //
28341   // Could be rewritten as:
28342   //   (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
28343   //
28344   // The advantage is that the two shifts from the example would be
28345   // lowered as X86ISD::VSRLI nodes in parallel before blending.
28346   if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
28347                       (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
28348     SDValue Amt1, Amt2;
28349     unsigned NumElts = VT.getVectorNumElements();
28350     SmallVector<int, 8> ShuffleMask;
28351     for (unsigned i = 0; i != NumElts; ++i) {
28352       SDValue A = Amt->getOperand(i);
28353       if (A.isUndef()) {
28354         ShuffleMask.push_back(SM_SentinelUndef);
28355         continue;
28356       }
28357       if (!Amt1 || Amt1 == A) {
28358         ShuffleMask.push_back(i);
28359         Amt1 = A;
28360         continue;
28361       }
28362       if (!Amt2 || Amt2 == A) {
28363         ShuffleMask.push_back(i + NumElts);
28364         Amt2 = A;
28365         continue;
28366       }
28367       break;
28368     }
28369 
28370     // Only perform this blend if we can perform it without loading a mask.
28371     if (ShuffleMask.size() == NumElts && Amt1 && Amt2 &&
28372         (VT != MVT::v16i16 ||
28373          is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
28374         (VT == MVT::v4i32 || Subtarget.hasSSE41() || Opc != ISD::SHL ||
28375          canWidenShuffleElements(ShuffleMask))) {
28376       auto *Cst1 = dyn_cast<ConstantSDNode>(Amt1);
28377       auto *Cst2 = dyn_cast<ConstantSDNode>(Amt2);
28378       if (Cst1 && Cst2 && Cst1->getAPIntValue().ult(EltSizeInBits) &&
28379           Cst2->getAPIntValue().ult(EltSizeInBits)) {
28380         SDValue Shift1 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
28381                                                     Cst1->getZExtValue(), DAG);
28382         SDValue Shift2 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
28383                                                     Cst2->getZExtValue(), DAG);
28384         return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
28385       }
28386     }
28387   }
28388 
28389   // If possible, lower this packed shift into a vector multiply instead of
28390   // expanding it into a sequence of scalar shifts.
28391   if (Opc == ISD::SHL)
28392     if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
28393       return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
28394 
28395   // Constant ISD::SRL can be performed efficiently on vXi16 vectors as we
28396   // can replace with ISD::MULHU, creating scale factor from (NumEltBits - Amt).
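  // For example, for a 16-bit lane with Amt = 3 the scale is 1 << (16 - 3) =
  // 0x2000, and mulhu(x, 0x2000) = (x * 0x2000) >> 16 = x >> 3. Amt = 0 would
  // need a scale of 1 << 16, which doesn't fit in i16, hence the select of R
  // for zero shift amounts below.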
28397   if (Opc == ISD::SRL && ConstantAmt &&
28398       (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
28399     SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
28400     SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
28401     if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
28402       SDValue Zero = DAG.getConstant(0, dl, VT);
28403       SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ);
28404       SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale);
28405       return DAG.getSelect(dl, VT, ZAmt, R, Res);
28406     }
28407   }
28408 
28409   // Constant ISD::SRA can be performed efficiently on vXi16 vectors as we
28410   // can replace with ISD::MULHS, creating scale factor from (NumEltBits - Amt).
28411   // TODO: Special case handling for shift by 0/1, really we can afford either
28412   // of these cases in pre-SSE41/XOP/AVX512 but not both.
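  // For example, for a 16-bit lane with Amt = 3, mulhs(x, 1 << 13) recovers
  // ashr(x, 3). Amt = 1 is special: the scale 1 << 15 is negative as an i16,
  // so MULHS would negate the result, hence the explicit VSRAI-by-1 select.
  // Amt = 0 is handled by selecting R directly, as in the SRL case above.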
28413   if (Opc == ISD::SRA && ConstantAmt &&
28414       (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256())) &&
28415       ((Subtarget.hasSSE41() && !Subtarget.hasXOP() &&
28416         !Subtarget.hasAVX512()) ||
28417        DAG.isKnownNeverZero(Amt))) {
28418     SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
28419     SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
28420     if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
28421       SDValue Amt0 =
28422           DAG.getSetCC(dl, VT, Amt, DAG.getConstant(0, dl, VT), ISD::SETEQ);
28423       SDValue Amt1 =
28424           DAG.getSetCC(dl, VT, Amt, DAG.getConstant(1, dl, VT), ISD::SETEQ);
28425       SDValue Sra1 =
28426           getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, 1, DAG);
28427       SDValue Res = DAG.getNode(ISD::MULHS, dl, VT, R, Scale);
28428       Res = DAG.getSelect(dl, VT, Amt0, R, Res);
28429       return DAG.getSelect(dl, VT, Amt1, Sra1, Res);
28430     }
28431   }
28432 
28433   // v4i32 non-uniform shifts.
28434   // If the shift amount is constant we can shift each lane using the SSE2
28435   // immediate shifts, else we need to zero-extend each lane to the lower i64
28436   // and shift using the SSE2 variable shifts.
28437   // The separate results can then be blended together.
28438   if (VT == MVT::v4i32) {
28439     SDValue Amt0, Amt1, Amt2, Amt3;
28440     if (ConstantAmt) {
28441       Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
28442       Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
28443       Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
28444       Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
28445     } else {
28446       // The SSE2 shifts use the lower i64 as the same shift amount for
28447       // all lanes and the upper i64 is ignored. On AVX we're better off
28448       // just zero-extending, but for SSE just duplicating the top 16-bits is
28449       // cheaper and has the same effect for out of range values.
28450       if (Subtarget.hasAVX()) {
28451         SDValue Z = DAG.getConstant(0, dl, VT);
28452         Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
28453         Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
28454         Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
28455         Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
28456       } else {
28457         SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
28458         SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
28459                                              {4, 5, 6, 7, -1, -1, -1, -1});
28460         Amt0 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
28461                                     {0, 1, 1, 1, -1, -1, -1, -1});
28462         Amt1 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
28463                                     {2, 3, 3, 3, -1, -1, -1, -1});
28464         Amt2 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
28465                                     {0, 1, 1, 1, -1, -1, -1, -1});
28466         Amt3 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
28467                                     {2, 3, 3, 3, -1, -1, -1, -1});
28468       }
28469     }
28470 
28471     unsigned ShOpc = ConstantAmt ? Opc : X86OpcV;
28472     SDValue R0 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt0));
28473     SDValue R1 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt1));
28474     SDValue R2 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt2));
28475     SDValue R3 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt3));
28476 
28477     // Merge the shifted lane results optimally with/without PBLENDW.
28478     // TODO - ideally shuffle combining would handle this.
28479     if (Subtarget.hasSSE41()) {
28480       SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
28481       SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
28482       return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
28483     }
28484     SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
28485     SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
28486     return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
28487   }
28488 
28489   // It's worth extending once and using the vXi16/vXi32 shifts for smaller
28490   // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
28491   // make the existing SSE solution better.
28492   // NOTE: We honor the preferred vector width before promoting to 512-bits.
28493   if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
28494       (Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||
28495       (Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||
28496       (Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||
28497       (Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
28498     assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
28499            "Unexpected vector type");
28500     MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
28501     MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
28502     unsigned ExtOpc = Opc == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
28503     R = DAG.getNode(ExtOpc, dl, ExtVT, R);
28504     Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
28505     return DAG.getNode(ISD::TRUNCATE, dl, VT,
28506                        DAG.getNode(Opc, dl, ExtVT, R, Amt));
28507   }
28508 
28509   // Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
28510   // extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
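  // For example, to shift a byte lane right by 3: extend the byte to i16
  // (zero-extend for SRL, sign-extend for SRA), multiply by 1 << (8 - 3) = 32,
  // and the high byte of the 16-bit product is the shifted result, which the
  // final logical shift right by 8 extracts before packing back to vXi8.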
28511   if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) &&
28512       (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
28513        (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
28514       !Subtarget.hasXOP()) {
28515     int NumElts = VT.getVectorNumElements();
28516     SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8);
28517 
28518     // Extend constant shift amount to vXi16 (it doesn't matter if the type
28519     // isn't legal).
28520     MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
28521     Amt = DAG.getZExtOrTrunc(Amt, dl, ExVT);
28522     Amt = DAG.getNode(ISD::SUB, dl, ExVT, DAG.getConstant(8, dl, ExVT), Amt);
28523     Amt = DAG.getNode(ISD::SHL, dl, ExVT, DAG.getConstant(1, dl, ExVT), Amt);
28524     assert(ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()) &&
28525            "Constant build vector expected");
28526 
28527     if (VT == MVT::v16i8 && Subtarget.hasInt256()) {
28528       R = Opc == ISD::SRA ? DAG.getSExtOrTrunc(R, dl, ExVT)
28529                           : DAG.getZExtOrTrunc(R, dl, ExVT);
28530       R = DAG.getNode(ISD::MUL, dl, ExVT, R, Amt);
28531       R = DAG.getNode(X86ISD::VSRLI, dl, ExVT, R, Cst8);
28532       return DAG.getZExtOrTrunc(R, dl, VT);
28533     }
28534 
28535     SmallVector<SDValue, 16> LoAmt, HiAmt;
28536     for (int i = 0; i != NumElts; i += 16) {
28537       for (int j = 0; j != 8; ++j) {
28538         LoAmt.push_back(Amt.getOperand(i + j));
28539         HiAmt.push_back(Amt.getOperand(i + j + 8));
28540       }
28541     }
28542 
28543     MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2);
28544     SDValue LoA = DAG.getBuildVector(VT16, dl, LoAmt);
28545     SDValue HiA = DAG.getBuildVector(VT16, dl, HiAmt);
28546 
28547     SDValue LoR = DAG.getBitcast(VT16, getUnpackl(DAG, dl, VT, R, R));
28548     SDValue HiR = DAG.getBitcast(VT16, getUnpackh(DAG, dl, VT, R, R));
28549     LoR = DAG.getNode(X86OpcI, dl, VT16, LoR, Cst8);
28550     HiR = DAG.getNode(X86OpcI, dl, VT16, HiR, Cst8);
28551     LoR = DAG.getNode(ISD::MUL, dl, VT16, LoR, LoA);
28552     HiR = DAG.getNode(ISD::MUL, dl, VT16, HiR, HiA);
28553     LoR = DAG.getNode(X86ISD::VSRLI, dl, VT16, LoR, Cst8);
28554     HiR = DAG.getNode(X86ISD::VSRLI, dl, VT16, HiR, Cst8);
28555     return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);
28556   }
28557 
28558   if (VT == MVT::v16i8 ||
28559       (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
28560       (VT == MVT::v64i8 && Subtarget.hasBWI())) {
28561     MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
28562 
28563     auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
28564       if (VT.is512BitVector()) {
28565         // On AVX512BW targets we make use of the fact that VSELECT lowers
28566         // to a masked blend which selects bytes based just on the sign bit
28567         // extracted to a mask.
28568         MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
28569         V0 = DAG.getBitcast(VT, V0);
28570         V1 = DAG.getBitcast(VT, V1);
28571         Sel = DAG.getBitcast(VT, Sel);
28572         Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
28573                            ISD::SETGT);
28574         return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
28575       } else if (Subtarget.hasSSE41()) {
28576         // On SSE41 targets we can use PBLENDVB which selects bytes based just
28577         // on the sign bit.
28578         V0 = DAG.getBitcast(VT, V0);
28579         V1 = DAG.getBitcast(VT, V1);
28580         Sel = DAG.getBitcast(VT, Sel);
28581         return DAG.getBitcast(SelVT,
28582                               DAG.getNode(X86ISD::BLENDV, dl, VT, Sel, V0, V1));
28583       }
28584       // On pre-SSE41 targets we test for the sign bit by comparing to
28585       // zero - a negative value will set all bits of the lanes to true
28586       // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
28587       SDValue Z = DAG.getConstant(0, dl, SelVT);
28588       SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
28589       return DAG.getSelect(dl, SelVT, C, V0, V1);
28590     };
28591 
28592     // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
28593     // We can safely do this using i16 shifts as we're only interested in
28594     // the 3 lower bits of each byte.
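    // For example, with Amt = 5 (0b101): after the shift by 5 the amount's
    // bit 2 sits in the byte's sign bit, so the first blend applies the
    // shift-by-4 step; doubling Amt then exposes bit 1 (clear, so shift-by-2
    // is skipped) and bit 0 (set, so shift-by-1 is applied), totalling 5.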
28595     Amt = DAG.getBitcast(ExtVT, Amt);
28596     Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, Amt, 5, DAG);
28597     Amt = DAG.getBitcast(VT, Amt);
28598 
28599     if (Opc == ISD::SHL || Opc == ISD::SRL) {
28600       // r = VSELECT(r, shift(r, 4), a);
28601       SDValue M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(4, dl, VT));
28602       R = SignBitSelect(VT, Amt, M, R);
28603 
28604       // a += a
28605       Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
28606 
28607       // r = VSELECT(r, shift(r, 2), a);
28608       M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(2, dl, VT));
28609       R = SignBitSelect(VT, Amt, M, R);
28610 
28611       // a += a
28612       Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
28613 
28614       // return VSELECT(r, shift(r, 1), a);
28615       M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(1, dl, VT));
28616       R = SignBitSelect(VT, Amt, M, R);
28617       return R;
28618     }
28619 
28620     if (Opc == ISD::SRA) {
28621       // For SRA we need to unpack each byte to the higher byte of a i16 vector
28622       // so we can correctly sign extend. We don't care what happens to the
28623       // lower byte.
28624       SDValue ALo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
28625       SDValue AHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
28626       SDValue RLo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), R);
28627       SDValue RHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), R);
28628       ALo = DAG.getBitcast(ExtVT, ALo);
28629       AHi = DAG.getBitcast(ExtVT, AHi);
28630       RLo = DAG.getBitcast(ExtVT, RLo);
28631       RHi = DAG.getBitcast(ExtVT, RHi);
28632 
28633       // r = VSELECT(r, shift(r, 4), a);
28634       SDValue MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 4, DAG);
28635       SDValue MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 4, DAG);
28636       RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
28637       RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
28638 
28639       // a += a
28640       ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
28641       AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
28642 
28643       // r = VSELECT(r, shift(r, 2), a);
28644       MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 2, DAG);
28645       MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 2, DAG);
28646       RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
28647       RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
28648 
28649       // a += a
28650       ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
28651       AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
28652 
28653       // r = VSELECT(r, shift(r, 1), a);
28654       MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 1, DAG);
28655       MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 1, DAG);
28656       RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
28657       RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
28658 
28659       // Logical shift the result back to the lower byte, leaving a zero upper
28660       // byte meaning that we can safely pack with PACKUSWB.
28661       RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RLo, 8, DAG);
28662       RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RHi, 8, DAG);
28663       return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
28664     }
28665   }
28666 
28667   if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
28668     MVT ExtVT = MVT::v8i32;
28669     SDValue Z = DAG.getConstant(0, dl, VT);
28670     SDValue ALo = getUnpackl(DAG, dl, VT, Amt, Z);
28671     SDValue AHi = getUnpackh(DAG, dl, VT, Amt, Z);
28672     SDValue RLo = getUnpackl(DAG, dl, VT, Z, R);
28673     SDValue RHi = getUnpackh(DAG, dl, VT, Z, R);
28674     ALo = DAG.getBitcast(ExtVT, ALo);
28675     AHi = DAG.getBitcast(ExtVT, AHi);
28676     RLo = DAG.getBitcast(ExtVT, RLo);
28677     RHi = DAG.getBitcast(ExtVT, RHi);
28678     SDValue Lo = DAG.getNode(Opc, dl, ExtVT, RLo, ALo);
28679     SDValue Hi = DAG.getNode(Opc, dl, ExtVT, RHi, AHi);
28680     Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Lo, 16, DAG);
28681     Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Hi, 16, DAG);
28682     return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
28683   }
28684 
28685   if (VT == MVT::v8i16) {
28686     // If we have a constant shift amount, the non-SSE41 path is best as
28687     // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
28688     bool UseSSE41 = Subtarget.hasSSE41() &&
28689                     !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
28690 
28691     auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
28692       // On SSE41 targets we can use PBLENDVB which selects bytes based just on
28693       // the sign bit.
28694       if (UseSSE41) {
28695         MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
28696         V0 = DAG.getBitcast(ExtVT, V0);
28697         V1 = DAG.getBitcast(ExtVT, V1);
28698         Sel = DAG.getBitcast(ExtVT, Sel);
28699         return DAG.getBitcast(
28700             VT, DAG.getNode(X86ISD::BLENDV, dl, ExtVT, Sel, V0, V1));
28701       }
28702       // On pre-SSE41 targets we splat the sign bit - a negative value will
28703       // set all bits of the lanes to true and VSELECT uses that in
28704       // its OR(AND(V0,C),AND(V1,~C)) lowering.
28705       SDValue C =
28706           getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Sel, 15, DAG);
28707       return DAG.getSelect(dl, VT, C, V0, V1);
28708     };
28709 
28710     // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
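    // A v8i16 shift amount needs 4 bits, so shifting left by 12 moves bit 3
    // into the i16 sign bit first; each later doubling exposes the next lower
    // bit. PBLENDVB selects per byte, so for SSE41 the mask is replicated into
    // both bytes of each i16 by OR'ing the <<4 and <<12 forms below.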
28711     if (UseSSE41) {
28712       // On SSE41 targets we need to replicate the shift mask in both
28713       // bytes for PBLENDVB.
28714       Amt = DAG.getNode(
28715           ISD::OR, dl, VT,
28716           getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 4, DAG),
28717           getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG));
28718     } else {
28719       Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG);
28720     }
28721 
28722     // r = VSELECT(r, shift(r, 8), a);
28723     SDValue M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 8, DAG);
28724     R = SignBitSelect(Amt, M, R);
28725 
28726     // a += a
28727     Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
28728 
28729     // r = VSELECT(r, shift(r, 4), a);
28730     M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 4, DAG);
28731     R = SignBitSelect(Amt, M, R);
28732 
28733     // a += a
28734     Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
28735 
28736     // r = VSELECT(r, shift(r, 2), a);
28737     M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 2, DAG);
28738     R = SignBitSelect(Amt, M, R);
28739 
28740     // a += a
28741     Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
28742 
28743     // return VSELECT(r, shift(r, 1), a);
28744     M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 1, DAG);
28745     R = SignBitSelect(Amt, M, R);
28746     return R;
28747   }
28748 
28749   // Decompose 256-bit shifts into 128-bit shifts.
28750   if (VT.is256BitVector())
28751     return splitVectorIntBinary(Op, DAG);
28752 
28753   if (VT == MVT::v32i16 || VT == MVT::v64i8)
28754     return splitVectorIntBinary(Op, DAG);
28755 
28756   return SDValue();
28757 }
28758 
28759 static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
28760                            SelectionDAG &DAG) {
28761   MVT VT = Op.getSimpleValueType();
28762   assert(VT.isVector() && "Custom lowering only for vector rotates!");
28763 
28764   SDLoc DL(Op);
28765   SDValue R = Op.getOperand(0);
28766   SDValue Amt = Op.getOperand(1);
28767   unsigned Opcode = Op.getOpcode();
28768   unsigned EltSizeInBits = VT.getScalarSizeInBits();
28769   int NumElts = VT.getVectorNumElements();
28770 
28771   // Check for constant splat rotation amount.
28772   APInt CstSplatValue;
28773   bool IsCstSplat = X86::isConstantSplat(Amt, CstSplatValue);
28774 
28775   // Check for splat rotate by zero.
28776   if (IsCstSplat && CstSplatValue.urem(EltSizeInBits) == 0)
28777     return R;
28778 
28779   // AVX512 implicitly uses modulo rotation amounts.
28780   if (Subtarget.hasAVX512() && 32 <= EltSizeInBits) {
28781     // Attempt to rotate by immediate.
28782     if (IsCstSplat) {
28783       unsigned RotOpc = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI);
28784       uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
28785       return DAG.getNode(RotOpc, DL, VT, R,
28786                          DAG.getTargetConstant(RotAmt, DL, MVT::i8));
28787     }
28788 
28789     // Else, fall-back on VPROLV/VPRORV.
28790     return Op;
28791   }
28792 
28793   // AVX512 VBMI2 vXi16 - lower to funnel shifts.
28794   if (Subtarget.hasVBMI2() && 16 == EltSizeInBits) {
28795     unsigned FunnelOpc = (Opcode == ISD::ROTL ? ISD::FSHL : ISD::FSHR);
28796     return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
28797   }
28798 
28799   assert((Opcode == ISD::ROTL) && "Only ROTL supported");
28800 
28801   // XOP has 128-bit vector variable + immediate rotates.
28802   // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
28803   // XOP implicitly uses modulo rotation amounts.
28804   if (Subtarget.hasXOP()) {
28805     if (VT.is256BitVector())
28806       return splitVectorIntBinary(Op, DAG);
28807     assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
28808 
28809     // Attempt to rotate by immediate.
28810     if (IsCstSplat) {
28811       uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
28812       return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
28813                          DAG.getTargetConstant(RotAmt, DL, MVT::i8));
28814     }
28815 
28816     // Use general rotate by variable (per-element).
28817     return Op;
28818   }
28819 
28820   // Split 256-bit integers on pre-AVX2 targets.
28821   if (VT.is256BitVector() && !Subtarget.hasAVX2())
28822     return splitVectorIntBinary(Op, DAG);
28823 
28824   assert((VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||
28825           ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8 ||
28826             VT == MVT::v32i16) &&
28827            Subtarget.hasAVX2())) &&
28828          "Only vXi32/vXi16/vXi8 vector rotates supported");
28829 
28830   // Rotate by a uniform constant - expand back to shifts.
28831   if (IsCstSplat)
28832     return SDValue();
28833 
28834   bool IsSplatAmt = DAG.isSplatValue(Amt);
28835 
28836   // v16i8/v32i8: Split rotation into rot4/rot2/rot1 stages and select by
28837   // the amount bit.
28838   if (EltSizeInBits == 8 && !IsSplatAmt) {
28839     if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()))
28840       return SDValue();
28841 
28842     // We don't need ModuloAmt here as we just peek at individual bits.
28843     MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
28844 
28845     auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
28846       if (Subtarget.hasSSE41()) {
28847         // On SSE41 targets we can use PBLENDVB which selects bytes based just
28848         // on the sign bit.
28849         V0 = DAG.getBitcast(VT, V0);
28850         V1 = DAG.getBitcast(VT, V1);
28851         Sel = DAG.getBitcast(VT, Sel);
28852         return DAG.getBitcast(SelVT,
28853                               DAG.getNode(X86ISD::BLENDV, DL, VT, Sel, V0, V1));
28854       }
28855       // On pre-SSE41 targets we test for the sign bit by comparing to
28856       // zero - a negative value will set all bits of the lanes to true
28857       // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
28858       SDValue Z = DAG.getConstant(0, DL, SelVT);
28859       SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel);
28860       return DAG.getSelect(DL, SelVT, C, V0, V1);
28861     };
28862 
28863     // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
28864     // We can safely do this using i16 shifts as we're only interested in
28865     // the 3 lower bits of each byte.
28866     Amt = DAG.getBitcast(ExtVT, Amt);
28867     Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT));
28868     Amt = DAG.getBitcast(VT, Amt);
28869 
28870     // r = VSELECT(r, rot(r, 4), a);
28871     SDValue M;
28872     M = DAG.getNode(
28873         ISD::OR, DL, VT,
28874         DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(4, DL, VT)),
28875         DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(4, DL, VT)));
28876     R = SignBitSelect(VT, Amt, M, R);
28877 
28878     // a += a
28879     Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
28880 
28881     // r = VSELECT(r, rot(r, 2), a);
28882     M = DAG.getNode(
28883         ISD::OR, DL, VT,
28884         DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(2, DL, VT)),
28885         DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(6, DL, VT)));
28886     R = SignBitSelect(VT, Amt, M, R);
28887 
28888     // a += a
28889     Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
28890 
28891     // return VSELECT(r, rot(r, 1), a);
28892     M = DAG.getNode(
28893         ISD::OR, DL, VT,
28894         DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(1, DL, VT)),
28895         DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(7, DL, VT)));
28896     return SignBitSelect(VT, Amt, M, R);
28897   }
28898 
28899   // ISD::ROT* uses modulo rotate amounts.
28900   Amt = DAG.getNode(ISD::AND, DL, VT, Amt,
28901                     DAG.getConstant(EltSizeInBits - 1, DL, VT));
28902 
28903   bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
28904   bool LegalVarShifts = SupportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
28905                         SupportedVectorVarShift(VT, Subtarget, ISD::SRL);
28906 
28907   // Fall back for splats + all supported variable shifts.
28908   // Also fall back for non-constant AVX2 vXi16.
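  // In these cases rotl(R, Amt) is simply expanded to
  // shl(R, Amt) | srl(R, EltSizeInBits - Amt), with Amt already reduced
  // modulo the element size above.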
28909   if (IsSplatAmt || LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) {
28910     SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
28911     AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
28912     SDValue SHL = DAG.getNode(ISD::SHL, DL, VT, R, Amt);
28913     SDValue SRL = DAG.getNode(ISD::SRL, DL, VT, R, AmtR);
28914     return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
28915   }
28916 
28917   // As with shifts, convert the rotation amount to a multiplication factor.
28918   SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
28919   assert(Scale && "Failed to convert ROTL amount to scale");
28920 
28921   // v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
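  // For example, rotl(x, 3) on an i16 lane becomes (x * 8) | mulhu(x, 8):
  // the low half of the product is x << 3 and the high half is x >> 13.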
28922   if (EltSizeInBits == 16) {
28923     SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale);
28924     SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale);
28925     return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
28926   }
28927 
28928   // v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32
28929   // to v2i64 results at a time. The upper 32-bits contain the wrapped bits
28930   // that can then be OR'd with the lower 32-bits.
28931   assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected");
28932   static const int OddMask[] = {1, -1, 3, -1};
28933   SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask);
28934   SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask);
28935 
28936   SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
28937                               DAG.getBitcast(MVT::v2i64, R),
28938                               DAG.getBitcast(MVT::v2i64, Scale));
28939   SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
28940                               DAG.getBitcast(MVT::v2i64, R13),
28941                               DAG.getBitcast(MVT::v2i64, Scale13));
28942   Res02 = DAG.getBitcast(VT, Res02);
28943   Res13 = DAG.getBitcast(VT, Res13);
28944 
28945   return DAG.getNode(ISD::OR, DL, VT,
28946                      DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}),
28947                      DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7}));
28948 }
28949 
28950 /// Returns true if the operand type is exactly twice the native width, and
28951 /// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
28952 /// Used to know whether to use cmpxchg8/16b when expanding atomic operations
28953 /// (otherwise we leave them alone to become __sync_fetch_and_... calls).
28954 bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
28955   unsigned OpWidth = MemType->getPrimitiveSizeInBits();
28956 
28957   if (OpWidth == 64)
28958     return Subtarget.hasCmpxchg8b() && !Subtarget.is64Bit();
28959   if (OpWidth == 128)
28960     return Subtarget.hasCmpxchg16b();
28961 
28962   return false;
28963 }
28964 
28965 bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
28966   Type *MemType = SI->getValueOperand()->getType();
28967 
28968   bool NoImplicitFloatOps =
28969       SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
28970   if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
28971       !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
28972       (Subtarget.hasSSE1() || Subtarget.hasX87()))
28973     return false;
28974 
28975   return needsCmpXchgNb(MemType);
28976 }
28977 
28978 // Note: this turns large loads into lock cmpxchg8b/16b.
28979 // TODO: In 32-bit mode, use MOVLPS when SSE1 is available?
28980 TargetLowering::AtomicExpansionKind
28981 X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
28982   Type *MemType = LI->getType();
28983 
28984   // If this is a 64-bit atomic load on a 32-bit target and SSE2 is enabled, we
28985   // can use movq to do the load. If we have X87 we can load into an 80-bit
28986   // X87 register and store it to a stack temporary.
28987   bool NoImplicitFloatOps =
28988       LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
28989   if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
28990       !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
28991       (Subtarget.hasSSE1() || Subtarget.hasX87()))
28992     return AtomicExpansionKind::None;
28993 
28994   return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
28995                                  : AtomicExpansionKind::None;
28996 }
28997 
28998 TargetLowering::AtomicExpansionKind
28999 X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
29000   unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
29001   Type *MemType = AI->getType();
29002 
29003   // If the operand is too big, we must see if cmpxchg8/16b is available
29004   // and default to library calls otherwise.
29005   if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
29006     return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
29007                                    : AtomicExpansionKind::None;
29008   }
29009 
29010   AtomicRMWInst::BinOp Op = AI->getOperation();
29011   switch (Op) {
29012   default:
29013     llvm_unreachable("Unknown atomic operation");
29014   case AtomicRMWInst::Xchg:
29015   case AtomicRMWInst::Add:
29016   case AtomicRMWInst::Sub:
29017     // It's better to use xadd, xsub or xchg for these in all cases.
29018     return AtomicExpansionKind::None;
29019   case AtomicRMWInst::Or:
29020   case AtomicRMWInst::And:
29021   case AtomicRMWInst::Xor:
29022     // If the atomicrmw's result isn't actually used, we can just add a "lock"
29023     // prefix to a normal instruction for these operations.
29024     return !AI->use_empty() ? AtomicExpansionKind::CmpXChg
29025                             : AtomicExpansionKind::None;
29026   case AtomicRMWInst::Nand:
29027   case AtomicRMWInst::Max:
29028   case AtomicRMWInst::Min:
29029   case AtomicRMWInst::UMax:
29030   case AtomicRMWInst::UMin:
29031   case AtomicRMWInst::FAdd:
29032   case AtomicRMWInst::FSub:
29033     // These always require a non-trivial set of data operations on x86. We must
29034     // use a cmpxchg loop.
29035     return AtomicExpansionKind::CmpXChg;
29036   }
29037 }
29038 
29039 LoadInst *
29040 X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
29041   unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
29042   Type *MemType = AI->getType();
29043   // Accesses larger than the native width are turned into cmpxchg/libcalls, so
29044   // there is no benefit in turning such RMWs into loads, and it is actually
29045   // harmful as it introduces a mfence.
29046   if (MemType->getPrimitiveSizeInBits() > NativeWidth)
29047     return nullptr;
29048 
29049   // If this is a canonical idempotent atomicrmw w/no uses, we have a better
29050   // lowering available in lowerAtomicArith.
29051   // TODO: push more cases through this path.
29052   if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
29053     if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
29054         AI->use_empty())
29055       return nullptr;
29056 
29057   IRBuilder<> Builder(AI);
29058   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
29059   auto SSID = AI->getSyncScopeID();
29060   // We must restrict the ordering to avoid generating loads with Release or
29061   // ReleaseAcquire orderings.
29062   auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
29063 
29064   // Before the load we need a fence. Here is an example lifted from
29065   // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
29066   // is required:
29067   // Thread 0:
29068   //   x.store(1, relaxed);
29069   //   r1 = y.fetch_add(0, release);
29070   // Thread 1:
29071   //   y.fetch_add(42, acquire);
29072   //   r2 = x.load(relaxed);
29073   // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
29074   // lowered to just a load without a fence. A mfence flushes the store buffer,
29075   // making the optimization clearly correct.
29076   // FIXME: it is required if isReleaseOrStronger(Order), but it is not clear
29077   // whether it is needed otherwise; we might be able to be more aggressive on
29078   // relaxed idempotent rmw. In practice, they do not look useful, so we don't
29079   // try to be especially clever.
29080   if (SSID == SyncScope::SingleThread)
29081     // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
29082     // the IR level, so we must wrap it in an intrinsic.
29083     return nullptr;
29084 
29085   if (!Subtarget.hasMFence())
29086     // FIXME: it might make sense to use a locked operation here but on a
29087     // different cache-line to prevent cache-line bouncing. In practice it
29088     // is probably a small win, and x86 processors without mfence are rare
29089     // enough that we do not bother.
29090     return nullptr;
29091 
29092   Function *MFence =
29093       llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
29094   Builder.CreateCall(MFence, {});
29095 
29096   // Finally we can emit the atomic load.
29097   LoadInst *Loaded = Builder.CreateAlignedLoad(
29098       AI->getType(), AI->getPointerOperand(), AI->getAlign());
29099   Loaded->setAtomic(Order, SSID);
29100   AI->replaceAllUsesWith(Loaded);
29101   AI->eraseFromParent();
29102   return Loaded;
29103 }
29104 
29105 bool X86TargetLowering::lowerAtomicStoreAsStoreSDNode(const StoreInst &SI) const {
29106   if (!SI.isUnordered())
29107     return false;
29108   return ExperimentalUnorderedISEL;
29109 }
29110 bool X86TargetLowering::lowerAtomicLoadAsLoadSDNode(const LoadInst &LI) const {
29111   if (!LI.isUnordered())
29112     return false;
29113   return ExperimentalUnorderedISEL;
29114 }
29115 
29116 
29117 /// Emit a locked operation on a stack location which does not change any
29118 /// memory location, but does involve a lock prefix.  Location is chosen to be
29119 /// a) very likely accessed only by a single thread to minimize cache traffic,
29120 /// and b) definitely dereferenceable.  Returns the new Chain result.
29121 static SDValue emitLockedStackOp(SelectionDAG &DAG,
29122                                  const X86Subtarget &Subtarget, SDValue Chain,
29123                                  const SDLoc &DL) {
29124   // Implementation notes:
29125   // 1) LOCK prefix creates a full read/write reordering barrier for memory
29126   // operations issued by the current processor.  As such, the location
29127   // referenced is not relevant for the ordering properties of the instruction.
29128   // See: Intel® 64 and IA-32 Architectures Software Developer’s Manual,
29129   // 8.2.3.9  Loads and Stores Are Not Reordered with Locked Instructions
29130   // 2) Using an immediate operand appears to be the best encoding choice
29131   // here since it doesn't require an extra register.
29132   // 3) OR appears to be very slightly faster than ADD. (Though, the difference
29133   // is small enough it might just be measurement noise.)
29134   // 4) When choosing offsets, there are several contributing factors:
29135   //   a) If there's no redzone, we default to TOS.  (We could allocate a cache
29136   //      line aligned stack object to improve this case.)
29137   //   b) To minimize our chances of introducing a false dependence, we prefer
29138   //      to offset the stack usage from TOS slightly.
29139   //   c) To minimize concerns about cross thread stack usage - in particular,
29140   //      the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which
29141   //      captures state in the TOS frame and accesses it from many threads -
29142   //      we want to use an offset such that the offset is in a distinct cache
29143   //      line from the TOS frame.
29144   //
29145   // For a general discussion of the tradeoffs and benchmark results, see:
29146   // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
29147 
29148   auto &MF = DAG.getMachineFunction();
29149   auto &TFL = *Subtarget.getFrameLowering();
29150   const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0;
29151 
29152   if (Subtarget.is64Bit()) {
29153     SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
29154     SDValue Ops[] = {
29155       DAG.getRegister(X86::RSP, MVT::i64),                  // Base
29156       DAG.getTargetConstant(1, DL, MVT::i8),                // Scale
29157       DAG.getRegister(0, MVT::i64),                         // Index
29158       DAG.getTargetConstant(SPOffset, DL, MVT::i32),        // Disp
29159       DAG.getRegister(0, MVT::i16),                         // Segment.
29160       Zero,
29161       Chain};
29162     SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
29163                                      MVT::Other, Ops);
29164     return SDValue(Res, 1);
29165   }
29166 
29167   SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
29168   SDValue Ops[] = {
29169     DAG.getRegister(X86::ESP, MVT::i32),            // Base
29170     DAG.getTargetConstant(1, DL, MVT::i8),          // Scale
29171     DAG.getRegister(0, MVT::i32),                   // Index
29172     DAG.getTargetConstant(SPOffset, DL, MVT::i32),  // Disp
29173     DAG.getRegister(0, MVT::i16),                   // Segment.
29174     Zero,
29175     Chain
29176   };
29177   SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
29178                                    MVT::Other, Ops);
29179   return SDValue(Res, 1);
29180 }
29181 
29182 static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
29183                                  SelectionDAG &DAG) {
29184   SDLoc dl(Op);
29185   AtomicOrdering FenceOrdering =
29186       static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
29187   SyncScope::ID FenceSSID =
29188       static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
29189 
29190   // The only fence that needs an instruction is a sequentially-consistent
29191   // cross-thread fence.
29192   if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
29193       FenceSSID == SyncScope::System) {
29194     if (Subtarget.hasMFence())
29195       return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
29196 
29197     SDValue Chain = Op.getOperand(0);
29198     return emitLockedStackOp(DAG, Subtarget, Chain, dl);
29199   }
29200 
29201   // MEMBARRIER is a compiler barrier; it codegens to a no-op.
29202   return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
29203 }
29204 
29205 static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
29206                              SelectionDAG &DAG) {
29207   MVT T = Op.getSimpleValueType();
29208   SDLoc DL(Op);
29209   unsigned Reg = 0;
29210   unsigned size = 0;
29211   switch(T.SimpleTy) {
29212   default: llvm_unreachable("Invalid value type!");
29213   case MVT::i8:  Reg = X86::AL;  size = 1; break;
29214   case MVT::i16: Reg = X86::AX;  size = 2; break;
29215   case MVT::i32: Reg = X86::EAX; size = 4; break;
29216   case MVT::i64:
29217     assert(Subtarget.is64Bit() && "Node not type legal!");
29218     Reg = X86::RAX; size = 8;
29219     break;
29220   }
29221   SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
29222                                   Op.getOperand(2), SDValue());
29223   SDValue Ops[] = { cpIn.getValue(0),
29224                     Op.getOperand(1),
29225                     Op.getOperand(3),
29226                     DAG.getTargetConstant(size, DL, MVT::i8),
29227                     cpIn.getValue(1) };
29228   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
29229   MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
29230   SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
29231                                            Ops, T, MMO);
29232 
29233   SDValue cpOut =
29234     DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
29235   SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
29236                                       MVT::i32, cpOut.getValue(2));
29237   SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
29238 
29239   return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
29240                      cpOut, Success, EFLAGS.getValue(1));
29241 }
29242 
29243 // Create MOVMSKB, taking into account whether we need to split for AVX1.
29244 static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
29245                            const X86Subtarget &Subtarget) {
29246   MVT InVT = V.getSimpleValueType();
29247 
29248   if (InVT == MVT::v64i8) {
29249     SDValue Lo, Hi;
29250     std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
29251     Lo = getPMOVMSKB(DL, Lo, DAG, Subtarget);
29252     Hi = getPMOVMSKB(DL, Hi, DAG, Subtarget);
29253     Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo);
29254     Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi);
29255     Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
29256                      DAG.getConstant(32, DL, MVT::i8));
29257     return DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi);
29258   }
29259   if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
29260     SDValue Lo, Hi;
29261     std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
29262     Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
29263     Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
29264     Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
29265                      DAG.getConstant(16, DL, MVT::i8));
29266     return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
29267   }
29268 
29269   return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
29270 }
29271 
29272 static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
29273                             SelectionDAG &DAG) {
29274   SDValue Src = Op.getOperand(0);
29275   MVT SrcVT = Src.getSimpleValueType();
29276   MVT DstVT = Op.getSimpleValueType();
29277 
29278   // Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
29279   // half to v32i1 and concatenating the result.
29280   if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
29281     assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
29282     assert(Subtarget.hasBWI() && "Expected BWI target");
29283     SDLoc dl(Op);
29284     SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
29285                              DAG.getIntPtrConstant(0, dl));
29286     Lo = DAG.getBitcast(MVT::v32i1, Lo);
29287     SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
29288                              DAG.getIntPtrConstant(1, dl));
29289     Hi = DAG.getBitcast(MVT::v32i1, Hi);
29290     return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
29291   }
29292 
29293   // Use MOVMSK for vector to scalar conversion to prevent scalarization.
29294   if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
29295     assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
29296     MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
29297     SDLoc DL(Op);
29298     SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
29299     V = getPMOVMSKB(DL, V, DAG, Subtarget);
29300     return DAG.getZExtOrTrunc(V, DL, DstVT);
29301   }
29302 
29303   assert((SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
29304           SrcVT == MVT::i64) && "Unexpected VT!");
29305 
29306   assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
29307   if (!(DstVT == MVT::f64 && SrcVT == MVT::i64) &&
29308       !(DstVT == MVT::x86mmx && SrcVT.isVector()))
29309     // This conversion needs to be expanded.
29310     return SDValue();
29311 
29312   SDLoc dl(Op);
29313   if (SrcVT.isVector()) {
29314     // Widen the input vector in the case of MVT::v2i32.
29315     // Example: from MVT::v2i32 to MVT::v4i32.
29316     MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(),
29317                                  SrcVT.getVectorNumElements() * 2);
29318     Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src,
29319                       DAG.getUNDEF(SrcVT));
29320   } else {
29321     assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
29322            "Unexpected source type in LowerBITCAST");
29323     Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
29324   }
29325 
29326   MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64;
29327   Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src);
29328 
29329   if (DstVT == MVT::x86mmx)
29330     return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src);
29331 
29332   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src,
29333                      DAG.getIntPtrConstant(0, dl));
29334 }
29335 
29336 /// Compute the horizontal sum of bytes in V for the elements of VT.
29337 ///
29338 /// Requires V to be a byte vector and VT to be an integer vector type with
29339 /// wider elements than V's type. The width of the elements of VT determines
29340 /// how many bytes of V are summed horizontally to produce each element of the
29341 /// result.
29342 static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
29343                                       const X86Subtarget &Subtarget,
29344                                       SelectionDAG &DAG) {
29345   SDLoc DL(V);
29346   MVT ByteVecVT = V.getSimpleValueType();
29347   MVT EltVT = VT.getVectorElementType();
29348   assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
29349          "Expected value to have byte element type.");
29350   assert(EltVT != MVT::i8 &&
29351          "Horizontal byte sum only makes sense for wider elements!");
29352   unsigned VecSize = VT.getSizeInBits();
29353   assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
29354 
29355   // The PSADBW instruction horizontally adds all bytes and leaves the result
29356   // in i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
29357   if (EltVT == MVT::i64) {
29358     SDValue Zeros = DAG.getConstant(0, DL, ByteVecVT);
29359     MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
29360     V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
29361     return DAG.getBitcast(VT, V);
29362   }
29363 
29364   if (EltVT == MVT::i32) {
29365     // We unpack the low half and high half into i32s interleaved with zeros so
29366     // that we can use PSADBW to horizontally sum them. The most useful part of
29367     // this is that it lines up the results of two PSADBW instructions to be
29368     // two v2i64 vectors which concatenated are the 4 population counts. We can
29369     // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
29370     SDValue Zeros = DAG.getConstant(0, DL, VT);
29371     SDValue V32 = DAG.getBitcast(VT, V);
29372     SDValue Low = getUnpackl(DAG, DL, VT, V32, Zeros);
29373     SDValue High = getUnpackh(DAG, DL, VT, V32, Zeros);
29374 
29375     // Do the horizontal sums into two v2i64s.
29376     Zeros = DAG.getConstant(0, DL, ByteVecVT);
29377     MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
29378     Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
29379                       DAG.getBitcast(ByteVecVT, Low), Zeros);
29380     High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
29381                        DAG.getBitcast(ByteVecVT, High), Zeros);
29382 
29383     // Merge them together.
29384     MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
29385     V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
29386                     DAG.getBitcast(ShortVecVT, Low),
29387                     DAG.getBitcast(ShortVecVT, High));
29388 
29389     return DAG.getBitcast(VT, V);
29390   }
29391 
29392   // The only element type left is i16.
29393   assert(EltVT == MVT::i16 && "Unknown how to handle type");
29394 
29395   // To obtain pop count for each i16 element starting from the pop count for
29396   // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
29397   // right by 8. It is important to shift as i16s as i8 vector shift isn't
29398   // directly supported.
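  // For example, an i16 lane holding byte pop counts [lo, hi] becomes [0, lo]
  // after the i16 shift left by 8; adding as i8s gives [lo, hi + lo], and the
  // final i16 logical shift right by 8 leaves lo + hi in the low byte.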
29399   SDValue ShifterV = DAG.getConstant(8, DL, VT);
29400   SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
29401   V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
29402                   DAG.getBitcast(ByteVecVT, V));
29403   return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
29404 }
29405 
29406 static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
29407                                         const X86Subtarget &Subtarget,
29408                                         SelectionDAG &DAG) {
29409   MVT VT = Op.getSimpleValueType();
29410   MVT EltVT = VT.getVectorElementType();
29411   int NumElts = VT.getVectorNumElements();
29412   (void)EltVT;
29413   assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported.");
29414 
29415   // Implement a lookup table in register by using an algorithm based on:
29416   // http://wm.ite.pl/articles/sse-popcount.html
29417   //
29418   // The general idea is that each nibble of every byte in the input vector is
29419   // an index into an in-register pre-computed pop count table. We split the
29420   // input vector into two new ones: (1) a vector with only the shifted-right
29421   // higher nibbles of each byte and (2) a vector with the lower nibbles (the
29422   // higher ones masked out) of each byte. PSHUFB is used separately with both
29423   // to index the in-register table. Next, both are added and the result is an
29424   // i8 vector where each element contains the pop count of the input byte.
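  // For example, the byte 0xB7 (0b10110111) indexes LUT[0xB] = 3 with its
  // high nibble and LUT[0x7] = 3 with its low nibble; the sum of the two,
  // 6, is the pop count of 0xB7.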
29425   const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
29426                        /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
29427                        /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
29428                        /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
29429 
29430   SmallVector<SDValue, 64> LUTVec;
29431   for (int i = 0; i < NumElts; ++i)
29432     LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
29433   SDValue InRegLUT = DAG.getBuildVector(VT, DL, LUTVec);
29434   SDValue M0F = DAG.getConstant(0x0F, DL, VT);
29435 
29436   // High nibbles
29437   SDValue FourV = DAG.getConstant(4, DL, VT);
29438   SDValue HiNibbles = DAG.getNode(ISD::SRL, DL, VT, Op, FourV);
29439 
29440   // Low nibbles
29441   SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, Op, M0F);
29442 
29443   // The input vector is used as the shuffle mask that index elements into the
29444   // LUT. After counting low and high nibbles, add the vector to obtain the
29445   // final pop count per i8 element.
29446   SDValue HiPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, HiNibbles);
29447   SDValue LoPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, LoNibbles);
29448   return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt);
29449 }
29450 
29451 // Please ensure that any codegen change from LowerVectorCTPOP is reflected in
29452 // updated cost models in X86TTIImpl::getIntrinsicInstrCost.
29453 static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
29454                                 SelectionDAG &DAG) {
29455   MVT VT = Op.getSimpleValueType();
29456   assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
29457          "Unknown CTPOP type to handle");
29458   SDLoc DL(Op.getNode());
29459   SDValue Op0 = Op.getOperand(0);
29460 
29461   // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
29462   if (Subtarget.hasVPOPCNTDQ()) {
29463     unsigned NumElems = VT.getVectorNumElements();
29464     assert((VT.getVectorElementType() == MVT::i8 ||
29465             VT.getVectorElementType() == MVT::i16) && "Unexpected type");
29466     if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
29467       MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
29468       Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
29469       Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
29470       return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
29471     }
29472   }
29473 
29474   // Decompose 256-bit ops into smaller 128-bit ops.
29475   if (VT.is256BitVector() && !Subtarget.hasInt256())
29476     return splitVectorIntUnary(Op, DAG);
29477 
29478   // Decompose 512-bit ops into smaller 256-bit ops.
29479   if (VT.is512BitVector() && !Subtarget.hasBWI())
29480     return splitVectorIntUnary(Op, DAG);
29481 
29482   // For element types greater than i8, do vXi8 pop counts and a bytesum.
29483   if (VT.getScalarType() != MVT::i8) {
29484     MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
29485     SDValue ByteOp = DAG.getBitcast(ByteVT, Op0);
29486     SDValue PopCnt8 = DAG.getNode(ISD::CTPOP, DL, ByteVT, ByteOp);
29487     return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG);
29488   }
29489 
29490   // We can't use the fast LUT approach, so fall back on LegalizeDAG.
29491   if (!Subtarget.hasSSSE3())
29492     return SDValue();
29493 
29494   return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
29495 }
29496 
29497 static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
29498                           SelectionDAG &DAG) {
29499   assert(Op.getSimpleValueType().isVector() &&
29500          "We only do custom lowering for vector population count.");
29501   return LowerVectorCTPOP(Op, Subtarget, DAG);
29502 }
29503 
29504 static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
29505   MVT VT = Op.getSimpleValueType();
29506   SDValue In = Op.getOperand(0);
29507   SDLoc DL(Op);
29508 
29509   // For scalars, it's still beneficial to transfer to/from the SIMD unit to
29510   // perform the BITREVERSE.
29511   if (!VT.isVector()) {
29512     MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
29513     SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
29514     Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
29515     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
29516                        DAG.getIntPtrConstant(0, DL));
29517   }
29518 
29519   int NumElts = VT.getVectorNumElements();
29520   int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
29521 
29522   // Decompose 256-bit ops into smaller 128-bit ops.
29523   if (VT.is256BitVector())
29524     return splitVectorIntUnary(Op, DAG);
29525 
29526   assert(VT.is128BitVector() &&
29527          "Only 128-bit vector bitreverse lowering supported.");
29528 
29529   // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
29530   // perform the BSWAP in the shuffle.
29531   // It's best to shuffle using the second operand as this will implicitly allow
29532   // memory folding for multiple vectors.
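  // E.g. for v4i32, the mask for element 0 selects source bytes 19,18,17,16
  // (a byte swap) and the op field in each mask byte bit-reverses the selected
  // byte, so together the shuffle reverses all 32 bits of the element.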
29533   SmallVector<SDValue, 16> MaskElts;
29534   for (int i = 0; i != NumElts; ++i) {
29535     for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
29536       int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
29537       int PermuteByte = SourceByte | (2 << 5);
29538       MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
29539     }
29540   }
29541 
29542   SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
29543   SDValue Res = DAG.getBitcast(MVT::v16i8, In);
29544   Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
29545                     Res, Mask);
29546   return DAG.getBitcast(VT, Res);
29547 }
29548 
29549 static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
29550                                SelectionDAG &DAG) {
29551   MVT VT = Op.getSimpleValueType();
29552 
29553   if (Subtarget.hasXOP() && !VT.is512BitVector())
29554     return LowerBITREVERSE_XOP(Op, DAG);
29555 
29556   assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
29557 
29558   SDValue In = Op.getOperand(0);
29559   SDLoc DL(Op);
29560 
29561   assert(VT.getScalarType() == MVT::i8 &&
29562          "Only byte vector BITREVERSE supported");
29563 
29564   // Split v64i8 without BWI so that we can still use the PSHUFB lowering.
29565   if (VT == MVT::v64i8 && !Subtarget.hasBWI())
29566     return splitVectorIntUnary(Op, DAG);
29567 
29568   // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
29569   if (VT == MVT::v32i8 && !Subtarget.hasInt256())
29570     return splitVectorIntUnary(Op, DAG);
29571 
29572   unsigned NumElts = VT.getVectorNumElements();
29573 
29574   // If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits.
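  // GF2P8AFFINEQB multiplies each byte by an 8x8 bit-matrix (packed into the
  // i64 constant below) and adds the immediate (0 here); with the matrix
  // 0x8040201008040201 the net effect is a byte-wise bit reversal in a single
  // instruction.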
29575   if (Subtarget.hasGFNI()) {
29576     MVT MatrixVT = MVT::getVectorVT(MVT::i64, NumElts / 8);
29577     SDValue Matrix = DAG.getConstant(0x8040201008040201ULL, DL, MatrixVT);
29578     Matrix = DAG.getBitcast(VT, Matrix);
29579     return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, In, Matrix,
29580                        DAG.getTargetConstant(0, DL, MVT::i8));
29581   }
29582 
29583   // Perform BITREVERSE using PSHUFB lookups. Each byte is split into its
29584   // two nibbles, and a PSHUFB lookup finds the bit-reverse of each
29585   // 0-15 value (shifted into the other nibble position).
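  // E.g. for the byte 0x1E, LoLUT[0xE] = 0x70 and HiLUT[0x1] = 0x08; OR-ing
  // them gives 0x78, which is 0x1E with its bits reversed.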
29586   SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
29587   SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
29588   SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
29589 
29590   const int LoLUT[16] = {
29591       /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
29592       /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
29593       /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
29594       /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
29595   const int HiLUT[16] = {
29596       /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
29597       /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
29598       /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
29599       /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
29600 
29601   SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
29602   for (unsigned i = 0; i < NumElts; ++i) {
29603     LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
29604     HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
29605   }
29606 
29607   SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
29608   SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
29609   Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
29610   Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
29611   return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
29612 }
29613 
29614 static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget,
29615                            SelectionDAG &DAG) {
29616   SDLoc DL(Op);
29617   SDValue X = Op.getOperand(0);
29618   MVT VT = Op.getSimpleValueType();
29619 
29620   // Special case. If the input fits in 8-bits we can use a single 8-bit TEST.
29621   if (VT == MVT::i8 ||
29622       DAG.MaskedValueIsZero(X, APInt::getBitsSetFrom(VT.getSizeInBits(), 8))) {
29623     X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
29624     SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, X,
29625                                 DAG.getConstant(0, DL, MVT::i8));
29626     // Copy the inverse of the parity flag into a register with setcc.
29627     SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
29628     // Extend to the original type.
29629     return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
29630   }
29631 
29632   if (VT == MVT::i64) {
29633     // Xor the high and low 32-bits together using a 32-bit operation.
29634     SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
29635                              DAG.getNode(ISD::SRL, DL, MVT::i64, X,
29636                                          DAG.getConstant(32, DL, MVT::i8)));
29637     SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
29638     X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
29639   }
29640 
29641   if (VT != MVT::i16) {
29642     // Xor the high and low 16-bits together using a 32-bit operation.
29643     SDValue Hi16 = DAG.getNode(ISD::SRL, DL, MVT::i32, X,
29644                                DAG.getConstant(16, DL, MVT::i8));
29645     X = DAG.getNode(ISD::XOR, DL, MVT::i32, X, Hi16);
29646   } else {
29647     // If the input is 16-bits, we need to extend to use an i32 shift below.
29648     X = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, X);
29649   }
29650 
29651   // Finally xor the low 2 bytes together and use an 8-bit flag-setting xor.
29652   // This should allow an h-reg to be used to save a shift.
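  // PF is computed from the low 8 bits of a result and is set for an even
  // number of 1 bits, so once every byte has been folded in with XOR, SETNP
  // recovers the 1-bit parity of the original value.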
29653   SDValue Hi = DAG.getNode(
29654       ISD::TRUNCATE, DL, MVT::i8,
29655       DAG.getNode(ISD::SRL, DL, MVT::i32, X, DAG.getConstant(8, DL, MVT::i8)));
29656   SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
29657   SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
29658   SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);
29659 
29660   // Copy the inverse of the parity flag into a register with setcc.
29661   SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
29662   // Extend to the original type.
29663   return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
29664 }
29665 
29666 static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
29667                                         const X86Subtarget &Subtarget) {
29668   unsigned NewOpc = 0;
29669   switch (N->getOpcode()) {
29670   case ISD::ATOMIC_LOAD_ADD:
29671     NewOpc = X86ISD::LADD;
29672     break;
29673   case ISD::ATOMIC_LOAD_SUB:
29674     NewOpc = X86ISD::LSUB;
29675     break;
29676   case ISD::ATOMIC_LOAD_OR:
29677     NewOpc = X86ISD::LOR;
29678     break;
29679   case ISD::ATOMIC_LOAD_XOR:
29680     NewOpc = X86ISD::LXOR;
29681     break;
29682   case ISD::ATOMIC_LOAD_AND:
29683     NewOpc = X86ISD::LAND;
29684     break;
29685   default:
29686     llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
29687   }
29688 
29689   MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
29690 
29691   return DAG.getMemIntrinsicNode(
29692       NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
29693       {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
29694       /*MemVT=*/N->getSimpleValueType(0), MMO);
29695 }
29696 
29697 /// Lower atomic_load_ops into LOCK-prefixed operations.
29698 static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
29699                                 const X86Subtarget &Subtarget) {
29700   AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
29701   SDValue Chain = N->getOperand(0);
29702   SDValue LHS = N->getOperand(1);
29703   SDValue RHS = N->getOperand(2);
29704   unsigned Opc = N->getOpcode();
29705   MVT VT = N->getSimpleValueType(0);
29706   SDLoc DL(N);
29707 
29708   // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
29709   // can only be lowered when the result is unused.  They should have already
29710   // been transformed into a cmpxchg loop in AtomicExpand.
29711   if (N->hasAnyUseOfValue(0)) {
29712     // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
29713     // select LXADD if LOCK_SUB can't be selected.
29714     if (Opc == ISD::ATOMIC_LOAD_SUB) {
29715       RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
29716       return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
29717                            RHS, AN->getMemOperand());
29718     }
29719     assert(Opc == ISD::ATOMIC_LOAD_ADD &&
29720            "Used AtomicRMW ops other than Add should have been expanded!");
29721     return N;
29722   }
29723 
29724   // Specialized lowering for the canonical form of an idempotent atomicrmw.
29725   // The core idea here is that since the memory location isn't actually
29726   // changing, all we need is a lowering for the *ordering* impacts of the
29727   // atomicrmw.  As such, we can choose a different operation and memory
29728   // location to minimize impact on other code.
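  // E.g. an "atomicrmw or" with a zero operand leaves memory unchanged, so
  // only its fence semantics need to be honored.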
29729   if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS)) {
29730     // On X86, the only ordering which actually requires an instruction is
29731     // seq_cst that isn't SingleThread; everything else just needs to be preserved
29732     // during codegen and then dropped. Note that we expect (but don't assume)
29733     // that orderings other than seq_cst and acq_rel have been canonicalized to
29734     // a store or load.
29735     if (AN->getOrdering() == AtomicOrdering::SequentiallyConsistent &&
29736         AN->getSyncScopeID() == SyncScope::System) {
29737       // Prefer a locked operation against a stack location to minimize cache
29738       // traffic.  This assumes that stack locations are very likely to be
29739       // accessed only by the owning thread.
29740       SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
29741       assert(!N->hasAnyUseOfValue(0));
29742       // NOTE: The getUNDEF is needed to give something for the unused result 0.
29743       return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
29744                          DAG.getUNDEF(VT), NewChain);
29745     }
29746     // MEMBARRIER is a compiler barrier; it codegens to a no-op.
29747     SDValue NewChain = DAG.getNode(X86ISD::MEMBARRIER, DL, MVT::Other, Chain);
29748     assert(!N->hasAnyUseOfValue(0));
29749     // NOTE: The getUNDEF is needed to give something for the unused result 0.
29750     return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
29751                        DAG.getUNDEF(VT), NewChain);
29752   }
29753 
29754   SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
29755   // RAUW the chain, but don't worry about the result, as it's unused.
29756   assert(!N->hasAnyUseOfValue(0));
29757   // NOTE: The getUNDEF is needed to give something for the unused result 0.
29758   return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
29759                      DAG.getUNDEF(VT), LockOp.getValue(1));
29760 }
29761 
29762 static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
29763                                  const X86Subtarget &Subtarget) {
29764   auto *Node = cast<AtomicSDNode>(Op.getNode());
29765   SDLoc dl(Node);
29766   EVT VT = Node->getMemoryVT();
29767 
29768   bool IsSeqCst = Node->getOrdering() == AtomicOrdering::SequentiallyConsistent;
29769   bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT);
29770 
29771   // If this store is not sequentially consistent and the type is legal
29772   // we can just keep it.
29773   if (!IsSeqCst && IsTypeLegal)
29774     return Op;
29775 
29776   if (VT == MVT::i64 && !IsTypeLegal) {
29777     // For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
29778     // is enabled.
29779     bool NoImplicitFloatOps =
29780         DAG.getMachineFunction().getFunction().hasFnAttribute(
29781             Attribute::NoImplicitFloat);
29782     if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
29783       SDValue Chain;
29784       if (Subtarget.hasSSE1()) {
29785         SDValue SclToVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
29786                                        Node->getOperand(2));
29787         MVT StVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
29788         SclToVec = DAG.getBitcast(StVT, SclToVec);
29789         SDVTList Tys = DAG.getVTList(MVT::Other);
29790         SDValue Ops[] = {Node->getChain(), SclToVec, Node->getBasePtr()};
29791         Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops,
29792                                         MVT::i64, Node->getMemOperand());
29793       } else if (Subtarget.hasX87()) {
29794         // First load this into an 80-bit X87 register using a stack temporary.
29795         // This will put the whole integer into the significand.
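        // FILD/FIST move all 64 bits with a single memory access, which is
        // what makes this store atomic on 32-bit targets without SSE2.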
29796         SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
29797         int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
29798         MachinePointerInfo MPI =
29799             MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
29800         Chain =
29801             DAG.getStore(Node->getChain(), dl, Node->getOperand(2), StackPtr,
29802                          MPI, MaybeAlign(), MachineMemOperand::MOStore);
29803         SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
29804         SDValue LdOps[] = {Chain, StackPtr};
29805         SDValue Value =
29806             DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, LdOps, MVT::i64, MPI,
29807                                     /*Align*/ None, MachineMemOperand::MOLoad);
29808         Chain = Value.getValue(1);
29809 
29810         // Now use an FIST to do the atomic store.
29811         SDValue StoreOps[] = {Chain, Value, Node->getBasePtr()};
29812         Chain =
29813             DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other),
29814                                     StoreOps, MVT::i64, Node->getMemOperand());
29815       }
29816 
29817       if (Chain) {
29818         // If this is a sequentially consistent store, also emit an appropriate
29819         // barrier.
29820         if (IsSeqCst)
29821           Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
29822 
29823         return Chain;
29824       }
29825     }
29826   }
29827 
29828   // Convert seq_cst store -> xchg
29829   // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
29830   // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
29831   SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
29832                                Node->getMemoryVT(),
29833                                Node->getOperand(0),
29834                                Node->getOperand(1), Node->getOperand(2),
29835                                Node->getMemOperand());
29836   return Swap.getValue(1);
29837 }
29838 
29839 static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
29840   SDNode *N = Op.getNode();
29841   MVT VT = N->getSimpleValueType(0);
29842   unsigned Opc = Op.getOpcode();
29843 
29844   // Let legalize expand this if it isn't a legal type yet.
29845   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
29846     return SDValue();
29847 
29848   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
29849   SDLoc DL(N);
29850 
29851   // Set the carry flag.
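  // Adding all-ones to the incoming carry value overflows exactly when that
  // value is nonzero, so CF ends up holding the boolean carry consumed by the
  // ADC/SBB below.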
29852   SDValue Carry = Op.getOperand(2);
29853   EVT CarryVT = Carry.getValueType();
29854   Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
29855                       Carry, DAG.getAllOnesConstant(DL, CarryVT));
29856 
29857   bool IsAdd = Opc == ISD::ADDCARRY || Opc == ISD::SADDO_CARRY;
29858   SDValue Sum = DAG.getNode(IsAdd ? X86ISD::ADC : X86ISD::SBB, DL, VTs,
29859                             Op.getOperand(0), Op.getOperand(1),
29860                             Carry.getValue(1));
29861 
29862   bool IsSigned = Opc == ISD::SADDO_CARRY || Opc == ISD::SSUBO_CARRY;
29863   SDValue SetCC = getSETCC(IsSigned ? X86::COND_O : X86::COND_B,
29864                            Sum.getValue(1), DL, DAG);
29865   if (N->getValueType(1) == MVT::i1)
29866     SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
29867 
29868   return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
29869 }
29870 
29871 static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
29872                             SelectionDAG &DAG) {
29873   assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
29874 
29875   // For MacOSX, we want to call an alternative entry point: __sincos_stret,
29876   // which returns the values as { float, float } (returned in XMM0) or
29877   // { double, double } (returned in XMM0, XMM1).
29878   SDLoc dl(Op);
29879   SDValue Arg = Op.getOperand(0);
29880   EVT ArgVT = Arg.getValueType();
29881   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
29882 
29883   TargetLowering::ArgListTy Args;
29884   TargetLowering::ArgListEntry Entry;
29885 
29886   Entry.Node = Arg;
29887   Entry.Ty = ArgTy;
29888   Entry.IsSExt = false;
29889   Entry.IsZExt = false;
29890   Args.push_back(Entry);
29891 
29892   bool isF64 = ArgVT == MVT::f64;
29893   // Only optimize x86_64 for now. i386 is a bit messy. For f32,
29894   // the small struct {f32, f32} is returned in (eax, edx). For f64,
29895   // the results are returned via SRet in memory.
29896   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29897   RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
29898   const char *LibcallName = TLI.getLibcallName(LC);
29899   SDValue Callee =
29900       DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
29901 
29902   Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
29903                       : (Type *)FixedVectorType::get(ArgTy, 4);
29904 
29905   TargetLowering::CallLoweringInfo CLI(DAG);
29906   CLI.setDebugLoc(dl)
29907       .setChain(DAG.getEntryNode())
29908       .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
29909 
29910   std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
29911 
29912   if (isF64)
29913     // Returned in xmm0 and xmm1.
29914     return CallResult.first;
29915 
29916   // Returned in bits 0:31 and 32:63 of xmm0.
29917   SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
29918                                CallResult.first, DAG.getIntPtrConstant(0, dl));
29919   SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
29920                                CallResult.first, DAG.getIntPtrConstant(1, dl));
29921   SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
29922   return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
29923 }
29924 
29925 /// Widen a vector input to a vector of NVT.  The
29926 /// input vector must have the same element type as NVT.
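/// For example, widening a v2i32 value to v8i32 keeps elements 0 and 1 and
/// fills the remaining lanes with undef (or zeroes when FillWithZeroes is set).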
29927 static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
29928                             bool FillWithZeroes = false) {
29929   // Check if InOp already has the right width.
29930   MVT InVT = InOp.getSimpleValueType();
29931   if (InVT == NVT)
29932     return InOp;
29933 
29934   if (InOp.isUndef())
29935     return DAG.getUNDEF(NVT);
29936 
29937   assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
29938          "input and widen element type must match");
29939 
29940   unsigned InNumElts = InVT.getVectorNumElements();
29941   unsigned WidenNumElts = NVT.getVectorNumElements();
29942   assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
29943          "Unexpected request for vector widening");
29944 
29945   SDLoc dl(InOp);
29946   if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
29947       InOp.getNumOperands() == 2) {
29948     SDValue N1 = InOp.getOperand(1);
29949     if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
29950         N1.isUndef()) {
29951       InOp = InOp.getOperand(0);
29952       InVT = InOp.getSimpleValueType();
29953       InNumElts = InVT.getVectorNumElements();
29954     }
29955   }
29956   if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
29957       ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
29958     SmallVector<SDValue, 16> Ops;
29959     for (unsigned i = 0; i < InNumElts; ++i)
29960       Ops.push_back(InOp.getOperand(i));
29961 
29962     EVT EltVT = InOp.getOperand(0).getValueType();
29963 
29964     SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
29965       DAG.getUNDEF(EltVT);
29966     for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
29967       Ops.push_back(FillVal);
29968     return DAG.getBuildVector(NVT, dl, Ops);
29969   }
29970   SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
29971     DAG.getUNDEF(NVT);
29972   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
29973                      InOp, DAG.getIntPtrConstant(0, dl));
29974 }
29975 
29976 static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
29977                              SelectionDAG &DAG) {
29978   assert(Subtarget.hasAVX512() &&
29979          "MGATHER/MSCATTER are supported on AVX-512 arch only");
29980 
29981   MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
29982   SDValue Src = N->getValue();
29983   MVT VT = Src.getSimpleValueType();
29984   assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
29985   SDLoc dl(Op);
29986 
29987   SDValue Scale = N->getScale();
29988   SDValue Index = N->getIndex();
29989   SDValue Mask = N->getMask();
29990   SDValue Chain = N->getChain();
29991   SDValue BasePtr = N->getBasePtr();
29992 
29993   if (VT == MVT::v2f32 || VT == MVT::v2i32) {
29994     assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
29995     // If the index is v2i64 and we have VLX we can use xmm for data and index.
29996     if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
29997       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29998       EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
29999       Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT));
30000       SDVTList VTs = DAG.getVTList(MVT::Other);
30001       SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
30002       return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
30003                                      N->getMemoryVT(), N->getMemOperand());
30004     }
30005     return SDValue();
30006   }
30007 
30008   MVT IndexVT = Index.getSimpleValueType();
30009 
30010   // If the index is v2i32, we're being called by type legalization and we
30011   // should just let the default handling take care of it.
30012   if (IndexVT == MVT::v2i32)
30013     return SDValue();
30014 
30015   // If we don't have VLX and neither the passthru nor the index is 512-bits, we
30016   // need to widen until one is.
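  // E.g. v4i32 data with a v4i64 index widens by a factor of two to v8i32 data
  // and a v8i64 index, with the extra mask lanes forced to zero so the new
  // elements never store.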
30017   if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
30018       !Index.getSimpleValueType().is512BitVector()) {
30019     // Determine how much we need to widen by to get a 512-bit type.
30020     unsigned Factor = std::min(512/VT.getSizeInBits(),
30021                                512/IndexVT.getSizeInBits());
30022     unsigned NumElts = VT.getVectorNumElements() * Factor;
30023 
30024     VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
30025     IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
30026     MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
30027 
30028     Src = ExtendToType(Src, VT, DAG);
30029     Index = ExtendToType(Index, IndexVT, DAG);
30030     Mask = ExtendToType(Mask, MaskVT, DAG, true);
30031   }
30032 
30033   SDVTList VTs = DAG.getVTList(MVT::Other);
30034   SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
30035   return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
30036                                  N->getMemoryVT(), N->getMemOperand());
30037 }
30038 
30039 static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
30040                           SelectionDAG &DAG) {
30041 
30042   MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
30043   MVT VT = Op.getSimpleValueType();
30044   MVT ScalarVT = VT.getScalarType();
30045   SDValue Mask = N->getMask();
30046   MVT MaskVT = Mask.getSimpleValueType();
30047   SDValue PassThru = N->getPassThru();
30048   SDLoc dl(Op);
30049 
30050   // Handle AVX masked loads which don't support passthru other than 0.
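  // AVX VMASKMOV loads zero the disabled lanes and take no passthru operand,
  // so anything other than a zero/undef passthru has to be blended back in
  // with a select after the load.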
30051   if (MaskVT.getVectorElementType() != MVT::i1) {
30052     // We also allow undef in the isel pattern.
30053     if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
30054       return Op;
30055 
30056     SDValue NewLoad = DAG.getMaskedLoad(
30057         VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
30058         getZeroVector(VT, Subtarget, DAG, dl), N->getMemoryVT(),
30059         N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(),
30060         N->isExpandingLoad());
30061     // Emit a blend.
30062     SDValue Select = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
30063     return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl);
30064   }
30065 
30066   assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
30067          "Expanding masked load is supported on AVX-512 target only!");
30068 
30069   assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
30070          "Expanding masked load is supported for 32 and 64-bit types only!");
30071 
30072   assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
30073          "Cannot lower masked load op.");
30074 
30075   assert((ScalarVT.getSizeInBits() >= 32 ||
30076           (Subtarget.hasBWI() &&
30077               (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
30078          "Unsupported masked load op.");
30079 
30080   // This operation is legal for targets with VLX, but without
30081   // VLX the vector should be widened to 512 bits.
30082   unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
30083   MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
30084   PassThru = ExtendToType(PassThru, WideDataVT, DAG);
30085 
30086   // Mask element has to be i1.
30087   assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
30088          "Unexpected mask type");
30089 
30090   MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
30091 
30092   Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
30093   SDValue NewLoad = DAG.getMaskedLoad(
30094       WideDataVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
30095       PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
30096       N->getExtensionType(), N->isExpandingLoad());
30097 
30098   SDValue Extract =
30099       DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewLoad.getValue(0),
30100                   DAG.getIntPtrConstant(0, dl));
30101   SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
30102   return DAG.getMergeValues(RetOps, dl);
30103 }
30104 
30105 static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
30106                            SelectionDAG &DAG) {
30107   MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
30108   SDValue DataToStore = N->getValue();
30109   MVT VT = DataToStore.getSimpleValueType();
30110   MVT ScalarVT = VT.getScalarType();
30111   SDValue Mask = N->getMask();
30112   SDLoc dl(Op);
30113 
30114   assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
30115          "Expanding masked load is supported on AVX-512 target only!");
30116 
30117   assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
30118          "Expanding masked load is supported for 32 and 64-bit types only!");
30119 
30120   assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
30121          "Cannot lower masked store op.");
30122 
30123   assert((ScalarVT.getSizeInBits() >= 32 ||
30124           (Subtarget.hasBWI() &&
30125               (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
30126           "Unsupported masked store op.");
30127 
30128   // This operation is legal for targets with VLX, but without
30129   // VLX the vector should be widened to 512 bits.
30130   unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
30131   MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
30132 
30133   // Mask element has to be i1.
30134   assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
30135          "Unexpected mask type");
30136 
30137   MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
30138 
30139   DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
30140   Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
30141   return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
30142                             N->getOffset(), Mask, N->getMemoryVT(),
30143                             N->getMemOperand(), N->getAddressingMode(),
30144                             N->isTruncatingStore(), N->isCompressingStore());
30145 }
30146 
30147 static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
30148                             SelectionDAG &DAG) {
30149   assert(Subtarget.hasAVX2() &&
30150          "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
30151 
30152   MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
30153   SDLoc dl(Op);
30154   MVT VT = Op.getSimpleValueType();
30155   SDValue Index = N->getIndex();
30156   SDValue Mask = N->getMask();
30157   SDValue PassThru = N->getPassThru();
30158   MVT IndexVT = Index.getSimpleValueType();
30159 
30160   assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
30161 
30162   // If the index is v2i32, we're being called by type legalization.
30163   if (IndexVT == MVT::v2i32)
30164     return SDValue();
30165 
30166   // If we don't have VLX and neither the passthru nor the index is 512-bits, we
30167   // need to widen until one is.
30168   MVT OrigVT = VT;
30169   if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
30170       !IndexVT.is512BitVector()) {
30171     // Determine how much we need to widen by to get a 512-bit type.
30172     unsigned Factor = std::min(512/VT.getSizeInBits(),
30173                                512/IndexVT.getSizeInBits());
30174 
30175     unsigned NumElts = VT.getVectorNumElements() * Factor;
30176 
30177     VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
30178     IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
30179     MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
30180 
30181     PassThru = ExtendToType(PassThru, VT, DAG);
30182     Index = ExtendToType(Index, IndexVT, DAG);
30183     Mask = ExtendToType(Mask, MaskVT, DAG, true);
30184   }
30185 
30186   SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
30187                     N->getScale() };
30188   SDValue NewGather = DAG.getMemIntrinsicNode(
30189       X86ISD::MGATHER, dl, DAG.getVTList(VT, MVT::Other), Ops, N->getMemoryVT(),
30190       N->getMemOperand());
30191   SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT,
30192                                 NewGather, DAG.getIntPtrConstant(0, dl));
30193   return DAG.getMergeValues({Extract, NewGather.getValue(1)}, dl);
30194 }
30195 
30196 static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
30197   SDLoc dl(Op);
30198   SDValue Src = Op.getOperand(0);
30199   MVT DstVT = Op.getSimpleValueType();
30200 
30201   AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode());
30202   unsigned SrcAS = N->getSrcAddressSpace();
30203 
30204   assert(SrcAS != N->getDestAddressSpace() &&
30205          "addrspacecast must be between different address spaces");
30206 
30207   if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64) {
30208     Op = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src);
30209   } else if (DstVT == MVT::i64) {
30210     Op = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src);
30211   } else if (DstVT == MVT::i32) {
30212     Op = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src);
30213   } else {
30214     report_fatal_error("Bad address space in addrspacecast");
30215   }
30216   return Op;
30217 }
30218 
30219 SDValue X86TargetLowering::LowerGC_TRANSITION(SDValue Op,
30220                                               SelectionDAG &DAG) const {
30221   // TODO: Eventually, the lowering of these nodes should be informed by or
30222   // deferred to the GC strategy for the function in which they appear. For
30223   // now, however, they must be lowered to something. Since they are logically
30224   // no-ops in the case of a null GC strategy (or a GC strategy which does not
30225   // require special handling for these nodes), lower them as literal NOOPs for
30226   // the time being.
30227   SmallVector<SDValue, 2> Ops;
30228 
30229   Ops.push_back(Op.getOperand(0));
30230   if (Op->getGluedNode())
30231     Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
30232 
30233   SDLoc OpDL(Op);
30234   SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
30235   SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
30236 
30237   return NOOP;
30238 }
30239 
30240 // Custom split CVTPS2PH with wide types.
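// E.g. a CVTPS2PH whose source is wider than the legal conversion width is
// split in half, each half is converted with the same rounding-control
// operand, and the two results are concatenated back together.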
30241 static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG) {
30242   SDLoc dl(Op);
30243   EVT VT = Op.getValueType();
30244   SDValue Lo, Hi;
30245   std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
30246   EVT LoVT, HiVT;
30247   std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
30248   SDValue RC = Op.getOperand(1);
30249   Lo = DAG.getNode(X86ISD::CVTPS2PH, dl, LoVT, Lo, RC);
30250   Hi = DAG.getNode(X86ISD::CVTPS2PH, dl, HiVT, Hi, RC);
30251   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
30252 }
30253 
30254 /// Provide custom lowering hooks for some operations.
30255 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
30256   switch (Op.getOpcode()) {
30257   default: llvm_unreachable("Should not custom lower this!");
30258   case ISD::ATOMIC_FENCE:       return LowerATOMIC_FENCE(Op, Subtarget, DAG);
30259   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
30260     return LowerCMP_SWAP(Op, Subtarget, DAG);
30261   case ISD::CTPOP:              return LowerCTPOP(Op, Subtarget, DAG);
30262   case ISD::ATOMIC_LOAD_ADD:
30263   case ISD::ATOMIC_LOAD_SUB:
30264   case ISD::ATOMIC_LOAD_OR:
30265   case ISD::ATOMIC_LOAD_XOR:
30266   case ISD::ATOMIC_LOAD_AND:    return lowerAtomicArith(Op, DAG, Subtarget);
30267   case ISD::ATOMIC_STORE:       return LowerATOMIC_STORE(Op, DAG, Subtarget);
30268   case ISD::BITREVERSE:         return LowerBITREVERSE(Op, Subtarget, DAG);
30269   case ISD::PARITY:             return LowerPARITY(Op, Subtarget, DAG);
30270   case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
30271   case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
30272   case ISD::VECTOR_SHUFFLE:     return lowerVECTOR_SHUFFLE(Op, Subtarget, DAG);
30273   case ISD::VSELECT:            return LowerVSELECT(Op, DAG);
30274   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
30275   case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
30276   case ISD::INSERT_SUBVECTOR:   return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
30277   case ISD::EXTRACT_SUBVECTOR:  return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
30278   case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
30279   case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
30280   case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
30281   case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
30282   case ISD::ExternalSymbol:     return LowerExternalSymbol(Op, DAG);
30283   case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
30284   case ISD::SHL_PARTS:
30285   case ISD::SRA_PARTS:
30286   case ISD::SRL_PARTS:          return LowerShiftParts(Op, DAG);
30287   case ISD::FSHL:
30288   case ISD::FSHR:               return LowerFunnelShift(Op, Subtarget, DAG);
30289   case ISD::STRICT_SINT_TO_FP:
30290   case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
30291   case ISD::STRICT_UINT_TO_FP:
30292   case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
30293   case ISD::TRUNCATE:           return LowerTRUNCATE(Op, DAG);
30294   case ISD::ZERO_EXTEND:        return LowerZERO_EXTEND(Op, Subtarget, DAG);
30295   case ISD::SIGN_EXTEND:        return LowerSIGN_EXTEND(Op, Subtarget, DAG);
30296   case ISD::ANY_EXTEND:         return LowerANY_EXTEND(Op, Subtarget, DAG);
30297   case ISD::ZERO_EXTEND_VECTOR_INREG:
30298   case ISD::SIGN_EXTEND_VECTOR_INREG:
30299     return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
30300   case ISD::FP_TO_SINT:
30301   case ISD::STRICT_FP_TO_SINT:
30302   case ISD::FP_TO_UINT:
30303   case ISD::STRICT_FP_TO_UINT:  return LowerFP_TO_INT(Op, DAG);
30304   case ISD::FP_TO_SINT_SAT:
30305   case ISD::FP_TO_UINT_SAT:     return LowerFP_TO_INT_SAT(Op, DAG);
30306   case ISD::FP_EXTEND:
30307   case ISD::STRICT_FP_EXTEND:   return LowerFP_EXTEND(Op, DAG);
30308   case ISD::FP_ROUND:
30309   case ISD::STRICT_FP_ROUND:    return LowerFP_ROUND(Op, DAG);
30310   case ISD::FP16_TO_FP:
30311   case ISD::STRICT_FP16_TO_FP:  return LowerFP16_TO_FP(Op, DAG);
30312   case ISD::FP_TO_FP16:
30313   case ISD::STRICT_FP_TO_FP16:  return LowerFP_TO_FP16(Op, DAG);
30314   case ISD::LOAD:               return LowerLoad(Op, Subtarget, DAG);
30315   case ISD::STORE:              return LowerStore(Op, Subtarget, DAG);
30316   case ISD::FADD:
30317   case ISD::FSUB:               return lowerFaddFsub(Op, DAG);
30318   case ISD::FROUND:             return LowerFROUND(Op, DAG);
30319   case ISD::FABS:
30320   case ISD::FNEG:               return LowerFABSorFNEG(Op, DAG);
30321   case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
30322   case ISD::FGETSIGN:           return LowerFGETSIGN(Op, DAG);
30323   case ISD::LRINT:
30324   case ISD::LLRINT:             return LowerLRINT_LLRINT(Op, DAG);
30325   case ISD::SETCC:
30326   case ISD::STRICT_FSETCC:
30327   case ISD::STRICT_FSETCCS:     return LowerSETCC(Op, DAG);
30328   case ISD::SETCCCARRY:         return LowerSETCCCARRY(Op, DAG);
30329   case ISD::SELECT:             return LowerSELECT(Op, DAG);
30330   case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
30331   case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
30332   case ISD::VASTART:            return LowerVASTART(Op, DAG);
30333   case ISD::VAARG:              return LowerVAARG(Op, DAG);
30334   case ISD::VACOPY:             return LowerVACOPY(Op, Subtarget, DAG);
30335   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
30336   case ISD::INTRINSIC_VOID:
30337   case ISD::INTRINSIC_W_CHAIN:  return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
30338   case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
30339   case ISD::ADDROFRETURNADDR:   return LowerADDROFRETURNADDR(Op, DAG);
30340   case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
30341   case ISD::FRAME_TO_ARGS_OFFSET:
30342                                 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
30343   case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
30344   case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
30345   case ISD::EH_SJLJ_SETJMP:     return lowerEH_SJLJ_SETJMP(Op, DAG);
30346   case ISD::EH_SJLJ_LONGJMP:    return lowerEH_SJLJ_LONGJMP(Op, DAG);
30347   case ISD::EH_SJLJ_SETUP_DISPATCH:
30348     return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
30349   case ISD::INIT_TRAMPOLINE:    return LowerINIT_TRAMPOLINE(Op, DAG);
30350   case ISD::ADJUST_TRAMPOLINE:  return LowerADJUST_TRAMPOLINE(Op, DAG);
30351   case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
30352   case ISD::SET_ROUNDING:       return LowerSET_ROUNDING(Op, DAG);
30353   case ISD::CTLZ:
30354   case ISD::CTLZ_ZERO_UNDEF:    return LowerCTLZ(Op, Subtarget, DAG);
30355   case ISD::CTTZ:
30356   case ISD::CTTZ_ZERO_UNDEF:    return LowerCTTZ(Op, Subtarget, DAG);
30357   case ISD::MUL:                return LowerMUL(Op, Subtarget, DAG);
30358   case ISD::MULHS:
30359   case ISD::MULHU:              return LowerMULH(Op, Subtarget, DAG);
30360   case ISD::ROTL:
30361   case ISD::ROTR:               return LowerRotate(Op, Subtarget, DAG);
30362   case ISD::SRA:
30363   case ISD::SRL:
30364   case ISD::SHL:                return LowerShift(Op, Subtarget, DAG);
30365   case ISD::SADDO:
30366   case ISD::UADDO:
30367   case ISD::SSUBO:
30368   case ISD::USUBO:              return LowerXALUO(Op, DAG);
30369   case ISD::SMULO:
30370   case ISD::UMULO:              return LowerMULO(Op, Subtarget, DAG);
30371   case ISD::READCYCLECOUNTER:   return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
30372   case ISD::BITCAST:            return LowerBITCAST(Op, Subtarget, DAG);
30373   case ISD::SADDO_CARRY:
30374   case ISD::SSUBO_CARRY:
30375   case ISD::ADDCARRY:
30376   case ISD::SUBCARRY:           return LowerADDSUBCARRY(Op, DAG);
30377   case ISD::ADD:
30378   case ISD::SUB:                return lowerAddSub(Op, DAG, Subtarget);
30379   case ISD::UADDSAT:
30380   case ISD::SADDSAT:
30381   case ISD::USUBSAT:
30382   case ISD::SSUBSAT:            return LowerADDSAT_SUBSAT(Op, DAG, Subtarget);
30383   case ISD::SMAX:
30384   case ISD::SMIN:
30385   case ISD::UMAX:
30386   case ISD::UMIN:               return LowerMINMAX(Op, DAG);
30387   case ISD::ABS:                return LowerABS(Op, Subtarget, DAG);
30388   case ISD::FSINCOS:            return LowerFSINCOS(Op, Subtarget, DAG);
30389   case ISD::MLOAD:              return LowerMLOAD(Op, Subtarget, DAG);
30390   case ISD::MSTORE:             return LowerMSTORE(Op, Subtarget, DAG);
30391   case ISD::MGATHER:            return LowerMGATHER(Op, Subtarget, DAG);
30392   case ISD::MSCATTER:           return LowerMSCATTER(Op, Subtarget, DAG);
30393   case ISD::GC_TRANSITION_START:
30394   case ISD::GC_TRANSITION_END:  return LowerGC_TRANSITION(Op, DAG);
30395   case ISD::ADDRSPACECAST:      return LowerADDRSPACECAST(Op, DAG);
30396   case X86ISD::CVTPS2PH:        return LowerCVTPS2PH(Op, DAG);
30397   }
30398 }
30399 
30400 /// Replace a node with an illegal result type with a new node built out of
30401 /// custom code.
30402 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
30403                                            SmallVectorImpl<SDValue>&Results,
30404                                            SelectionDAG &DAG) const {
30405   SDLoc dl(N);
30406   switch (N->getOpcode()) {
30407   default:
30408 #ifndef NDEBUG
30409     dbgs() << "ReplaceNodeResults: ";
30410     N->dump(&DAG);
30411 #endif
30412     llvm_unreachable("Do not know how to custom type legalize this operation!");
30413   case X86ISD::CVTPH2PS: {
30414     EVT VT = N->getValueType(0);
30415     SDValue Lo, Hi;
30416     std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
30417     EVT LoVT, HiVT;
30418     std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
30419     Lo = DAG.getNode(X86ISD::CVTPH2PS, dl, LoVT, Lo);
30420     Hi = DAG.getNode(X86ISD::CVTPH2PS, dl, HiVT, Hi);
30421     SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
30422     Results.push_back(Res);
30423     return;
30424   }
30425   case X86ISD::STRICT_CVTPH2PS: {
30426     EVT VT = N->getValueType(0);
30427     SDValue Lo, Hi;
30428     std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 1);
30429     EVT LoVT, HiVT;
30430     std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
30431     Lo = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {LoVT, MVT::Other},
30432                      {N->getOperand(0), Lo});
30433     Hi = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {HiVT, MVT::Other},
30434                      {N->getOperand(0), Hi});
30435     SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
30436                                 Lo.getValue(1), Hi.getValue(1));
30437     SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
30438     Results.push_back(Res);
30439     Results.push_back(Chain);
30440     return;
30441   }
30442   case X86ISD::CVTPS2PH:
30443     Results.push_back(LowerCVTPS2PH(SDValue(N, 0), DAG));
30444     return;
30445   case ISD::CTPOP: {
30446     assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
30447     // Use a v2i64 if possible.
30448     bool NoImplicitFloatOps =
30449         DAG.getMachineFunction().getFunction().hasFnAttribute(
30450             Attribute::NoImplicitFloat);
30451     if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) {
30452       SDValue Wide =
30453           DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));
30454       Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide);
30455       // The bit count fits in 32 bits, so extract it as an i32 and then zero
30456       // extend to i64. Otherwise we end up extracting bits 63:32 separately.
30457       Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide);
30458       Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide,
30459                          DAG.getIntPtrConstant(0, dl));
30460       Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide);
30461       Results.push_back(Wide);
30462     }
30463     return;
30464   }
30465   case ISD::MUL: {
30466     EVT VT = N->getValueType(0);
30467     assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
30468            VT.getVectorElementType() == MVT::i8 && "Unexpected VT!");
30469     // Pre-promote these to vXi16 to avoid op legalization thinking all 16
30470     // elements are needed.
30471     MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
30472     SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));
30473     SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));
30474     SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1);
30475     Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
30476     unsigned NumConcats = 16 / VT.getVectorNumElements();
30477     SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
30478     ConcatOps[0] = Res;
30479     Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps);
30480     Results.push_back(Res);
30481     return;
30482   }
30483   case X86ISD::VPMADDWD:
30484   case X86ISD::AVG: {
30485     // Legalize types for X86ISD::AVG/VPMADDWD by widening.
30486     assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
30487 
30488     EVT VT = N->getValueType(0);
30489     EVT InVT = N->getOperand(0).getValueType();
30490     assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 &&
30491            "Expected a VT that divides into 128 bits.");
30492     assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
30493            "Unexpected type action!");
30494     unsigned NumConcat = 128 / InVT.getSizeInBits();
30495 
30496     EVT InWideVT = EVT::getVectorVT(*DAG.getContext(),
30497                                     InVT.getVectorElementType(),
30498                                     NumConcat * InVT.getVectorNumElements());
30499     EVT WideVT = EVT::getVectorVT(*DAG.getContext(),
30500                                   VT.getVectorElementType(),
30501                                   NumConcat * VT.getVectorNumElements());
30502 
30503     SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
30504     Ops[0] = N->getOperand(0);
30505     SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
30506     Ops[0] = N->getOperand(1);
30507     SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
30508 
30509     SDValue Res = DAG.getNode(N->getOpcode(), dl, WideVT, InVec0, InVec1);
30510     Results.push_back(Res);
30511     return;
30512   }
30513   // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
30514   case X86ISD::FMINC:
30515   case X86ISD::FMIN:
30516   case X86ISD::FMAXC:
30517   case X86ISD::FMAX: {
30518     EVT VT = N->getValueType(0);
30519     assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
30520     SDValue UNDEF = DAG.getUNDEF(VT);
30521     SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
30522                               N->getOperand(0), UNDEF);
30523     SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
30524                               N->getOperand(1), UNDEF);
30525     Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
30526     return;
30527   }
30528   case ISD::SDIV:
30529   case ISD::UDIV:
30530   case ISD::SREM:
30531   case ISD::UREM: {
30532     EVT VT = N->getValueType(0);
30533     if (VT.isVector()) {
30534       assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
30535              "Unexpected type action!");
30536       // If the RHS is a constant splat vector, we can widen this and let the
30537       // division/remainder-by-constant optimization handle it.
30538       // TODO: Can we do something for non-splat?
30539       APInt SplatVal;
30540       if (ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal)) {
30541         unsigned NumConcats = 128 / VT.getSizeInBits();
30542         SmallVector<SDValue, 8> Ops0(NumConcats, DAG.getUNDEF(VT));
30543         Ops0[0] = N->getOperand(0);
30544         EVT ResVT = getTypeToTransformTo(*DAG.getContext(), VT);
30545         SDValue N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Ops0);
30546         SDValue N1 = DAG.getConstant(SplatVal, dl, ResVT);
30547         SDValue Res = DAG.getNode(N->getOpcode(), dl, ResVT, N0, N1);
30548         Results.push_back(Res);
30549       }
30550       return;
30551     }
30552 
30553     SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
30554     Results.push_back(V);
30555     return;
30556   }
30557   case ISD::TRUNCATE: {
30558     MVT VT = N->getSimpleValueType(0);
30559     if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
30560       return;
30561 
30562     // The generic legalizer will try to widen the input type to the same
30563     // number of elements as the widened result type. But this isn't always
30564     // the best approach, so do some custom legalization to avoid some cases.
30565     MVT WidenVT = getTypeToTransformTo(*DAG.getContext(), VT).getSimpleVT();
30566     SDValue In = N->getOperand(0);
30567     EVT InVT = In.getValueType();
30568 
30569     unsigned InBits = InVT.getSizeInBits();
30570     if (128 % InBits == 0) {
30571       // 128-bit and smaller inputs should avoid the truncate altogether and
30572       // just use a build_vector that will become a shuffle.
30573       // TODO: Widen and use a shuffle directly?
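      // E.g. a v2i64 -> v2i16 truncate becomes two scalar extract+truncates
      // built into a v8i16 whose upper six elements are undef.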
30574       MVT InEltVT = InVT.getSimpleVT().getVectorElementType();
30575       EVT EltVT = VT.getVectorElementType();
30576       unsigned WidenNumElts = WidenVT.getVectorNumElements();
30577       SmallVector<SDValue, 16> Ops(WidenNumElts, DAG.getUNDEF(EltVT));
30578       // Use the original element count so we don't do more scalar opts than
30579       // necessary.
30580       unsigned MinElts = VT.getVectorNumElements();
30581       for (unsigned i=0; i < MinElts; ++i) {
30582         SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, In,
30583                                   DAG.getIntPtrConstant(i, dl));
30584         Ops[i] = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Val);
30585       }
30586       Results.push_back(DAG.getBuildVector(WidenVT, dl, Ops));
30587       return;
30588     }
30589     // With AVX512 there are some cases that can use a target specific
30590     // truncate node to go from 256/512 to less than 128 with zeros in the
30591     // upper elements of the 128 bit result.
30592     if (Subtarget.hasAVX512() && isTypeLegal(InVT)) {
30593       // We can use VTRUNC directly if for 256 bits with VLX or for any 512.
30594       if ((InBits == 256 && Subtarget.hasVLX()) || InBits == 512) {
30595         Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
30596         return;
30597       }
30598       // There's one case we can widen to 512 bits and use VTRUNC.
30599       if (InVT == MVT::v4i64 && VT == MVT::v4i8 && isTypeLegal(MVT::v8i64)) {
30600         In = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i64, In,
30601                          DAG.getUNDEF(MVT::v4i64));
30602         Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
30603         return;
30604       }
30605     }
30606     if (Subtarget.hasVLX() && InVT == MVT::v8i64 && VT == MVT::v8i8 &&
30607         getTypeAction(*DAG.getContext(), InVT) == TypeSplitVector &&
30608         isTypeLegal(MVT::v4i64)) {
30609       // Input needs to be split and the output needs to be widened. Let's use two
30610       // VTRUNCs, and shuffle their results together into the wider type.
30611       SDValue Lo, Hi;
30612       std::tie(Lo, Hi) = DAG.SplitVector(In, dl);
30613 
30614       Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo);
30615       Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi);
30616       SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi,
30617                                          { 0,  1,  2,  3, 16, 17, 18, 19,
30618                                           -1, -1, -1, -1, -1, -1, -1, -1 });
30619       Results.push_back(Res);
30620       return;
30621     }
30622 
30623     return;
30624   }
30625   case ISD::ANY_EXTEND:
30626     // Right now, only MVT::v8i8 has Custom action for an illegal type.
30627     // It's intended to custom handle the input type.
30628     assert(N->getValueType(0) == MVT::v8i8 &&
30629            "Do not know how to legalize this Node");
30630     return;
30631   case ISD::SIGN_EXTEND:
30632   case ISD::ZERO_EXTEND: {
30633     EVT VT = N->getValueType(0);
30634     SDValue In = N->getOperand(0);
30635     EVT InVT = In.getValueType();
30636     if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
30637         (InVT == MVT::v4i16 || InVT == MVT::v4i8)){
30638       assert(getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector &&
30639              "Unexpected type action!");
30640       assert(N->getOpcode() == ISD::SIGN_EXTEND && "Unexpected opcode");
30641       // Custom split this so we can extend i8/i16->i32 invec. This is better
30642       // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using
30643       // sra, then an extend from i32 to i64 using pcmpgt. By custom splitting
30644       // we allow the sra from the extend to i32 to be shared by the split.
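      // Sketch of the math (added for clarity, not from the source): for each
      // 32-bit element x, pcmpgt computes s = (0 > x) ? -1 : 0, and the
      // unpckl/unpckh interleave of (x, s) pairs, bitcast to v2i64, is exactly
      // the little-endian sign extension of x to i64.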
30645       In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, In);
30646 
30647       // Fill a vector with sign bits for each element.
30648       SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
30649       SDValue SignBits = DAG.getSetCC(dl, MVT::v4i32, Zero, In, ISD::SETGT);
30650 
30651       // Create an unpackl and unpackh to interleave the sign bits then bitcast
30652       // to v2i64.
30653       SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
30654                                         {0, 4, 1, 5});
30655       Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo);
30656       SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
30657                                         {2, 6, 3, 7});
30658       Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi);
30659 
30660       SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
30661       Results.push_back(Res);
30662       return;
30663     }
30664 
30665     if (VT == MVT::v16i32 || VT == MVT::v8i64) {
30666       if (!InVT.is128BitVector()) {
30667         // Not a 128 bit vector, but maybe type legalization will promote
30668         // it to 128 bits.
30669         if (getTypeAction(*DAG.getContext(), InVT) != TypePromoteInteger)
30670           return;
30671         InVT = getTypeToTransformTo(*DAG.getContext(), InVT);
30672         if (!InVT.is128BitVector())
30673           return;
30674 
30675         // Promote the input to 128 bits. Type legalization will turn this into
30676         // zext_inreg/sext_inreg.
30677         In = DAG.getNode(N->getOpcode(), dl, InVT, In);
30678       }
30679 
30680       // Perform custom splitting instead of the two stage extend we would get
30681       // by default.
30682       EVT LoVT, HiVT;
30683       std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
30684       assert(isTypeLegal(LoVT) && "Split VT not legal?");
30685 
30686       SDValue Lo = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, LoVT, In, DAG);
30687 
30688       // We need to shift the input over by half the number of elements.
30689       unsigned NumElts = InVT.getVectorNumElements();
30690       unsigned HalfNumElts = NumElts / 2;
30691       SmallVector<int, 16> ShufMask(NumElts, SM_SentinelUndef);
30692       for (unsigned i = 0; i != HalfNumElts; ++i)
30693         ShufMask[i] = i + HalfNumElts;
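      // For example, with a v16i8 input this builds the mask
      // <8,9,10,11,12,13,14,15,undef,...>, moving the upper half of the input
      // into the low lanes for the *_EXTEND_VECTOR_INREG below.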
30694 
30695       SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
30696       Hi = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, HiVT, Hi, DAG);
30697 
30698       SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
30699       Results.push_back(Res);
30700     }
30701     return;
30702   }
30703   case ISD::FP_TO_SINT:
30704   case ISD::STRICT_FP_TO_SINT:
30705   case ISD::FP_TO_UINT:
30706   case ISD::STRICT_FP_TO_UINT: {
30707     bool IsStrict = N->isStrictFPOpcode();
30708     bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT ||
30709                     N->getOpcode() == ISD::STRICT_FP_TO_SINT;
30710     EVT VT = N->getValueType(0);
30711     SDValue Src = N->getOperand(IsStrict ? 1 : 0);
30712     EVT SrcVT = Src.getValueType();
30713 
30714     if (VT.isVector() && VT.getScalarSizeInBits() < 32) {
30715       assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
30716              "Unexpected type action!");
30717 
30718       // Try to create a 128 bit vector, but don't exceed a 32 bit element.
30719       unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);
30720       MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth),
30721                                        VT.getVectorNumElements());
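      // For example, a v8i8 result promotes to v8i16 here, while v4i8/v4i16
      // promote to v4i32 (the element width is capped at 32 bits).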
30722       SDValue Res;
30723       SDValue Chain;
30724       if (IsStrict) {
30725         Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {PromoteVT, MVT::Other},
30726                           {N->getOperand(0), Src});
30727         Chain = Res.getValue(1);
30728       } else
30729         Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);
30730 
30731       // Preserve what we know about the size of the original result, except
30732       // when the result is v2i32 since we can't widen the assert.
30733       if (PromoteVT != MVT::v2i32)
30734         Res = DAG.getNode(!IsSigned ? ISD::AssertZext : ISD::AssertSext,
30735                           dl, PromoteVT, Res,
30736                           DAG.getValueType(VT.getVectorElementType()));
30737 
30738       // Truncate back to the original width.
30739       Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
30740 
30741       // Now widen to 128 bits.
30742       unsigned NumConcats = 128 / VT.getSizeInBits();
30743       MVT ConcatVT = MVT::getVectorVT(VT.getSimpleVT().getVectorElementType(),
30744                                       VT.getVectorNumElements() * NumConcats);
30745       SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
30746       ConcatOps[0] = Res;
30747       Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
30748       Results.push_back(Res);
30749       if (IsStrict)
30750         Results.push_back(Chain);
30751       return;
30752     }
30753 
30754 
30755     if (VT == MVT::v2i32) {
30756       assert((IsSigned || Subtarget.hasAVX512()) &&
30757              "Can only handle signed conversion without AVX512");
30758       assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
30759       assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
30760              "Unexpected type action!");
30761       if (Src.getValueType() == MVT::v2f64) {
30762         unsigned Opc;
30763         if (IsStrict)
30764           Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
30765         else
30766           Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
30767 
30768         // If we have VLX we can emit a target specific FP_TO_UINT node.
30769         if (!IsSigned && !Subtarget.hasVLX()) {
30770           // Otherwise we can defer to the generic legalizer which will widen
30771           // the input as well. This will be further widened during op
30772           // legalization to v8i32<-v8f64.
30773           // For strict nodes we'll need to widen ourselves.
30774           // FIXME: Fix the type legalizer to safely widen strict nodes?
30775           if (!IsStrict)
30776             return;
30777           Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, Src,
30778                             DAG.getConstantFP(0.0, dl, MVT::v2f64));
30779           Opc = N->getOpcode();
30780         }
30781         SDValue Res;
30782         SDValue Chain;
30783         if (IsStrict) {
30784           Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
30785                             {N->getOperand(0), Src});
30786           Chain = Res.getValue(1);
30787         } else {
30788           Res = DAG.getNode(Opc, dl, MVT::v4i32, Src);
30789         }
30790         Results.push_back(Res);
30791         if (IsStrict)
30792           Results.push_back(Chain);
30793         return;
30794       }
30795 
30796       // Custom widen strict v2f32->v2i32 by padding with zeros.
30797       // FIXME: Should generic type legalizer do this?
30798       if (Src.getValueType() == MVT::v2f32 && IsStrict) {
30799         Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
30800                           DAG.getConstantFP(0.0, dl, MVT::v2f32));
30801         SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4i32, MVT::Other},
30802                                   {N->getOperand(0), Src});
30803         Results.push_back(Res);
30804         Results.push_back(Res.getValue(1));
30805         return;
30806       }
30807 
30808       // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
30809       // so early out here.
30810       return;
30811     }
30812 
30813     assert(!VT.isVector() && "Vectors should have been handled above!");
30814 
30815     if (Subtarget.hasDQI() && VT == MVT::i64 &&
30816         (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
30817       assert(!Subtarget.is64Bit() && "i64 should be legal");
30818       unsigned NumElts = Subtarget.hasVLX() ? 2 : 8;
30819       // If we use a 128-bit result we might need to use a target specific node.
30820       unsigned SrcElts =
30821           std::max(NumElts, 128U / (unsigned)SrcVT.getSizeInBits());
30822       MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
30823       MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), SrcElts);
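      // For example (derived from the sizes above): with VLX, an f32 source
      // converts as v4f32 -> v2i64 via the CVTTP2SI/CVTTP2UI (or strict) nodes
      // chosen below, while without VLX both f32 and f64 sources widen to 8
      // elements and keep the generic opcode.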
30824       unsigned Opc = N->getOpcode();
30825       if (NumElts != SrcElts) {
30826         if (IsStrict)
30827           Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
30828         else
30829           Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
30830       }
30831 
30832       SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
30833       SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
30834                                 DAG.getConstantFP(0.0, dl, VecInVT), Src,
30835                                 ZeroIdx);
30836       SDValue Chain;
30837       if (IsStrict) {
30838         SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
30839         Res = DAG.getNode(Opc, SDLoc(N), Tys, N->getOperand(0), Res);
30840         Chain = Res.getValue(1);
30841       } else
30842         Res = DAG.getNode(Opc, SDLoc(N), VecVT, Res);
30843       Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
30844       Results.push_back(Res);
30845       if (IsStrict)
30846         Results.push_back(Chain);
30847       return;
30848     }
30849 
30850     SDValue Chain;
30851     if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, Chain)) {
30852       Results.push_back(V);
30853       if (IsStrict)
30854         Results.push_back(Chain);
30855     }
30856     return;
30857   }
30858   case ISD::LRINT:
30859   case ISD::LLRINT: {
30860     if (SDValue V = LRINT_LLRINTHelper(N, DAG))
30861       Results.push_back(V);
30862     return;
30863   }
30864 
30865   case ISD::SINT_TO_FP:
30866   case ISD::STRICT_SINT_TO_FP:
30867   case ISD::UINT_TO_FP:
30868   case ISD::STRICT_UINT_TO_FP: {
30869     bool IsStrict = N->isStrictFPOpcode();
30870     bool IsSigned = N->getOpcode() == ISD::SINT_TO_FP ||
30871                     N->getOpcode() == ISD::STRICT_SINT_TO_FP;
30872     EVT VT = N->getValueType(0);
30873     if (VT != MVT::v2f32)
30874       return;
30875     SDValue Src = N->getOperand(IsStrict ? 1 : 0);
30876     EVT SrcVT = Src.getValueType();
30877     if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
30878       if (IsStrict) {
30879         unsigned Opc = IsSigned ? X86ISD::STRICT_CVTSI2P
30880                                 : X86ISD::STRICT_CVTUI2P;
30881         SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
30882                                   {N->getOperand(0), Src});
30883         Results.push_back(Res);
30884         Results.push_back(Res.getValue(1));
30885       } else {
30886         unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
30887         Results.push_back(DAG.getNode(Opc, dl, MVT::v4f32, Src));
30888       }
30889       return;
30890     }
30891     if (SrcVT == MVT::v2i64 && !IsSigned && Subtarget.is64Bit() &&
30892         Subtarget.hasSSE41() && !Subtarget.hasAVX512()) {
30893       SDValue Zero = DAG.getConstant(0, dl, SrcVT);
30894       SDValue One  = DAG.getConstant(1, dl, SrcVT);
30895       SDValue Sign = DAG.getNode(ISD::OR, dl, SrcVT,
30896                                  DAG.getNode(ISD::SRL, dl, SrcVT, Src, One),
30897                                  DAG.getNode(ISD::AND, dl, SrcVT, Src, One));
30898       SDValue IsNeg = DAG.getSetCC(dl, MVT::v2i64, Src, Zero, ISD::SETLT);
30899       SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src);
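      // Explanatory note (not from the source): when the top bit of x is set we
      // instead convert ((x >> 1) | (x & 1)) as a signed value and add the
      // result to itself below; OR-ing in the low bit keeps round-to-nearest
      // behaving as if the full unsigned value had been converted.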
30900       SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32));
30901       for (int i = 0; i != 2; ++i) {
30902         SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
30903                                   SignSrc, DAG.getIntPtrConstant(i, dl));
30904         if (IsStrict)
30905           SignCvts[i] =
30906               DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other},
30907                           {N->getOperand(0), Elt});
30908         else
30909           SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Elt);
30910       }
30911       SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts);
30912       SDValue Slow, Chain;
30913       if (IsStrict) {
30914         Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
30915                             SignCvts[0].getValue(1), SignCvts[1].getValue(1));
30916         Slow = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v4f32, MVT::Other},
30917                            {Chain, SignCvt, SignCvt});
30918         Chain = Slow.getValue(1);
30919       } else {
30920         Slow = DAG.getNode(ISD::FADD, dl, MVT::v4f32, SignCvt, SignCvt);
30921       }
30922       IsNeg = DAG.getBitcast(MVT::v4i32, IsNeg);
30923       IsNeg =
30924           DAG.getVectorShuffle(MVT::v4i32, dl, IsNeg, IsNeg, {1, 3, -1, -1});
30925       SDValue Cvt = DAG.getSelect(dl, MVT::v4f32, IsNeg, Slow, SignCvt);
30926       Results.push_back(Cvt);
30927       if (IsStrict)
30928         Results.push_back(Chain);
30929       return;
30930     }
30931 
30932     if (SrcVT != MVT::v2i32)
30933       return;
30934 
30935     if (IsSigned || Subtarget.hasAVX512()) {
30936       if (!IsStrict)
30937         return;
30938 
30939       // Custom widen strict v2i32->v2f32 to avoid scalarization.
30940       // FIXME: Should generic type legalizer do this?
30941       Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
30942                         DAG.getConstant(0, dl, MVT::v2i32));
30943       SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
30944                                 {N->getOperand(0), Src});
30945       Results.push_back(Res);
30946       Results.push_back(Res.getValue(1));
30947       return;
30948     }
30949 
30950     assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
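    // Sketch of the bias trick used below (added for clarity): OR-ing the
    // zero-extended 32-bit lanes into the mantissa of 2^52 gives
    // bitcast<double>(0x4330000000000000 | zext(x)) == 2^52 + x exactly, so
    // subtracting 2^52 recovers x as a double, which is then rounded to f32.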
30951     SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
30952     SDValue VBias =
30953         DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
30954     SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
30955                              DAG.getBitcast(MVT::v2i64, VBias));
30956     Or = DAG.getBitcast(MVT::v2f64, Or);
30957     if (IsStrict) {
30958       SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
30959                                 {N->getOperand(0), Or, VBias});
30960       SDValue Res = DAG.getNode(X86ISD::STRICT_VFPROUND, dl,
30961                                 {MVT::v4f32, MVT::Other},
30962                                 {Sub.getValue(1), Sub});
30963       Results.push_back(Res);
30964       Results.push_back(Res.getValue(1));
30965     } else {
30966       // TODO: Are there any fast-math-flags to propagate here?
30967       SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
30968       Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
30969     }
30970     return;
30971   }
30972   case ISD::STRICT_FP_ROUND:
30973   case ISD::FP_ROUND: {
30974     bool IsStrict = N->isStrictFPOpcode();
30975     SDValue Src = N->getOperand(IsStrict ? 1 : 0);
30976     if (!isTypeLegal(Src.getValueType()))
30977       return;
30978     SDValue V;
30979     if (IsStrict)
30980       V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {MVT::v4f32, MVT::Other},
30981                       {N->getOperand(0), N->getOperand(1)});
30982     else
30983       V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
30984     Results.push_back(V);
30985     if (IsStrict)
30986       Results.push_back(V.getValue(1));
30987     return;
30988   }
30989   case ISD::FP_EXTEND:
30990   case ISD::STRICT_FP_EXTEND: {
30991     // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
30992     // No other ValueType for FP_EXTEND should reach this point.
30993     assert(N->getValueType(0) == MVT::v2f32 &&
30994            "Do not know how to legalize this Node");
30995     return;
30996   }
30997   case ISD::INTRINSIC_W_CHAIN: {
30998     unsigned IntNo = N->getConstantOperandVal(1);
30999     switch (IntNo) {
31000     default : llvm_unreachable("Do not know how to custom type "
31001                                "legalize this intrinsic operation!");
31002     case Intrinsic::x86_rdtsc:
31003       return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget,
31004                                      Results);
31005     case Intrinsic::x86_rdtscp:
31006       return getReadTimeStampCounter(N, dl, X86::RDTSCP, DAG, Subtarget,
31007                                      Results);
31008     case Intrinsic::x86_rdpmc:
31009       expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget,
31010                                   Results);
31011       return;
31012     case Intrinsic::x86_xgetbv:
31013       expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget,
31014                                   Results);
31015       return;
31016     }
31017   }
31018   case ISD::READCYCLECOUNTER: {
31019     return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results);
31020   }
31021   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
31022     EVT T = N->getValueType(0);
31023     assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
31024     bool Regs64bit = T == MVT::i128;
31025     assert((!Regs64bit || Subtarget.hasCmpxchg16b()) &&
31026            "64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B");
31027     MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
31028     SDValue cpInL, cpInH;
31029     cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
31030                         DAG.getConstant(0, dl, HalfT));
31031     cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
31032                         DAG.getConstant(1, dl, HalfT));
31033     cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
31034                              Regs64bit ? X86::RAX : X86::EAX,
31035                              cpInL, SDValue());
31036     cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
31037                              Regs64bit ? X86::RDX : X86::EDX,
31038                              cpInH, cpInL.getValue(1));
31039     SDValue swapInL, swapInH;
31040     swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
31041                           DAG.getConstant(0, dl, HalfT));
31042     swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
31043                           DAG.getConstant(1, dl, HalfT));
31044     swapInH =
31045         DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
31046                          swapInH, cpInH.getValue(1));
31047 
31048     // In 64-bit mode we might need the base pointer in RBX, but we can't know
31049     // until later. So we keep the RBX input in a vreg and use a custom
31050     // inserter.
31051     // Since RBX will be a reserved register, the register allocator will not
31052     // make sure its value is properly saved and restored around this
31053     // live-range.
31054     SDValue Result;
31055     SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
31056     MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
31057     if (Regs64bit) {
31058       SDValue Ops[] = {swapInH.getValue(0), N->getOperand(1), swapInL,
31059                        swapInH.getValue(1)};
31060       Result =
31061           DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG16_DAG, dl, Tys, Ops, T, MMO);
31062     } else {
31063       swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl, X86::EBX, swapInL,
31064                                  swapInH.getValue(1));
31065       SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
31066                        swapInL.getValue(1)};
31067       Result =
31068           DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, T, MMO);
31069     }
31070 
31071     SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
31072                                         Regs64bit ? X86::RAX : X86::EAX,
31073                                         HalfT, Result.getValue(1));
31074     SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
31075                                         Regs64bit ? X86::RDX : X86::EDX,
31076                                         HalfT, cpOutL.getValue(2));
31077     SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
31078 
31079     SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
31080                                         MVT::i32, cpOutH.getValue(2));
31081     SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
31082     Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
31083 
31084     Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
31085     Results.push_back(Success);
31086     Results.push_back(EFLAGS.getValue(1));
31087     return;
31088   }
31089   case ISD::ATOMIC_LOAD: {
31090     assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
31091     bool NoImplicitFloatOps =
31092         DAG.getMachineFunction().getFunction().hasFnAttribute(
31093             Attribute::NoImplicitFloat);
31094     if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
31095       auto *Node = cast<AtomicSDNode>(N);
31096       if (Subtarget.hasSSE1()) {
31097         // Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS.
31098         // Then extract the lower 64-bits.
31099         MVT LdVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
31100         SDVTList Tys = DAG.getVTList(LdVT, MVT::Other);
31101         SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
31102         SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
31103                                              MVT::i64, Node->getMemOperand());
31104         if (Subtarget.hasSSE2()) {
31105           SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
31106                                     DAG.getIntPtrConstant(0, dl));
31107           Results.push_back(Res);
31108           Results.push_back(Ld.getValue(1));
31109           return;
31110         }
31111         // We use an alternative sequence for SSE1 that extracts as v2f32 and
31112         // then casts to i64. This avoids a 128-bit stack temporary being
31113         // created by type legalization if we were to cast v4f32->v2i64.
31114         SDValue Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Ld,
31115                                   DAG.getIntPtrConstant(0, dl));
31116         Res = DAG.getBitcast(MVT::i64, Res);
31117         Results.push_back(Res);
31118         Results.push_back(Ld.getValue(1));
31119         return;
31120       }
31121       if (Subtarget.hasX87()) {
31122         // First load this into an 80-bit X87 register. This will put the whole
31123         // integer into the significand.
31124         SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
31125         SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
31126         SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD,
31127                                                  dl, Tys, Ops, MVT::i64,
31128                                                  Node->getMemOperand());
31129         SDValue Chain = Result.getValue(1);
31130 
31131         // Now store the X87 register to a stack temporary and convert to i64.
31132         // This store is not atomic and doesn't need to be.
31133         // FIXME: We don't need a stack temporary if the result of the load
31134         // is already being stored. We could just directly store there.
31135         SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
31136         int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
31137         MachinePointerInfo MPI =
31138             MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
31139         SDValue StoreOps[] = { Chain, Result, StackPtr };
31140         Chain = DAG.getMemIntrinsicNode(
31141             X86ISD::FIST, dl, DAG.getVTList(MVT::Other), StoreOps, MVT::i64,
31142             MPI, None /*Align*/, MachineMemOperand::MOStore);
31143 
31144         // Finally load the value back from the stack temporary and return it.
31145         // This load is not atomic and doesn't need to be.
31146         // This load will be further type legalized.
31147         Result = DAG.getLoad(MVT::i64, dl, Chain, StackPtr, MPI);
31148         Results.push_back(Result);
31149         Results.push_back(Result.getValue(1));
31150         return;
31151       }
31152     }
31153     // TODO: Use MOVLPS when SSE1 is available?
31154     // Delegate to generic TypeLegalization. Situations we can really handle
31155     // should have already been dealt with by AtomicExpandPass.cpp.
31156     break;
31157   }
31158   case ISD::ATOMIC_SWAP:
31159   case ISD::ATOMIC_LOAD_ADD:
31160   case ISD::ATOMIC_LOAD_SUB:
31161   case ISD::ATOMIC_LOAD_AND:
31162   case ISD::ATOMIC_LOAD_OR:
31163   case ISD::ATOMIC_LOAD_XOR:
31164   case ISD::ATOMIC_LOAD_NAND:
31165   case ISD::ATOMIC_LOAD_MIN:
31166   case ISD::ATOMIC_LOAD_MAX:
31167   case ISD::ATOMIC_LOAD_UMIN:
31168   case ISD::ATOMIC_LOAD_UMAX:
31169     // Delegate to generic TypeLegalization. Situations we can really handle
31170     // should have already been dealt with by AtomicExpandPass.cpp.
31171     break;
31172 
31173   case ISD::BITCAST: {
31174     assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
31175     EVT DstVT = N->getValueType(0);
31176     EVT SrcVT = N->getOperand(0).getValueType();
31177 
31178     // If this is a bitcast from a v64i1 k-register to an i64 on a 32-bit target,
31179     // we can split using the k-register rather than memory.
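    // (Typically this ends up as two k-register to GPR copies plus a pair,
    // though the exact instructions depend on later selection.)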
31180     if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
31181       assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
31182       SDValue Lo, Hi;
31183       std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
31184       Lo = DAG.getBitcast(MVT::i32, Lo);
31185       Hi = DAG.getBitcast(MVT::i32, Hi);
31186       SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
31187       Results.push_back(Res);
31188       return;
31189     }
31190 
31191     if (DstVT.isVector() && SrcVT == MVT::x86mmx) {
31192       // FIXME: Use v4f32 for SSE1?
31193       assert(Subtarget.hasSSE2() && "Requires SSE2");
31194       assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&
31195              "Unexpected type action!");
31196       EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT);
31197       SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64,
31198                                 N->getOperand(0));
31199       Res = DAG.getBitcast(WideVT, Res);
31200       Results.push_back(Res);
31201       return;
31202     }
31203 
31204     return;
31205   }
31206   case ISD::MGATHER: {
31207     EVT VT = N->getValueType(0);
31208     if ((VT == MVT::v2f32 || VT == MVT::v2i32) &&
31209         (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
31210       auto *Gather = cast<MaskedGatherSDNode>(N);
31211       SDValue Index = Gather->getIndex();
31212       if (Index.getValueType() != MVT::v2i64)
31213         return;
31214       assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
31215              "Unexpected type action!");
31216       EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
31217       SDValue Mask = Gather->getMask();
31218       assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
31219       SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT,
31220                                      Gather->getPassThru(),
31221                                      DAG.getUNDEF(VT));
31222       if (!Subtarget.hasVLX()) {
31223         // We need to widen the mask, but the instruction will only use 2
31224         // of its elements. So we can use undef.
31225         Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
31226                            DAG.getUNDEF(MVT::v2i1));
31227         Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
31228       }
31229       SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
31230                         Gather->getBasePtr(), Index, Gather->getScale() };
31231       SDValue Res = DAG.getMemIntrinsicNode(
31232           X86ISD::MGATHER, dl, DAG.getVTList(WideVT, MVT::Other), Ops,
31233           Gather->getMemoryVT(), Gather->getMemOperand());
31234       Results.push_back(Res);
31235       Results.push_back(Res.getValue(1));
31236       return;
31237     }
31238     return;
31239   }
31240   case ISD::LOAD: {
31241     // Use an f64/i64 load and a scalar_to_vector for v2f32/v2i32 loads. This
31242     // avoids scalarizing in 32-bit mode. In 64-bit mode this avoids an int->fp
31243     // cast since type legalization will try to use an i64 load.
31244     MVT VT = N->getSimpleValueType(0);
31245     assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT");
31246     assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
31247            "Unexpected type action!");
31248     if (!ISD::isNON_EXTLoad(N))
31249       return;
31250     auto *Ld = cast<LoadSDNode>(N);
31251     if (Subtarget.hasSSE2()) {
31252       MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
31253       SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
31254                                 Ld->getPointerInfo(), Ld->getOriginalAlign(),
31255                                 Ld->getMemOperand()->getFlags());
31256       SDValue Chain = Res.getValue(1);
31257       MVT VecVT = MVT::getVectorVT(LdVT, 2);
31258       Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Res);
31259       EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
31260       Res = DAG.getBitcast(WideVT, Res);
31261       Results.push_back(Res);
31262       Results.push_back(Chain);
31263       return;
31264     }
31265     assert(Subtarget.hasSSE1() && "Expected SSE");
31266     SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);
31267     SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
31268     SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
31269                                           MVT::i64, Ld->getMemOperand());
31270     Results.push_back(Res);
31271     Results.push_back(Res.getValue(1));
31272     return;
31273   }
31274   case ISD::ADDRSPACECAST: {
31275     SDValue V = LowerADDRSPACECAST(SDValue(N,0), DAG);
31276     Results.push_back(V);
31277     return;
31278   }
31279   case ISD::BITREVERSE:
31280     assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
31281     assert(Subtarget.hasXOP() && "Expected XOP");
31282     // We can use VPPERM by copying to a vector register and back. We'll need
31283     // to move the scalar in two i32 pieces.
31284     Results.push_back(LowerBITREVERSE(SDValue(N, 0), Subtarget, DAG));
31285     return;
31286   }
31287 }
31288 
31289 const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
31290   switch ((X86ISD::NodeType)Opcode) {
31291   case X86ISD::FIRST_NUMBER:       break;
31292 #define NODE_NAME_CASE(NODE) case X86ISD::NODE: return "X86ISD::" #NODE;
31293   NODE_NAME_CASE(BSF)
31294   NODE_NAME_CASE(BSR)
31295   NODE_NAME_CASE(FSHL)
31296   NODE_NAME_CASE(FSHR)
31297   NODE_NAME_CASE(FAND)
31298   NODE_NAME_CASE(FANDN)
31299   NODE_NAME_CASE(FOR)
31300   NODE_NAME_CASE(FXOR)
31301   NODE_NAME_CASE(FILD)
31302   NODE_NAME_CASE(FIST)
31303   NODE_NAME_CASE(FP_TO_INT_IN_MEM)
31304   NODE_NAME_CASE(FLD)
31305   NODE_NAME_CASE(FST)
31306   NODE_NAME_CASE(CALL)
31307   NODE_NAME_CASE(CALL_RVMARKER)
31308   NODE_NAME_CASE(BT)
31309   NODE_NAME_CASE(CMP)
31310   NODE_NAME_CASE(FCMP)
31311   NODE_NAME_CASE(STRICT_FCMP)
31312   NODE_NAME_CASE(STRICT_FCMPS)
31313   NODE_NAME_CASE(COMI)
31314   NODE_NAME_CASE(UCOMI)
31315   NODE_NAME_CASE(CMPM)
31316   NODE_NAME_CASE(CMPMM)
31317   NODE_NAME_CASE(STRICT_CMPM)
31318   NODE_NAME_CASE(CMPMM_SAE)
31319   NODE_NAME_CASE(SETCC)
31320   NODE_NAME_CASE(SETCC_CARRY)
31321   NODE_NAME_CASE(FSETCC)
31322   NODE_NAME_CASE(FSETCCM)
31323   NODE_NAME_CASE(FSETCCM_SAE)
31324   NODE_NAME_CASE(CMOV)
31325   NODE_NAME_CASE(BRCOND)
31326   NODE_NAME_CASE(RET_FLAG)
31327   NODE_NAME_CASE(IRET)
31328   NODE_NAME_CASE(REP_STOS)
31329   NODE_NAME_CASE(REP_MOVS)
31330   NODE_NAME_CASE(GlobalBaseReg)
31331   NODE_NAME_CASE(Wrapper)
31332   NODE_NAME_CASE(WrapperRIP)
31333   NODE_NAME_CASE(MOVQ2DQ)
31334   NODE_NAME_CASE(MOVDQ2Q)
31335   NODE_NAME_CASE(MMX_MOVD2W)
31336   NODE_NAME_CASE(MMX_MOVW2D)
31337   NODE_NAME_CASE(PEXTRB)
31338   NODE_NAME_CASE(PEXTRW)
31339   NODE_NAME_CASE(INSERTPS)
31340   NODE_NAME_CASE(PINSRB)
31341   NODE_NAME_CASE(PINSRW)
31342   NODE_NAME_CASE(PSHUFB)
31343   NODE_NAME_CASE(ANDNP)
31344   NODE_NAME_CASE(BLENDI)
31345   NODE_NAME_CASE(BLENDV)
31346   NODE_NAME_CASE(HADD)
31347   NODE_NAME_CASE(HSUB)
31348   NODE_NAME_CASE(FHADD)
31349   NODE_NAME_CASE(FHSUB)
31350   NODE_NAME_CASE(CONFLICT)
31351   NODE_NAME_CASE(FMAX)
31352   NODE_NAME_CASE(FMAXS)
31353   NODE_NAME_CASE(FMAX_SAE)
31354   NODE_NAME_CASE(FMAXS_SAE)
31355   NODE_NAME_CASE(FMIN)
31356   NODE_NAME_CASE(FMINS)
31357   NODE_NAME_CASE(FMIN_SAE)
31358   NODE_NAME_CASE(FMINS_SAE)
31359   NODE_NAME_CASE(FMAXC)
31360   NODE_NAME_CASE(FMINC)
31361   NODE_NAME_CASE(FRSQRT)
31362   NODE_NAME_CASE(FRCP)
31363   NODE_NAME_CASE(EXTRQI)
31364   NODE_NAME_CASE(INSERTQI)
31365   NODE_NAME_CASE(TLSADDR)
31366   NODE_NAME_CASE(TLSBASEADDR)
31367   NODE_NAME_CASE(TLSCALL)
31368   NODE_NAME_CASE(EH_SJLJ_SETJMP)
31369   NODE_NAME_CASE(EH_SJLJ_LONGJMP)
31370   NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH)
31371   NODE_NAME_CASE(EH_RETURN)
31372   NODE_NAME_CASE(TC_RETURN)
31373   NODE_NAME_CASE(FNSTCW16m)
31374   NODE_NAME_CASE(FLDCW16m)
31375   NODE_NAME_CASE(LCMPXCHG_DAG)
31376   NODE_NAME_CASE(LCMPXCHG8_DAG)
31377   NODE_NAME_CASE(LCMPXCHG16_DAG)
31378   NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG)
31379   NODE_NAME_CASE(LADD)
31380   NODE_NAME_CASE(LSUB)
31381   NODE_NAME_CASE(LOR)
31382   NODE_NAME_CASE(LXOR)
31383   NODE_NAME_CASE(LAND)
31384   NODE_NAME_CASE(VZEXT_MOVL)
31385   NODE_NAME_CASE(VZEXT_LOAD)
31386   NODE_NAME_CASE(VEXTRACT_STORE)
31387   NODE_NAME_CASE(VTRUNC)
31388   NODE_NAME_CASE(VTRUNCS)
31389   NODE_NAME_CASE(VTRUNCUS)
31390   NODE_NAME_CASE(VMTRUNC)
31391   NODE_NAME_CASE(VMTRUNCS)
31392   NODE_NAME_CASE(VMTRUNCUS)
31393   NODE_NAME_CASE(VTRUNCSTORES)
31394   NODE_NAME_CASE(VTRUNCSTOREUS)
31395   NODE_NAME_CASE(VMTRUNCSTORES)
31396   NODE_NAME_CASE(VMTRUNCSTOREUS)
31397   NODE_NAME_CASE(VFPEXT)
31398   NODE_NAME_CASE(STRICT_VFPEXT)
31399   NODE_NAME_CASE(VFPEXT_SAE)
31400   NODE_NAME_CASE(VFPEXTS)
31401   NODE_NAME_CASE(VFPEXTS_SAE)
31402   NODE_NAME_CASE(VFPROUND)
31403   NODE_NAME_CASE(STRICT_VFPROUND)
31404   NODE_NAME_CASE(VMFPROUND)
31405   NODE_NAME_CASE(VFPROUND_RND)
31406   NODE_NAME_CASE(VFPROUNDS)
31407   NODE_NAME_CASE(VFPROUNDS_RND)
31408   NODE_NAME_CASE(VSHLDQ)
31409   NODE_NAME_CASE(VSRLDQ)
31410   NODE_NAME_CASE(VSHL)
31411   NODE_NAME_CASE(VSRL)
31412   NODE_NAME_CASE(VSRA)
31413   NODE_NAME_CASE(VSHLI)
31414   NODE_NAME_CASE(VSRLI)
31415   NODE_NAME_CASE(VSRAI)
31416   NODE_NAME_CASE(VSHLV)
31417   NODE_NAME_CASE(VSRLV)
31418   NODE_NAME_CASE(VSRAV)
31419   NODE_NAME_CASE(VROTLI)
31420   NODE_NAME_CASE(VROTRI)
31421   NODE_NAME_CASE(VPPERM)
31422   NODE_NAME_CASE(CMPP)
31423   NODE_NAME_CASE(STRICT_CMPP)
31424   NODE_NAME_CASE(PCMPEQ)
31425   NODE_NAME_CASE(PCMPGT)
31426   NODE_NAME_CASE(PHMINPOS)
31427   NODE_NAME_CASE(ADD)
31428   NODE_NAME_CASE(SUB)
31429   NODE_NAME_CASE(ADC)
31430   NODE_NAME_CASE(SBB)
31431   NODE_NAME_CASE(SMUL)
31432   NODE_NAME_CASE(UMUL)
31433   NODE_NAME_CASE(OR)
31434   NODE_NAME_CASE(XOR)
31435   NODE_NAME_CASE(AND)
31436   NODE_NAME_CASE(BEXTR)
31437   NODE_NAME_CASE(BEXTRI)
31438   NODE_NAME_CASE(BZHI)
31439   NODE_NAME_CASE(PDEP)
31440   NODE_NAME_CASE(PEXT)
31441   NODE_NAME_CASE(MUL_IMM)
31442   NODE_NAME_CASE(MOVMSK)
31443   NODE_NAME_CASE(PTEST)
31444   NODE_NAME_CASE(TESTP)
31445   NODE_NAME_CASE(KORTEST)
31446   NODE_NAME_CASE(KTEST)
31447   NODE_NAME_CASE(KADD)
31448   NODE_NAME_CASE(KSHIFTL)
31449   NODE_NAME_CASE(KSHIFTR)
31450   NODE_NAME_CASE(PACKSS)
31451   NODE_NAME_CASE(PACKUS)
31452   NODE_NAME_CASE(PALIGNR)
31453   NODE_NAME_CASE(VALIGN)
31454   NODE_NAME_CASE(VSHLD)
31455   NODE_NAME_CASE(VSHRD)
31456   NODE_NAME_CASE(VSHLDV)
31457   NODE_NAME_CASE(VSHRDV)
31458   NODE_NAME_CASE(PSHUFD)
31459   NODE_NAME_CASE(PSHUFHW)
31460   NODE_NAME_CASE(PSHUFLW)
31461   NODE_NAME_CASE(SHUFP)
31462   NODE_NAME_CASE(SHUF128)
31463   NODE_NAME_CASE(MOVLHPS)
31464   NODE_NAME_CASE(MOVHLPS)
31465   NODE_NAME_CASE(MOVDDUP)
31466   NODE_NAME_CASE(MOVSHDUP)
31467   NODE_NAME_CASE(MOVSLDUP)
31468   NODE_NAME_CASE(MOVSD)
31469   NODE_NAME_CASE(MOVSS)
31470   NODE_NAME_CASE(UNPCKL)
31471   NODE_NAME_CASE(UNPCKH)
31472   NODE_NAME_CASE(VBROADCAST)
31473   NODE_NAME_CASE(VBROADCAST_LOAD)
31474   NODE_NAME_CASE(VBROADCASTM)
31475   NODE_NAME_CASE(SUBV_BROADCAST_LOAD)
31476   NODE_NAME_CASE(VPERMILPV)
31477   NODE_NAME_CASE(VPERMILPI)
31478   NODE_NAME_CASE(VPERM2X128)
31479   NODE_NAME_CASE(VPERMV)
31480   NODE_NAME_CASE(VPERMV3)
31481   NODE_NAME_CASE(VPERMI)
31482   NODE_NAME_CASE(VPTERNLOG)
31483   NODE_NAME_CASE(VFIXUPIMM)
31484   NODE_NAME_CASE(VFIXUPIMM_SAE)
31485   NODE_NAME_CASE(VFIXUPIMMS)
31486   NODE_NAME_CASE(VFIXUPIMMS_SAE)
31487   NODE_NAME_CASE(VRANGE)
31488   NODE_NAME_CASE(VRANGE_SAE)
31489   NODE_NAME_CASE(VRANGES)
31490   NODE_NAME_CASE(VRANGES_SAE)
31491   NODE_NAME_CASE(PMULUDQ)
31492   NODE_NAME_CASE(PMULDQ)
31493   NODE_NAME_CASE(PSADBW)
31494   NODE_NAME_CASE(DBPSADBW)
31495   NODE_NAME_CASE(VASTART_SAVE_XMM_REGS)
31496   NODE_NAME_CASE(VAARG_64)
31497   NODE_NAME_CASE(VAARG_X32)
31498   NODE_NAME_CASE(WIN_ALLOCA)
31499   NODE_NAME_CASE(MEMBARRIER)
31500   NODE_NAME_CASE(MFENCE)
31501   NODE_NAME_CASE(SEG_ALLOCA)
31502   NODE_NAME_CASE(PROBED_ALLOCA)
31503   NODE_NAME_CASE(RDRAND)
31504   NODE_NAME_CASE(RDSEED)
31505   NODE_NAME_CASE(RDPKRU)
31506   NODE_NAME_CASE(WRPKRU)
31507   NODE_NAME_CASE(VPMADDUBSW)
31508   NODE_NAME_CASE(VPMADDWD)
31509   NODE_NAME_CASE(VPSHA)
31510   NODE_NAME_CASE(VPSHL)
31511   NODE_NAME_CASE(VPCOM)
31512   NODE_NAME_CASE(VPCOMU)
31513   NODE_NAME_CASE(VPERMIL2)
31514   NODE_NAME_CASE(FMSUB)
31515   NODE_NAME_CASE(STRICT_FMSUB)
31516   NODE_NAME_CASE(FNMADD)
31517   NODE_NAME_CASE(STRICT_FNMADD)
31518   NODE_NAME_CASE(FNMSUB)
31519   NODE_NAME_CASE(STRICT_FNMSUB)
31520   NODE_NAME_CASE(FMADDSUB)
31521   NODE_NAME_CASE(FMSUBADD)
31522   NODE_NAME_CASE(FMADD_RND)
31523   NODE_NAME_CASE(FNMADD_RND)
31524   NODE_NAME_CASE(FMSUB_RND)
31525   NODE_NAME_CASE(FNMSUB_RND)
31526   NODE_NAME_CASE(FMADDSUB_RND)
31527   NODE_NAME_CASE(FMSUBADD_RND)
31528   NODE_NAME_CASE(VPMADD52H)
31529   NODE_NAME_CASE(VPMADD52L)
31530   NODE_NAME_CASE(VRNDSCALE)
31531   NODE_NAME_CASE(STRICT_VRNDSCALE)
31532   NODE_NAME_CASE(VRNDSCALE_SAE)
31533   NODE_NAME_CASE(VRNDSCALES)
31534   NODE_NAME_CASE(VRNDSCALES_SAE)
31535   NODE_NAME_CASE(VREDUCE)
31536   NODE_NAME_CASE(VREDUCE_SAE)
31537   NODE_NAME_CASE(VREDUCES)
31538   NODE_NAME_CASE(VREDUCES_SAE)
31539   NODE_NAME_CASE(VGETMANT)
31540   NODE_NAME_CASE(VGETMANT_SAE)
31541   NODE_NAME_CASE(VGETMANTS)
31542   NODE_NAME_CASE(VGETMANTS_SAE)
31543   NODE_NAME_CASE(PCMPESTR)
31544   NODE_NAME_CASE(PCMPISTR)
31545   NODE_NAME_CASE(XTEST)
31546   NODE_NAME_CASE(COMPRESS)
31547   NODE_NAME_CASE(EXPAND)
31548   NODE_NAME_CASE(SELECTS)
31549   NODE_NAME_CASE(ADDSUB)
31550   NODE_NAME_CASE(RCP14)
31551   NODE_NAME_CASE(RCP14S)
31552   NODE_NAME_CASE(RCP28)
31553   NODE_NAME_CASE(RCP28_SAE)
31554   NODE_NAME_CASE(RCP28S)
31555   NODE_NAME_CASE(RCP28S_SAE)
31556   NODE_NAME_CASE(EXP2)
31557   NODE_NAME_CASE(EXP2_SAE)
31558   NODE_NAME_CASE(RSQRT14)
31559   NODE_NAME_CASE(RSQRT14S)
31560   NODE_NAME_CASE(RSQRT28)
31561   NODE_NAME_CASE(RSQRT28_SAE)
31562   NODE_NAME_CASE(RSQRT28S)
31563   NODE_NAME_CASE(RSQRT28S_SAE)
31564   NODE_NAME_CASE(FADD_RND)
31565   NODE_NAME_CASE(FADDS)
31566   NODE_NAME_CASE(FADDS_RND)
31567   NODE_NAME_CASE(FSUB_RND)
31568   NODE_NAME_CASE(FSUBS)
31569   NODE_NAME_CASE(FSUBS_RND)
31570   NODE_NAME_CASE(FMUL_RND)
31571   NODE_NAME_CASE(FMULS)
31572   NODE_NAME_CASE(FMULS_RND)
31573   NODE_NAME_CASE(FDIV_RND)
31574   NODE_NAME_CASE(FDIVS)
31575   NODE_NAME_CASE(FDIVS_RND)
31576   NODE_NAME_CASE(FSQRT_RND)
31577   NODE_NAME_CASE(FSQRTS)
31578   NODE_NAME_CASE(FSQRTS_RND)
31579   NODE_NAME_CASE(FGETEXP)
31580   NODE_NAME_CASE(FGETEXP_SAE)
31581   NODE_NAME_CASE(FGETEXPS)
31582   NODE_NAME_CASE(FGETEXPS_SAE)
31583   NODE_NAME_CASE(SCALEF)
31584   NODE_NAME_CASE(SCALEF_RND)
31585   NODE_NAME_CASE(SCALEFS)
31586   NODE_NAME_CASE(SCALEFS_RND)
31587   NODE_NAME_CASE(AVG)
31588   NODE_NAME_CASE(MULHRS)
31589   NODE_NAME_CASE(SINT_TO_FP_RND)
31590   NODE_NAME_CASE(UINT_TO_FP_RND)
31591   NODE_NAME_CASE(CVTTP2SI)
31592   NODE_NAME_CASE(CVTTP2UI)
31593   NODE_NAME_CASE(STRICT_CVTTP2SI)
31594   NODE_NAME_CASE(STRICT_CVTTP2UI)
31595   NODE_NAME_CASE(MCVTTP2SI)
31596   NODE_NAME_CASE(MCVTTP2UI)
31597   NODE_NAME_CASE(CVTTP2SI_SAE)
31598   NODE_NAME_CASE(CVTTP2UI_SAE)
31599   NODE_NAME_CASE(CVTTS2SI)
31600   NODE_NAME_CASE(CVTTS2UI)
31601   NODE_NAME_CASE(CVTTS2SI_SAE)
31602   NODE_NAME_CASE(CVTTS2UI_SAE)
31603   NODE_NAME_CASE(CVTSI2P)
31604   NODE_NAME_CASE(CVTUI2P)
31605   NODE_NAME_CASE(STRICT_CVTSI2P)
31606   NODE_NAME_CASE(STRICT_CVTUI2P)
31607   NODE_NAME_CASE(MCVTSI2P)
31608   NODE_NAME_CASE(MCVTUI2P)
31609   NODE_NAME_CASE(VFPCLASS)
31610   NODE_NAME_CASE(VFPCLASSS)
31611   NODE_NAME_CASE(MULTISHIFT)
31612   NODE_NAME_CASE(SCALAR_SINT_TO_FP)
31613   NODE_NAME_CASE(SCALAR_SINT_TO_FP_RND)
31614   NODE_NAME_CASE(SCALAR_UINT_TO_FP)
31615   NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND)
31616   NODE_NAME_CASE(CVTPS2PH)
31617   NODE_NAME_CASE(STRICT_CVTPS2PH)
31618   NODE_NAME_CASE(MCVTPS2PH)
31619   NODE_NAME_CASE(CVTPH2PS)
31620   NODE_NAME_CASE(STRICT_CVTPH2PS)
31621   NODE_NAME_CASE(CVTPH2PS_SAE)
31622   NODE_NAME_CASE(CVTP2SI)
31623   NODE_NAME_CASE(CVTP2UI)
31624   NODE_NAME_CASE(MCVTP2SI)
31625   NODE_NAME_CASE(MCVTP2UI)
31626   NODE_NAME_CASE(CVTP2SI_RND)
31627   NODE_NAME_CASE(CVTP2UI_RND)
31628   NODE_NAME_CASE(CVTS2SI)
31629   NODE_NAME_CASE(CVTS2UI)
31630   NODE_NAME_CASE(CVTS2SI_RND)
31631   NODE_NAME_CASE(CVTS2UI_RND)
31632   NODE_NAME_CASE(CVTNE2PS2BF16)
31633   NODE_NAME_CASE(CVTNEPS2BF16)
31634   NODE_NAME_CASE(MCVTNEPS2BF16)
31635   NODE_NAME_CASE(DPBF16PS)
31636   NODE_NAME_CASE(LWPINS)
31637   NODE_NAME_CASE(MGATHER)
31638   NODE_NAME_CASE(MSCATTER)
31639   NODE_NAME_CASE(VPDPBUSD)
31640   NODE_NAME_CASE(VPDPBUSDS)
31641   NODE_NAME_CASE(VPDPWSSD)
31642   NODE_NAME_CASE(VPDPWSSDS)
31643   NODE_NAME_CASE(VPSHUFBITQMB)
31644   NODE_NAME_CASE(GF2P8MULB)
31645   NODE_NAME_CASE(GF2P8AFFINEQB)
31646   NODE_NAME_CASE(GF2P8AFFINEINVQB)
31647   NODE_NAME_CASE(NT_CALL)
31648   NODE_NAME_CASE(NT_BRIND)
31649   NODE_NAME_CASE(UMWAIT)
31650   NODE_NAME_CASE(TPAUSE)
31651   NODE_NAME_CASE(ENQCMD)
31652   NODE_NAME_CASE(ENQCMDS)
31653   NODE_NAME_CASE(VP2INTERSECT)
31654   NODE_NAME_CASE(AESENC128KL)
31655   NODE_NAME_CASE(AESDEC128KL)
31656   NODE_NAME_CASE(AESENC256KL)
31657   NODE_NAME_CASE(AESDEC256KL)
31658   NODE_NAME_CASE(AESENCWIDE128KL)
31659   NODE_NAME_CASE(AESDECWIDE128KL)
31660   NODE_NAME_CASE(AESENCWIDE256KL)
31661   NODE_NAME_CASE(AESDECWIDE256KL)
31662   NODE_NAME_CASE(TESTUI)
31663   }
31664   return nullptr;
31665 #undef NODE_NAME_CASE
31666 }
31667 
31668 /// Return true if the addressing mode represented by AM is legal for this
31669 /// target, for a load/store of the specified type.
31670 bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
31671                                               const AddrMode &AM, Type *Ty,
31672                                               unsigned AS,
31673                                               Instruction *I) const {
31674   // X86 supports extremely general addressing modes.
31675   CodeModel::Model M = getTargetMachine().getCodeModel();
31676 
31677   // X86 allows a sign-extended 32-bit immediate field as a displacement.
31678   if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
31679     return false;
31680 
31681   if (AM.BaseGV) {
31682     unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
31683 
31684     // If a reference to this global requires an extra load, we can't fold it.
31685     if (isGlobalStubReference(GVFlags))
31686       return false;
31687 
31688     // If BaseGV requires a register for the PIC base, we cannot also have a
31689     // BaseReg specified.
31690     if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
31691       return false;
31692 
31693     // If lower 4G is not available, then we must use rip-relative addressing.
31694     if ((M != CodeModel::Small || isPositionIndependent()) &&
31695         Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
31696       return false;
31697   }
31698 
31699   switch (AM.Scale) {
31700   case 0:
31701   case 1:
31702   case 2:
31703   case 4:
31704   case 8:
31705     // These scales always work.
31706     break;
31707   case 3:
31708   case 5:
31709   case 9:
31710     // These scales are formed with basereg+scalereg.  Only accept if there is
31711     // no basereg yet.
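    // For example, scale 3 can be matched as index + 2*index, as in
    // "lea (%reg,%reg,2), ...", which occupies the base-register slot.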
31712     if (AM.HasBaseReg)
31713       return false;
31714     break;
31715   default:  // Other stuff never works.
31716     return false;
31717   }
31718 
31719   return true;
31720 }
31721 
31722 bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
31723   unsigned Bits = Ty->getScalarSizeInBits();
31724 
31725   // 8-bit shifts are always expensive, but versions with a scalar amount aren't
31726   // particularly cheaper than those without.
31727   if (Bits == 8)
31728     return false;
31729 
31730   // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
31731   // Splitting for v32i8/v16i16 on XOP+AVX2 targets is still preferred.
31732   if (Subtarget.hasXOP() &&
31733       (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
31734     return false;
31735 
31736   // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
31737   // shifts just as cheap as scalar ones.
31738   if (Subtarget.hasAVX2() && (Bits == 32 || Bits == 64))
31739     return false;
31740 
31741   // AVX512BW has shifts such as vpsllvw.
31742   if (Subtarget.hasBWI() && Bits == 16)
31743       return false;
31744 
31745   // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
31746   // fully general vector.
31747   return true;
31748 }
31749 
31750 bool X86TargetLowering::isBinOp(unsigned Opcode) const {
31751   switch (Opcode) {
31752   // These are non-commutative binops.
31753   // TODO: Add more X86ISD opcodes once we have test coverage.
31754   case X86ISD::ANDNP:
31755   case X86ISD::PCMPGT:
31756   case X86ISD::FMAX:
31757   case X86ISD::FMIN:
31758   case X86ISD::FANDN:
31759     return true;
31760   }
31761 
31762   return TargetLoweringBase::isBinOp(Opcode);
31763 }
31764 
31765 bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const {
31766   switch (Opcode) {
31767   // TODO: Add more X86ISD opcodes once we have test coverage.
31768   case X86ISD::PCMPEQ:
31769   case X86ISD::PMULDQ:
31770   case X86ISD::PMULUDQ:
31771   case X86ISD::FMAXC:
31772   case X86ISD::FMINC:
31773   case X86ISD::FAND:
31774   case X86ISD::FOR:
31775   case X86ISD::FXOR:
31776     return true;
31777   }
31778 
31779   return TargetLoweringBase::isCommutativeBinOp(Opcode);
31780 }
31781 
31782 bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
31783   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
31784     return false;
31785   unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
31786   unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
31787   return NumBits1 > NumBits2;
31788 }
31789 
31790 bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
31791   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
31792     return false;
31793 
31794   if (!isTypeLegal(EVT::getEVT(Ty1)))
31795     return false;
31796 
31797   assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
31798 
31799   // Assuming the caller doesn't have a zeroext or signext return parameter,
31800   // truncation all the way down to i1 is valid.
31801   return true;
31802 }
31803 
31804 bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
31805   return isInt<32>(Imm);
31806 }
31807 
31808 bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
31809   // Can also use sub to handle negated immediates.
31810   return isInt<32>(Imm);
31811 }
31812 
31813 bool X86TargetLowering::isLegalStoreImmediate(int64_t Imm) const {
31814   return isInt<32>(Imm);
31815 }
31816 
31817 bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
31818   if (!VT1.isScalarInteger() || !VT2.isScalarInteger())
31819     return false;
31820   unsigned NumBits1 = VT1.getSizeInBits();
31821   unsigned NumBits2 = VT2.getSizeInBits();
31822   return NumBits1 > NumBits2;
31823 }
31824 
31825 bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
31826   // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
31827   return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
31828 }
31829 
31830 bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
31831   // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
31832   return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
31833 }
31834 
31835 bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
31836   EVT VT1 = Val.getValueType();
31837   if (isZExtFree(VT1, VT2))
31838     return true;
31839 
31840   if (Val.getOpcode() != ISD::LOAD)
31841     return false;
31842 
31843   if (!VT1.isSimple() || !VT1.isInteger() ||
31844       !VT2.isSimple() || !VT2.isInteger())
31845     return false;
31846 
31847   switch (VT1.getSimpleVT().SimpleTy) {
31848   default: break;
31849   case MVT::i8:
31850   case MVT::i16:
31851   case MVT::i32:
31852     // X86 has 8, 16, and 32-bit zero-extending loads.
31853     return true;
31854   }
31855 
31856   return false;
31857 }
31858 
31859 bool X86TargetLowering::shouldSinkOperands(Instruction *I,
31860                                            SmallVectorImpl<Use *> &Ops) const {
31861   // A uniform shift amount in a vector shift or funnel shift may be much
31862   // cheaper than a generic variable vector shift, so make that pattern visible
31863   // to SDAG by sinking the shuffle instruction next to the shift.
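  // A hypothetical IR shape this targets (illustrative only):
  //   %amt = shufflevector <4 x i32> %a, <4 x i32> poison, <4 x i32> zeroinitializer
  //   %r   = shl <4 x i32> %x, %amt
  // Sinking the splat next to the shift lets ISel see a shift-by-scalar pattern.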
31864   int ShiftAmountOpNum = -1;
31865   if (I->isShift())
31866     ShiftAmountOpNum = 1;
31867   else if (auto *II = dyn_cast<IntrinsicInst>(I)) {
31868     if (II->getIntrinsicID() == Intrinsic::fshl ||
31869         II->getIntrinsicID() == Intrinsic::fshr)
31870       ShiftAmountOpNum = 2;
31871   }
31872 
31873   if (ShiftAmountOpNum == -1)
31874     return false;
31875 
31876   auto *Shuf = dyn_cast<ShuffleVectorInst>(I->getOperand(ShiftAmountOpNum));
31877   if (Shuf && getSplatIndex(Shuf->getShuffleMask()) >= 0 &&
31878       isVectorShiftByScalarCheap(I->getType())) {
31879     Ops.push_back(&I->getOperandUse(ShiftAmountOpNum));
31880     return true;
31881   }
31882 
31883   return false;
31884 }
31885 
31886 bool X86TargetLowering::shouldConvertPhiType(Type *From, Type *To) const {
31887   if (!Subtarget.is64Bit())
31888     return false;
31889   return TargetLowering::shouldConvertPhiType(From, To);
31890 }
31891 
31892 bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
31893   if (isa<MaskedLoadSDNode>(ExtVal.getOperand(0)))
31894     return false;
31895 
31896   EVT SrcVT = ExtVal.getOperand(0).getValueType();
31897 
31898   // There is no extending load for vXi1.
31899   if (SrcVT.getScalarType() == MVT::i1)
31900     return false;
31901 
31902   return true;
31903 }
31904 
31905 bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
31906                                                    EVT VT) const {
31907   if (!Subtarget.hasAnyFMA())
31908     return false;
31909 
31910   VT = VT.getScalarType();
31911 
31912   if (!VT.isSimple())
31913     return false;
31914 
31915   switch (VT.getSimpleVT().SimpleTy) {
31916   case MVT::f32:
31917   case MVT::f64:
31918     return true;
31919   default:
31920     break;
31921   }
31922 
31923   return false;
31924 }
31925 
31926 bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
31927   // i16 instructions are longer (0x66 prefix) and potentially slower.
31928   return !(VT1 == MVT::i32 && VT2 == MVT::i16);
31929 }
31930 
31931 /// Targets can use this to indicate that they only support *some*
31932 /// VECTOR_SHUFFLE operations, those with specific masks.
31933 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
31934 /// are assumed to be legal.
31935 bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const {
31936   if (!VT.isSimple())
31937     return false;
31938 
31939   // Not for i1 vectors
31940   if (VT.getSimpleVT().getScalarType() == MVT::i1)
31941     return false;
31942 
31943   // Very little shuffling can be done for 64-bit vectors right now.
31944   if (VT.getSimpleVT().getSizeInBits() == 64)
31945     return false;
31946 
31947   // We only care that the types being shuffled are legal. The lowering can
31948   // handle any possible shuffle mask that results.
31949   return isTypeLegal(VT.getSimpleVT());
31950 }
31951 
31952 bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask,
31953                                                EVT VT) const {
31954   // Don't convert an 'and' into a shuffle that we don't directly support.
31955   // vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
31956   if (!Subtarget.hasAVX2())
31957     if (VT == MVT::v32i8 || VT == MVT::v16i16)
31958       return false;
31959 
31960   // Just delegate to the generic legality, clear masks aren't special.
31961   return isShuffleMaskLegal(Mask, VT);
31962 }
31963 
31964 bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
31965   // If the subtarget is using thunks, we must not generate jump tables.
31966   if (Subtarget.useIndirectThunkBranches())
31967     return false;
31968 
31969   // Otherwise, fall back on the generic logic.
31970   return TargetLowering::areJTsAllowed(Fn);
31971 }
31972 
31973 //===----------------------------------------------------------------------===//
31974 //                           X86 Scheduler Hooks
31975 //===----------------------------------------------------------------------===//
31976 
31977 // Returns true if EFLAG is consumed after this iterator in the rest of the
31978 // basic block or any successors of the basic block.
31979 static bool isEFLAGSLiveAfter(MachineBasicBlock::iterator Itr,
31980                               MachineBasicBlock *BB) {
31981   // Scan forward through BB for a use/def of EFLAGS.
31982   for (MachineBasicBlock::iterator miI = std::next(Itr), miE = BB->end();
31983          miI != miE; ++miI) {
31984     const MachineInstr& mi = *miI;
31985     if (mi.readsRegister(X86::EFLAGS))
31986       return true;
31987     // If we found a def, we can stop searching.
31988     if (mi.definesRegister(X86::EFLAGS))
31989       return false;
31990   }
31991 
31992   // If we hit the end of the block, check whether EFLAGS is live into a
31993   // successor.
31994   for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
31995                                         sEnd = BB->succ_end();
31996        sItr != sEnd; ++sItr) {
31997     MachineBasicBlock* succ = *sItr;
31998     if (succ->isLiveIn(X86::EFLAGS))
31999       return true;
32000   }
32001 
32002   return false;
32003 }
32004 
32005 /// Utility function to emit xbegin specifying the start of an RTM region.
32006 static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
32007                                      const TargetInstrInfo *TII) {
32008   const DebugLoc &DL = MI.getDebugLoc();
32009 
32010   const BasicBlock *BB = MBB->getBasicBlock();
32011   MachineFunction::iterator I = ++MBB->getIterator();
32012 
32013   // For the v = xbegin(), we generate
32014   //
32015   // thisMBB:
32016   //  xbegin sinkMBB
32017   //
32018   // mainMBB:
32019   //  s0 = -1
32020   //
32021   // fallBB:
32022   //  eax = # XABORT_DEF
32023   //  s1 = eax
32024   //
32025   // sinkMBB:
32026   //  v = phi(s0/mainBB, s1/fallBB)
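  //
  // XBEGIN takes the fallback address as its operand: on a transactional
  // abort the hardware branches there with the abort status in EAX, which is
  // why fallMBB copies EAX into the result. On the fall-through path the
  // result is -1, matching the _XBEGIN_STARTED convention of _xbegin().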
32027 
32028   MachineBasicBlock *thisMBB = MBB;
32029   MachineFunction *MF = MBB->getParent();
32030   MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
32031   MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
32032   MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
32033   MF->insert(I, mainMBB);
32034   MF->insert(I, fallMBB);
32035   MF->insert(I, sinkMBB);
32036 
32037   if (isEFLAGSLiveAfter(MI, MBB)) {
32038     mainMBB->addLiveIn(X86::EFLAGS);
32039     fallMBB->addLiveIn(X86::EFLAGS);
32040     sinkMBB->addLiveIn(X86::EFLAGS);
32041   }
32042 
32043   // Transfer the remainder of BB and its successor edges to sinkMBB.
32044   sinkMBB->splice(sinkMBB->begin(), MBB,
32045                   std::next(MachineBasicBlock::iterator(MI)), MBB->end());
32046   sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
32047 
32048   MachineRegisterInfo &MRI = MF->getRegInfo();
32049   Register DstReg = MI.getOperand(0).getReg();
32050   const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
32051   Register mainDstReg = MRI.createVirtualRegister(RC);
32052   Register fallDstReg = MRI.createVirtualRegister(RC);
32053 
32054   // thisMBB:
32055   //  xbegin fallMBB
32056   //  # fallthrough to mainMBB
32057   //  # on abort, branch to fallMBB
32058   BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
32059   thisMBB->addSuccessor(mainMBB);
32060   thisMBB->addSuccessor(fallMBB);
32061 
32062   // mainMBB:
32063   //  mainDstReg := -1
32064   BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
32065   BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
32066   mainMBB->addSuccessor(sinkMBB);
32067 
32068   // fallMBB:
32069   //  ; pseudo instruction to model hardware's definition from XABORT
32070   //  EAX := XABORT_DEF
32071   //  fallDstReg := EAX
32072   BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF));
32073   BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg)
32074       .addReg(X86::EAX);
32075   fallMBB->addSuccessor(sinkMBB);
32076 
32077   // sinkMBB:
32078   //  DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
32079   BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg)
32080       .addReg(mainDstReg).addMBB(mainMBB)
32081       .addReg(fallDstReg).addMBB(fallMBB);
32082 
32083   MI.eraseFromParent();
32084   return sinkMBB;
32085 }
32086 
32087 MachineBasicBlock *
32088 X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
32089                                                MachineBasicBlock *MBB) const {
32090   // Emit va_arg instruction on X86-64.
32091 
32092   // Operands to this pseudo-instruction:
32093   // 0  ) Output        : destination address (reg)
32094   // 1-5) Input         : va_list address (addr, i64mem)
32095   // 6  ) ArgSize       : Size (in bytes) of vararg type
32096   // 7  ) ArgMode       : 0=overflow only, 1=use gp_offset, 2=use fp_offset
32097   // 8  ) Align         : Alignment of type
32098   // 9  ) EFLAGS (implicit-def)
32099 
32100   assert(MI.getNumOperands() == 10 && "VAARG should have 10 operands!");
32101   static_assert(X86::AddrNumOperands == 5, "VAARG assumes 5 address operands");
32102 
32103   Register DestReg = MI.getOperand(0).getReg();
32104   MachineOperand &Base = MI.getOperand(1);
32105   MachineOperand &Scale = MI.getOperand(2);
32106   MachineOperand &Index = MI.getOperand(3);
32107   MachineOperand &Disp = MI.getOperand(4);
32108   MachineOperand &Segment = MI.getOperand(5);
32109   unsigned ArgSize = MI.getOperand(6).getImm();
32110   unsigned ArgMode = MI.getOperand(7).getImm();
32111   Align Alignment = Align(MI.getOperand(8).getImm());
32112 
32113   MachineFunction *MF = MBB->getParent();
32114 
32115   // Memory Reference
32116   assert(MI.hasOneMemOperand() && "Expected VAARG to have one memoperand");
32117 
32118   MachineMemOperand *OldMMO = MI.memoperands().front();
32119 
32120   // Clone the MMO into two separate MMOs for loading and storing
32121   MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand(
32122       OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore);
32123   MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand(
32124       OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad);
32125 
32126   // Machine Information
32127   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
32128   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
32129   const TargetRegisterClass *AddrRegClass =
32130       getRegClassFor(getPointerTy(MBB->getParent()->getDataLayout()));
32131   const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
32132   const DebugLoc &DL = MI.getDebugLoc();
32133 
32134   // struct va_list {
32135   //   i32   gp_offset
32136   //   i32   fp_offset
32137   //   i64   overflow_area (address)
32138   //   i64   reg_save_area (address)
32139   // }
32140   // sizeof(va_list) = 24
32141   // alignment(va_list) = 8
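  //
  // In the SysV x86-64 ABI the reg_save_area starts with 48 bytes for the six
  // integer argument registers (8 bytes each) followed by 128 bytes for the
  // eight vector argument registers (16 bytes each), so gp_offset ranges over
  // [0, 48) and fp_offset over [48, 176).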
32142 
32143   unsigned TotalNumIntRegs = 6;
32144   unsigned TotalNumXMMRegs = 8;
32145   bool UseGPOffset = (ArgMode == 1);
32146   bool UseFPOffset = (ArgMode == 2);
32147   unsigned MaxOffset = TotalNumIntRegs * 8 +
32148                        (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
32149 
32150   // Align ArgSize to a multiple of 8.
32151   unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
32152   bool NeedsAlign = (Alignment > 8);
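  // MaxOffset is the end of the register save area: 48 bytes of GPR slots,
  // plus 8 * 16 bytes of XMM slots when fp_offset is used (48 + 128 = 176).
  // ArgSizeA8 rounds the argument size up to a multiple of 8; e.g. a 12-byte
  // argument occupies 16 bytes.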
32153 
32154   MachineBasicBlock *thisMBB = MBB;
32155   MachineBasicBlock *overflowMBB;
32156   MachineBasicBlock *offsetMBB;
32157   MachineBasicBlock *endMBB;
32158 
32159   unsigned OffsetDestReg = 0;    // Argument address computed by offsetMBB
32160   unsigned OverflowDestReg = 0;  // Argument address computed by overflowMBB
32161   unsigned OffsetReg = 0;
32162 
32163   if (!UseGPOffset && !UseFPOffset) {
32164     // If we only pull from the overflow region, we don't create a branch.
32165     // We don't need to alter control flow.
32166     OffsetDestReg = 0; // unused
32167     OverflowDestReg = DestReg;
32168 
32169     offsetMBB = nullptr;
32170     overflowMBB = thisMBB;
32171     endMBB = thisMBB;
32172   } else {
32173     // First emit code to check if gp_offset (or fp_offset) is below the bound.
32174     // If so, pull the argument from reg_save_area. (branch to offsetMBB)
32175     // If not, pull from overflow_area. (branch to overflowMBB)
32176     //
32177     //       thisMBB
32178     //         |     .
32179     //         |        .
32180     //     offsetMBB   overflowMBB
32181     //         |        .
32182     //         |     .
32183     //        endMBB
32184 
32185     // Registers for the PHI in endMBB
32186     OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
32187     OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
32188 
32189     const BasicBlock *LLVM_BB = MBB->getBasicBlock();
32190     overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
32191     offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
32192     endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
32193 
32194     MachineFunction::iterator MBBIter = ++MBB->getIterator();
32195 
32196     // Insert the new basic blocks
32197     MF->insert(MBBIter, offsetMBB);
32198     MF->insert(MBBIter, overflowMBB);
32199     MF->insert(MBBIter, endMBB);
32200 
32201     // Transfer the remainder of MBB and its successor edges to endMBB.
32202     endMBB->splice(endMBB->begin(), thisMBB,
32203                    std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
32204     endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
32205 
32206     // Make offsetMBB and overflowMBB successors of thisMBB
32207     thisMBB->addSuccessor(offsetMBB);
32208     thisMBB->addSuccessor(overflowMBB);
32209 
32210     // endMBB is a successor of both offsetMBB and overflowMBB
32211     offsetMBB->addSuccessor(endMBB);
32212     overflowMBB->addSuccessor(endMBB);
32213 
32214     // Load the offset value into a register
32215     OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
32216     BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
32217         .add(Base)
32218         .add(Scale)
32219         .add(Index)
32220         .addDisp(Disp, UseFPOffset ? 4 : 0)
32221         .add(Segment)
32222         .setMemRefs(LoadOnlyMMO);
32223 
32224     // Check if there is enough room left to pull this argument.
32225     BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
32226       .addReg(OffsetReg)
32227       .addImm(MaxOffset + 8 - ArgSizeA8);
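    // That is, take the overflow path unless OffsetReg + ArgSizeA8 <= MaxOffset
    // (all quantities are multiples of 8), i.e. unless the whole argument still
    // fits in the register save area.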
32228 
32229     // Branch to "overflowMBB" if offset >= max
32230     // Fall through to "offsetMBB" otherwise
32231     BuildMI(thisMBB, DL, TII->get(X86::JCC_1))
32232       .addMBB(overflowMBB).addImm(X86::COND_AE);
32233   }
32234 
32235   // In offsetMBB, emit code to use the reg_save_area.
32236   if (offsetMBB) {
32237     assert(OffsetReg != 0);
32238 
32239     // Read the reg_save_area address.
32240     Register RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
32241     BuildMI(
32242         offsetMBB, DL,
32243         TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
32244         RegSaveReg)
32245         .add(Base)
32246         .add(Scale)
32247         .add(Index)
32248         .addDisp(Disp, Subtarget.isTarget64BitLP64() ? 16 : 12)
32249         .add(Segment)
32250         .setMemRefs(LoadOnlyMMO);
32251 
32252     if (Subtarget.isTarget64BitLP64()) {
32253       // Zero-extend the offset
32254       Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
32255       BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
32256           .addImm(0)
32257           .addReg(OffsetReg)
32258           .addImm(X86::sub_32bit);
32259 
32260       // Add the offset to the reg_save_area to get the final address.
32261       BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
32262           .addReg(OffsetReg64)
32263           .addReg(RegSaveReg);
32264     } else {
32265       // Add the offset to the reg_save_area to get the final address.
32266       BuildMI(offsetMBB, DL, TII->get(X86::ADD32rr), OffsetDestReg)
32267           .addReg(OffsetReg)
32268           .addReg(RegSaveReg);
32269     }
32270 
32271     // Compute the offset for the next argument
32272     Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
32273     BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
32274       .addReg(OffsetReg)
32275       .addImm(UseFPOffset ? 16 : 8);
32276 
32277     // Store it back into the va_list.
32278     BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
32279         .add(Base)
32280         .add(Scale)
32281         .add(Index)
32282         .addDisp(Disp, UseFPOffset ? 4 : 0)
32283         .add(Segment)
32284         .addReg(NextOffsetReg)
32285         .setMemRefs(StoreOnlyMMO);
32286 
32287     // Jump to endMBB
32288     BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
32289       .addMBB(endMBB);
32290   }
32291 
32292   //
32293   // Emit code to use overflow area
32294   //
32295 
32296   // Load the overflow_area address into a register.
32297   Register OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
32298   BuildMI(overflowMBB, DL,
32299           TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
32300           OverflowAddrReg)
32301       .add(Base)
32302       .add(Scale)
32303       .add(Index)
32304       .addDisp(Disp, 8)
32305       .add(Segment)
32306       .setMemRefs(LoadOnlyMMO);
32307 
32308   // If we need to align it, do so. Otherwise, just copy the address
32309   // to OverflowDestReg.
32310   if (NeedsAlign) {
32311     // Align the overflow address
32312     Register TmpReg = MRI.createVirtualRegister(AddrRegClass);
32313 
32314     // aligned_addr = (addr + (align-1)) & ~(align-1)
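    // e.g. for a 16-byte alignment, addr = 0x1008 becomes
    // (0x1008 + 15) & ~15 = 0x1010.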
32315     BuildMI(
32316         overflowMBB, DL,
32317         TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
32318         TmpReg)
32319         .addReg(OverflowAddrReg)
32320         .addImm(Alignment.value() - 1);
32321 
32322     BuildMI(
32323         overflowMBB, DL,
32324         TII->get(Subtarget.isTarget64BitLP64() ? X86::AND64ri32 : X86::AND32ri),
32325         OverflowDestReg)
32326         .addReg(TmpReg)
32327         .addImm(~(uint64_t)(Alignment.value() - 1));
32328   } else {
32329     BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
32330       .addReg(OverflowAddrReg);
32331   }
32332 
32333   // Compute the next overflow address after this argument.
32334   // (the overflow address should be kept 8-byte aligned)
32335   Register NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
32336   BuildMI(
32337       overflowMBB, DL,
32338       TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
32339       NextAddrReg)
32340       .addReg(OverflowDestReg)
32341       .addImm(ArgSizeA8);
32342 
32343   // Store the new overflow address.
32344   BuildMI(overflowMBB, DL,
32345           TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64mr : X86::MOV32mr))
32346       .add(Base)
32347       .add(Scale)
32348       .add(Index)
32349       .addDisp(Disp, 8)
32350       .add(Segment)
32351       .addReg(NextAddrReg)
32352       .setMemRefs(StoreOnlyMMO);
32353 
32354   // If we branched, emit the PHI to the front of endMBB.
32355   if (offsetMBB) {
32356     BuildMI(*endMBB, endMBB->begin(), DL,
32357             TII->get(X86::PHI), DestReg)
32358       .addReg(OffsetDestReg).addMBB(offsetMBB)
32359       .addReg(OverflowDestReg).addMBB(overflowMBB);
32360   }
32361 
32362   // Erase the pseudo instruction
32363   MI.eraseFromParent();
32364 
32365   return endMBB;
32366 }
32367 
32368 // The EFLAGS operand of SelectItr might be missing a kill marker
32369 // because there were multiple uses of EFLAGS, and ISel didn't know
32370 // which to mark. Figure out whether SelectItr should have had a
32371 // kill marker, and set it if it should. Returns the correct kill
32372 // marker value.
32373 static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
32374                                      MachineBasicBlock* BB,
32375                                      const TargetRegisterInfo* TRI) {
32376   if (isEFLAGSLiveAfter(SelectItr, BB))
32377     return false;
32378 
32379   // We found a def, or hit the end of the basic block and EFLAGS wasn't live
32380   // out. SelectMI should have a kill flag on EFLAGS.
32381   SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
32382   return true;
32383 }
32384 
32385 // Return true if it is OK for this CMOV pseudo-opcode to be cascaded
32386 // together with other CMOV pseudo-opcodes into a single basic-block with
32387 // conditional jump around it.
32388 static bool isCMOVPseudo(MachineInstr &MI) {
32389   switch (MI.getOpcode()) {
32390   case X86::CMOV_FR32:
32391   case X86::CMOV_FR32X:
32392   case X86::CMOV_FR64:
32393   case X86::CMOV_FR64X:
32394   case X86::CMOV_GR8:
32395   case X86::CMOV_GR16:
32396   case X86::CMOV_GR32:
32397   case X86::CMOV_RFP32:
32398   case X86::CMOV_RFP64:
32399   case X86::CMOV_RFP80:
32400   case X86::CMOV_VR64:
32401   case X86::CMOV_VR128:
32402   case X86::CMOV_VR128X:
32403   case X86::CMOV_VR256:
32404   case X86::CMOV_VR256X:
32405   case X86::CMOV_VR512:
32406   case X86::CMOV_VK1:
32407   case X86::CMOV_VK2:
32408   case X86::CMOV_VK4:
32409   case X86::CMOV_VK8:
32410   case X86::CMOV_VK16:
32411   case X86::CMOV_VK32:
32412   case X86::CMOV_VK64:
32413     return true;
32414 
32415   default:
32416     return false;
32417   }
32418 }
32419 
32420 // Helper function, which inserts PHI functions into SinkMBB:
32421 //   %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
32422 // where %FalseValue(i) and %TrueValue(i) are taken from the consecutive CMOVs
32423 // in [MIItBegin, MIItEnd) range. It returns the last MachineInstrBuilder for
32424 // the last PHI function inserted.
32425 static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
32426     MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
32427     MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
32428     MachineBasicBlock *SinkMBB) {
32429   MachineFunction *MF = TrueMBB->getParent();
32430   const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
32431   const DebugLoc &DL = MIItBegin->getDebugLoc();
32432 
32433   X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
32434   X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
32435 
32436   MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
32437 
32438   // As we are creating the PHIs, we have to be careful if there is more than
32439   // one.  Later CMOVs may reference the results of earlier CMOVs, but later
32440   // PHIs have to reference the individual true/false inputs from earlier PHIs.
32441   // That also means that PHI construction must work forward from earlier to
32442   // later, and that the code must maintain a mapping from earlier PHI's
32443   // destination registers, and the registers that went into the PHI.
32444   DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
32445   MachineInstrBuilder MIB;
32446 
32447   for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
32448     Register DestReg = MIIt->getOperand(0).getReg();
32449     Register Op1Reg = MIIt->getOperand(1).getReg();
32450     Register Op2Reg = MIIt->getOperand(2).getReg();
32451 
32452     // If this CMOV we are generating is the opposite condition from
32453     // the jump we generated, then we have to swap the operands for the
32454     // PHI that is going to be generated.
32455     if (MIIt->getOperand(3).getImm() == OppCC)
32456       std::swap(Op1Reg, Op2Reg);
32457 
32458     if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
32459       Op1Reg = RegRewriteTable[Op1Reg].first;
32460 
32461     if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
32462       Op2Reg = RegRewriteTable[Op2Reg].second;
32463 
32464     MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg)
32465               .addReg(Op1Reg)
32466               .addMBB(FalseMBB)
32467               .addReg(Op2Reg)
32468               .addMBB(TrueMBB);
32469 
32470     // Add this PHI to the rewrite table.
32471     RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
32472   }
32473 
32474   return MIB;
32475 }
32476 
32477 // Lower cascaded selects of the form (SecondCMOV (FirstCMOV F, T, cc1), T, cc2).
32478 MachineBasicBlock *
32479 X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
32480                                              MachineInstr &SecondCascadedCMOV,
32481                                              MachineBasicBlock *ThisMBB) const {
32482   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
32483   const DebugLoc &DL = FirstCMOV.getDebugLoc();
32484 
32485   // We lower cascaded CMOVs such as
32486   //
32487   //   (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
32488   //
32489   // to two successive branches.
32490   //
32491   // Without this, we would add a PHI between the two jumps, which ends up
32492   // creating a few copies all around. For instance, for
32493   //
32494   //    (sitofp (zext (fcmp une)))
32495   //
32496   // we would generate:
32497   //
32498   //         ucomiss %xmm1, %xmm0
32499   //         movss  <1.0f>, %xmm0
32500   //         movaps  %xmm0, %xmm1
32501   //         jne     .LBB5_2
32502   //         xorps   %xmm1, %xmm1
32503   // .LBB5_2:
32504   //         jp      .LBB5_4
32505   //         movaps  %xmm1, %xmm0
32506   // .LBB5_4:
32507   //         retq
32508   //
32509   // because this custom-inserter would have generated:
32510   //
32511   //   A
32512   //   | \
32513   //   |  B
32514   //   | /
32515   //   C
32516   //   | \
32517   //   |  D
32518   //   | /
32519   //   E
32520   //
32521   // A: X = ...; Y = ...
32522   // B: empty
32523   // C: Z = PHI [X, A], [Y, B]
32524   // D: empty
32525   // E: PHI [X, C], [Z, D]
32526   //
32527   // If we lower both CMOVs in a single step, we can instead generate:
32528   //
32529   //   A
32530   //   | \
32531   //   |  C
32532   //   | /|
32533   //   |/ |
32534   //   |  |
32535   //   |  D
32536   //   | /
32537   //   E
32538   //
32539   // A: X = ...; Y = ...
32540   // D: empty
32541   // E: PHI [X, A], [X, C], [Y, D]
32542   //
32543   // Which, in our sitofp/fcmp example, gives us something like:
32544   //
32545   //         ucomiss %xmm1, %xmm0
32546   //         movss  <1.0f>, %xmm0
32547   //         jne     .LBB5_4
32548   //         jp      .LBB5_4
32549   //         xorps   %xmm0, %xmm0
32550   // .LBB5_4:
32551   //         retq
32552   //
32553 
32554   // We lower cascaded CMOV into two successive branches to the same block.
32555   // EFLAGS is used by both, so mark it as live in the second.
32556   const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
32557   MachineFunction *F = ThisMBB->getParent();
32558   MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
32559   MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
32560   MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
32561 
32562   MachineFunction::iterator It = ++ThisMBB->getIterator();
32563   F->insert(It, FirstInsertedMBB);
32564   F->insert(It, SecondInsertedMBB);
32565   F->insert(It, SinkMBB);
32566 
32567   // For a cascaded CMOV, we lower it to two successive branches to
32568   // the same block (SinkMBB).  EFLAGS is used by both, so mark it as live in
32569   // the FirstInsertedMBB.
32570   FirstInsertedMBB->addLiveIn(X86::EFLAGS);
32571 
32572   // If the EFLAGS register isn't dead in the terminator, then claim that it's
32573   // live into the sink and copy blocks.
32574   const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
32575   if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS) &&
32576       !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
32577     SecondInsertedMBB->addLiveIn(X86::EFLAGS);
32578     SinkMBB->addLiveIn(X86::EFLAGS);
32579   }
32580 
32581   // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
32582   SinkMBB->splice(SinkMBB->begin(), ThisMBB,
32583                   std::next(MachineBasicBlock::iterator(FirstCMOV)),
32584                   ThisMBB->end());
32585   SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
32586 
32587   // Fallthrough block for ThisMBB.
32588   ThisMBB->addSuccessor(FirstInsertedMBB);
32589   // The true block target of the first branch is always SinkMBB.
32590   ThisMBB->addSuccessor(SinkMBB);
32591   // Fallthrough block for FirstInsertedMBB.
32592   FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
32593   // The true block for the branch of FirstInsertedMBB.
32594   FirstInsertedMBB->addSuccessor(SinkMBB);
32595   // This is fallthrough.
32596   SecondInsertedMBB->addSuccessor(SinkMBB);
32597 
32598   // Create the conditional branch instructions.
32599   X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
32600   BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC);
32601 
32602   X86::CondCode SecondCC =
32603       X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
32604   BuildMI(FirstInsertedMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(SecondCC);
32605 
32606   //  SinkMBB:
32607   //   %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
32608   Register DestReg = FirstCMOV.getOperand(0).getReg();
32609   Register Op1Reg = FirstCMOV.getOperand(1).getReg();
32610   Register Op2Reg = FirstCMOV.getOperand(2).getReg();
32611   MachineInstrBuilder MIB =
32612       BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(X86::PHI), DestReg)
32613           .addReg(Op1Reg)
32614           .addMBB(SecondInsertedMBB)
32615           .addReg(Op2Reg)
32616           .addMBB(ThisMBB);
32617 
32618   // SecondInsertedMBB provides the same incoming value as FirstInsertedMBB
32619   // (the True operand of the SELECT_CC/CMOV nodes).
32620   MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
32621   // Copy the PHI result to the register defined by the second CMOV.
32622   BuildMI(*SinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())), DL,
32623           TII->get(TargetOpcode::COPY),
32624           SecondCascadedCMOV.getOperand(0).getReg())
32625       .addReg(FirstCMOV.getOperand(0).getReg());
32626 
32627   // Now remove the CMOVs.
32628   FirstCMOV.eraseFromParent();
32629   SecondCascadedCMOV.eraseFromParent();
32630 
32631   return SinkMBB;
32632 }
32633 
32634 MachineBasicBlock *
32635 X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
32636                                      MachineBasicBlock *ThisMBB) const {
32637   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
32638   const DebugLoc &DL = MI.getDebugLoc();
32639 
32640   // To "insert" a SELECT_CC instruction, we actually have to insert the
32641   // diamond control-flow pattern.  The incoming instruction knows the
32642   // destination vreg to set, the condition code register to branch on, the
32643   // true/false values to select between and a branch opcode to use.
32644 
32645   //  ThisMBB:
32646   //  ...
32647   //   TrueVal = ...
32648   //   cmpTY ccX, r1, r2
32649   //   bCC copy1MBB
32650   //   fallthrough --> FalseMBB
32651 
32652   // This code lowers all pseudo-CMOV instructions. Generally it lowers these
32653   // as described above, by inserting a BB, and then making a PHI at the join
32654   // point to select the true and false operands of the CMOV in the PHI.
32655   //
32656   // The code also handles two different cases of multiple CMOV opcodes
32657   // in a row.
32658   //
32659   // Case 1:
32660   // In this case, there are multiple CMOVs in a row, all which are based on
32661   // the same condition setting (or the exact opposite condition setting).
32662   // In this case we can lower all the CMOVs using a single inserted BB, and
32663   // then make a number of PHIs at the join point to model the CMOVs. The only
32664   // trickiness here is that in a case like:
32665   //
32666   // t2 = CMOV cond1 t1, f1
32667   // t3 = CMOV cond1 t2, f2
32668   //
32669   // when rewriting this into PHIs, we have to perform some renaming on the
32670   // temps since you cannot have a PHI operand refer to a PHI result earlier
32671   // in the same block.  The "simple" but wrong lowering would be:
32672   //
32673   // t2 = PHI t1(BB1), f1(BB2)
32674   // t3 = PHI t2(BB1), f2(BB2)
32675   //
32676   // but clearly t2 is not defined in BB1, so that is incorrect. The proper
32677   // renaming is to note that on the path through BB1, t2 is really just a
32678   // copy of t1, and do that renaming, properly generating:
32679   //
32680   // t2 = PHI t1(BB1), f1(BB2)
32681   // t3 = PHI t1(BB1), f2(BB2)
32682   //
32683   // Case 2:
32684   // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
32685   // function - EmitLoweredCascadedSelect.
32686 
32687   X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
32688   X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
32689   MachineInstr *LastCMOV = &MI;
32690   MachineBasicBlock::iterator NextMIIt = MachineBasicBlock::iterator(MI);
32691 
32692   // Check for case 1, where there are multiple CMOVs with the same condition
32693   // first.  Of the two cases of multiple CMOV lowerings, case 1 reduces the
32694   // number of jumps the most.
32695 
32696   if (isCMOVPseudo(MI)) {
32697     // See if we have a string of CMOVS with the same condition. Skip over
32698     // intervening debug insts.
32699     while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
32700            (NextMIIt->getOperand(3).getImm() == CC ||
32701             NextMIIt->getOperand(3).getImm() == OppCC)) {
32702       LastCMOV = &*NextMIIt;
32703       NextMIIt = next_nodbg(NextMIIt, ThisMBB->end());
32704     }
32705   }
32706 
32707   // This checks for case 2, but only if we didn't already find case 1,
32708   // as indicated by LastCMOV still pointing at &MI.
32709   if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
32710       NextMIIt->getOpcode() == MI.getOpcode() &&
32711       NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
32712       NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
32713       NextMIIt->getOperand(1).isKill()) {
32714     return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
32715   }
32716 
32717   const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
32718   MachineFunction *F = ThisMBB->getParent();
32719   MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
32720   MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
32721 
32722   MachineFunction::iterator It = ++ThisMBB->getIterator();
32723   F->insert(It, FalseMBB);
32724   F->insert(It, SinkMBB);
32725 
32726   // If the EFLAGS register isn't dead in the terminator, then claim that it's
32727   // live into the sink and copy blocks.
32728   const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
32729   if (!LastCMOV->killsRegister(X86::EFLAGS) &&
32730       !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
32731     FalseMBB->addLiveIn(X86::EFLAGS);
32732     SinkMBB->addLiveIn(X86::EFLAGS);
32733   }
32734 
32735   // Transfer any debug instructions inside the CMOV sequence to the sunk block.
32736   auto DbgEnd = MachineBasicBlock::iterator(LastCMOV);
32737   auto DbgIt = MachineBasicBlock::iterator(MI);
32738   while (DbgIt != DbgEnd) {
32739     auto Next = std::next(DbgIt);
32740     if (DbgIt->isDebugInstr())
32741       SinkMBB->push_back(DbgIt->removeFromParent());
32742     DbgIt = Next;
32743   }
32744 
32745   // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
32746   SinkMBB->splice(SinkMBB->end(), ThisMBB,
32747                   std::next(MachineBasicBlock::iterator(LastCMOV)),
32748                   ThisMBB->end());
32749   SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
32750 
32751   // Fallthrough block for ThisMBB.
32752   ThisMBB->addSuccessor(FalseMBB);
32753   // The true block target of the first (or only) branch is always a SinkMBB.
32754   ThisMBB->addSuccessor(SinkMBB);
32755   // Fallthrough block for FalseMBB.
32756   FalseMBB->addSuccessor(SinkMBB);
32757 
32758   // Create the conditional branch instruction.
32759   BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);
32760 
32761   //  SinkMBB:
32762   //   %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
32763   //  ...
32764   MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
32765   MachineBasicBlock::iterator MIItEnd =
32766       std::next(MachineBasicBlock::iterator(LastCMOV));
32767   createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
32768 
32769   // Now remove the CMOV(s).
32770   ThisMBB->erase(MIItBegin, MIItEnd);
32771 
32772   return SinkMBB;
32773 }
32774 
32775 static unsigned getSUBriOpcode(bool IsLP64, int64_t Imm) {
32776   if (IsLP64) {
32777     if (isInt<8>(Imm))
32778       return X86::SUB64ri8;
32779     return X86::SUB64ri32;
32780   } else {
32781     if (isInt<8>(Imm))
32782       return X86::SUB32ri8;
32783     return X86::SUB32ri;
32784   }
32785 }
32786 
32787 MachineBasicBlock *
32788 X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI,
32789                                            MachineBasicBlock *MBB) const {
32790   MachineFunction *MF = MBB->getParent();
32791   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
32792   const X86FrameLowering &TFI = *Subtarget.getFrameLowering();
32793   const DebugLoc &DL = MI.getDebugLoc();
32794   const BasicBlock *LLVM_BB = MBB->getBasicBlock();
32795 
32796   const unsigned ProbeSize = getStackProbeSize(*MF);
32797 
32798   MachineRegisterInfo &MRI = MF->getRegInfo();
32799   MachineBasicBlock *testMBB = MF->CreateMachineBasicBlock(LLVM_BB);
32800   MachineBasicBlock *tailMBB = MF->CreateMachineBasicBlock(LLVM_BB);
32801   MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB);
32802 
32803   MachineFunction::iterator MBBIter = ++MBB->getIterator();
32804   MF->insert(MBBIter, testMBB);
32805   MF->insert(MBBIter, blockMBB);
32806   MF->insert(MBBIter, tailMBB);
32807 
32808   Register sizeVReg = MI.getOperand(1).getReg();
32809 
32810   Register physSPReg = TFI.Uses64BitFramePtr ? X86::RSP : X86::ESP;
32811 
32812   Register TmpStackPtr = MRI.createVirtualRegister(
32813       TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
32814   Register FinalStackPtr = MRI.createVirtualRegister(
32815       TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
32816 
32817   BuildMI(*MBB, {MI}, DL, TII->get(TargetOpcode::COPY), TmpStackPtr)
32818       .addReg(physSPReg);
32819   {
32820     const unsigned Opc = TFI.Uses64BitFramePtr ? X86::SUB64rr : X86::SUB32rr;
32821     BuildMI(*MBB, {MI}, DL, TII->get(Opc), FinalStackPtr)
32822         .addReg(TmpStackPtr)
32823         .addReg(sizeVReg);
32824   }
32825 
32826   // test rsp size
32827 
32828   BuildMI(testMBB, DL,
32829           TII->get(TFI.Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
32830       .addReg(FinalStackPtr)
32831       .addReg(physSPReg);
32832 
32833   BuildMI(testMBB, DL, TII->get(X86::JCC_1))
32834       .addMBB(tailMBB)
32835       .addImm(X86::COND_GE);
32836   testMBB->addSuccessor(blockMBB);
32837   testMBB->addSuccessor(tailMBB);
32838 
32839   // Touch the block then extend it. This is done on the opposite side of
32840   // static probe, where we allocate then touch, to avoid the need to probe the
32841   // tail of the static alloca. Possible scenarios are:
32842   //
32843   //       + ---- <- ------------ <- ------------- <- ------------ +
32844   //       |                                                       |
32845   // [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] + -> [dyn probe] -> [page alloc] -> [dyn probe] -> [tail alloc] +
32846   //                                                               |                                                               |
32847   //                                                               + <- ----------- <- ------------ <- ----------- <- ------------ +
32848   //
32849   // The property we want to enforce is to never have more than [page alloc] between two probes.
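  // blockMBB below touches the current stack top and only then moves RSP down
  // by ProbeSize, so no more than ProbeSize bytes are ever allocated between
  // two consecutive probes.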
32850 
32851   const unsigned XORMIOpc =
32852       TFI.Uses64BitFramePtr ? X86::XOR64mi8 : X86::XOR32mi8;
32853   addRegOffset(BuildMI(blockMBB, DL, TII->get(XORMIOpc)), physSPReg, false, 0)
32854       .addImm(0);
32855 
32856   BuildMI(blockMBB, DL,
32857           TII->get(getSUBriOpcode(TFI.Uses64BitFramePtr, ProbeSize)), physSPReg)
32858       .addReg(physSPReg)
32859       .addImm(ProbeSize);
32860 
32861 
32862   BuildMI(blockMBB, DL, TII->get(X86::JMP_1)).addMBB(testMBB);
32863   blockMBB->addSuccessor(testMBB);
32864 
32865   // Replace original instruction by the expected stack ptr
32866   BuildMI(tailMBB, DL, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
32867       .addReg(FinalStackPtr);
32868 
32869   tailMBB->splice(tailMBB->end(), MBB,
32870                   std::next(MachineBasicBlock::iterator(MI)), MBB->end());
32871   tailMBB->transferSuccessorsAndUpdatePHIs(MBB);
32872   MBB->addSuccessor(testMBB);
32873 
32874   // Delete the original pseudo instruction.
32875   MI.eraseFromParent();
32876 
32877   // And we're done.
32878   return tailMBB;
32879 }
32880 
32881 MachineBasicBlock *
32882 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
32883                                         MachineBasicBlock *BB) const {
32884   MachineFunction *MF = BB->getParent();
32885   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
32886   const DebugLoc &DL = MI.getDebugLoc();
32887   const BasicBlock *LLVM_BB = BB->getBasicBlock();
32888 
32889   assert(MF->shouldSplitStack());
32890 
32891   const bool Is64Bit = Subtarget.is64Bit();
32892   const bool IsLP64 = Subtarget.isTarget64BitLP64();
32893 
32894   const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
32895   const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
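  // For segmented stacks the current stack limit is kept in the thread control
  // block at a fixed, target-specific offset from %fs/%gs; these offsets match
  // what libgcc's __morestack runtime expects.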
32896 
32897   // BB:
32898   //  ... [Till the alloca]
32899   // If stacklet is not large enough, jump to mallocMBB
32900   //
32901   // bumpMBB:
32902   //  Allocate by subtracting from RSP
32903   //  Jump to continueMBB
32904   //
32905   // mallocMBB:
32906   //  Allocate by call to runtime
32907   //
32908   // continueMBB:
32909   //  ...
32910   //  [rest of original BB]
32911   //
32912 
32913   MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
32914   MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
32915   MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
32916 
32917   MachineRegisterInfo &MRI = MF->getRegInfo();
32918   const TargetRegisterClass *AddrRegClass =
32919       getRegClassFor(getPointerTy(MF->getDataLayout()));
32920 
32921   Register mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
32922            bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
32923            tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
32924            SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
32925            sizeVReg = MI.getOperand(1).getReg(),
32926            physSPReg =
32927                IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
32928 
32929   MachineFunction::iterator MBBIter = ++BB->getIterator();
32930 
32931   MF->insert(MBBIter, bumpMBB);
32932   MF->insert(MBBIter, mallocMBB);
32933   MF->insert(MBBIter, continueMBB);
32934 
32935   continueMBB->splice(continueMBB->begin(), BB,
32936                       std::next(MachineBasicBlock::iterator(MI)), BB->end());
32937   continueMBB->transferSuccessorsAndUpdatePHIs(BB);
32938 
32939   // Add code to the main basic block to check if the stack limit has been hit,
32940   // and if so, jump to mallocMBB otherwise to bumpMBB.
32941   BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
32942   BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
32943     .addReg(tmpSPVReg).addReg(sizeVReg);
32944   BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
32945     .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
32946     .addReg(SPLimitVReg);
32947   BuildMI(BB, DL, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G);
32948 
32949   // bumpMBB simply decreases the stack pointer, since we know the current
32950   // stacklet has enough space.
32951   BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
32952     .addReg(SPLimitVReg);
32953   BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
32954     .addReg(SPLimitVReg);
32955   BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
32956 
32957   // Calls into a routine in libgcc to allocate more space from the heap.
32958   const uint32_t *RegMask =
32959       Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
32960   if (IsLP64) {
32961     BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
32962       .addReg(sizeVReg);
32963     BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
32964       .addExternalSymbol("__morestack_allocate_stack_space")
32965       .addRegMask(RegMask)
32966       .addReg(X86::RDI, RegState::Implicit)
32967       .addReg(X86::RAX, RegState::ImplicitDefine);
32968   } else if (Is64Bit) {
32969     BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
32970       .addReg(sizeVReg);
32971     BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
32972       .addExternalSymbol("__morestack_allocate_stack_space")
32973       .addRegMask(RegMask)
32974       .addReg(X86::EDI, RegState::Implicit)
32975       .addReg(X86::EAX, RegState::ImplicitDefine);
32976   } else {
32977     BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
32978       .addImm(12);
32979     BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
32980     BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
32981       .addExternalSymbol("__morestack_allocate_stack_space")
32982       .addRegMask(RegMask)
32983       .addReg(X86::EAX, RegState::ImplicitDefine);
32984   }
32985 
32986   if (!Is64Bit)
32987     BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
32988       .addImm(16);
32989 
32990   BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
32991     .addReg(IsLP64 ? X86::RAX : X86::EAX);
32992   BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
32993 
32994   // Set up the CFG correctly.
32995   BB->addSuccessor(bumpMBB);
32996   BB->addSuccessor(mallocMBB);
32997   mallocMBB->addSuccessor(continueMBB);
32998   bumpMBB->addSuccessor(continueMBB);
32999 
33000   // Take care of the PHI nodes.
33001   BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
33002           MI.getOperand(0).getReg())
33003       .addReg(mallocPtrVReg)
33004       .addMBB(mallocMBB)
33005       .addReg(bumpSPPtrVReg)
33006       .addMBB(bumpMBB);
33007 
33008   // Delete the original pseudo instruction.
33009   MI.eraseFromParent();
33010 
33011   // And we're done.
33012   return continueMBB;
33013 }
33014 
33015 MachineBasicBlock *
33016 X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
33017                                        MachineBasicBlock *BB) const {
33018   MachineFunction *MF = BB->getParent();
33019   const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
33020   MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
33021   const DebugLoc &DL = MI.getDebugLoc();
33022 
33023   assert(!isAsynchronousEHPersonality(
33024              classifyEHPersonality(MF->getFunction().getPersonalityFn())) &&
33025          "SEH does not use catchret!");
33026 
33027   // Only 32-bit EH needs to worry about manually restoring stack pointers.
33028   if (!Subtarget.is32Bit())
33029     return BB;
33030 
33031   // C++ EH creates a new target block to hold the restore code, and wires up
33032   // the new block to the return destination with a normal JMP_4.
33033   MachineBasicBlock *RestoreMBB =
33034       MF->CreateMachineBasicBlock(BB->getBasicBlock());
33035   assert(BB->succ_size() == 1);
33036   MF->insert(std::next(BB->getIterator()), RestoreMBB);
33037   RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
33038   BB->addSuccessor(RestoreMBB);
33039   MI.getOperand(0).setMBB(RestoreMBB);
33040 
33041   // Marking this as an EH pad but not a funclet entry block causes PEI to
33042   // restore stack pointers in the block.
33043   RestoreMBB->setIsEHPad(true);
33044 
33045   auto RestoreMBBI = RestoreMBB->begin();
33046   BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
33047   return BB;
33048 }
33049 
33050 MachineBasicBlock *
33051 X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
33052                                       MachineBasicBlock *BB) const {
33053   // So, here we replace TLSADDR with the sequence:
33054   // adjust_stackdown -> TLSADDR -> adjust_stackup.
33055   // We need this because TLSADDR is lowered into a call
33056   // inside MC, so without the two markers shrink-wrapping
33057   // may move the prologue/epilogue past it.
33058   const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
33059   const DebugLoc &DL = MI.getDebugLoc();
33060   MachineFunction &MF = *BB->getParent();
33061 
33062   // Emit CALLSEQ_START right before the instruction.
33063   unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
33064   MachineInstrBuilder CallseqStart =
33065     BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
33066   BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
33067 
33068   // Emit CALLSEQ_END right after the instruction.
33069   // We don't call erase from parent because we want to keep the
33070   // original instruction around.
33071   unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
33072   MachineInstrBuilder CallseqEnd =
33073     BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
33074   BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
33075 
33076   return BB;
33077 }
33078 
33079 MachineBasicBlock *
33080 X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
33081                                       MachineBasicBlock *BB) const {
33082   // This is pretty easy.  We're taking the value that we received from
33083   // our load from the relocation, sticking it in either RDI (x86-64)
33084   // or EAX and doing an indirect call.  The return value will then
33085   // be in the normal return register.
33086   MachineFunction *F = BB->getParent();
33087   const X86InstrInfo *TII = Subtarget.getInstrInfo();
33088   const DebugLoc &DL = MI.getDebugLoc();
33089 
33090   assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
33091   assert(MI.getOperand(3).isGlobal() && "This should be a global");
33092 
33093   // Get a register mask for the lowered call.
33094   // FIXME: The 32-bit calls have non-standard calling conventions. Use a
33095   // proper register mask.
33096   const uint32_t *RegMask =
33097       Subtarget.is64Bit() ?
33098       Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
33099       Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
33100   if (Subtarget.is64Bit()) {
33101     MachineInstrBuilder MIB =
33102         BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
33103             .addReg(X86::RIP)
33104             .addImm(0)
33105             .addReg(0)
33106             .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
33107                               MI.getOperand(3).getTargetFlags())
33108             .addReg(0);
33109     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
33110     addDirectMem(MIB, X86::RDI);
33111     MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
33112   } else if (!isPositionIndependent()) {
33113     MachineInstrBuilder MIB =
33114         BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
33115             .addReg(0)
33116             .addImm(0)
33117             .addReg(0)
33118             .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
33119                               MI.getOperand(3).getTargetFlags())
33120             .addReg(0);
33121     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
33122     addDirectMem(MIB, X86::EAX);
33123     MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
33124   } else {
33125     MachineInstrBuilder MIB =
33126         BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
33127             .addReg(TII->getGlobalBaseReg(F))
33128             .addImm(0)
33129             .addReg(0)
33130             .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
33131                               MI.getOperand(3).getTargetFlags())
33132             .addReg(0);
33133     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
33134     addDirectMem(MIB, X86::EAX);
33135     MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
33136   }
33137 
33138   MI.eraseFromParent(); // The pseudo instruction is gone now.
33139   return BB;
33140 }
33141 
33142 static unsigned getOpcodeForIndirectThunk(unsigned RPOpc) {
33143   switch (RPOpc) {
33144   case X86::INDIRECT_THUNK_CALL32:
33145     return X86::CALLpcrel32;
33146   case X86::INDIRECT_THUNK_CALL64:
33147     return X86::CALL64pcrel32;
33148   case X86::INDIRECT_THUNK_TCRETURN32:
33149     return X86::TCRETURNdi;
33150   case X86::INDIRECT_THUNK_TCRETURN64:
33151     return X86::TCRETURNdi64;
33152   }
33153   llvm_unreachable("not indirect thunk opcode");
33154 }
33155 
33156 static const char *getIndirectThunkSymbol(const X86Subtarget &Subtarget,
33157                                           unsigned Reg) {
33158   if (Subtarget.useRetpolineExternalThunk()) {
33159     // When using an external thunk for retpolines, we pick names that match the
33160     // names GCC happens to use as well. This helps simplify the implementation
33161     // of the thunks for kernels where they have no easy ability to create
33162     // aliases and are doing non-trivial configuration of the thunk's body. For
33163     // example, the Linux kernel will do boot-time hot patching of the thunk
33164     // bodies and cannot easily export aliases of these to loaded modules.
33165     //
33166     // Note that at any point in the future, we may need to change the semantics
33167     // of how we implement retpolines and at that time will likely change the
33168     // name of the called thunk. Essentially, there is no hard guarantee that
33169     // LLVM will generate calls to specific thunks, we merely make a best-effort
33170     // attempt to help out kernels and other systems where duplicating the
33171     // thunks is costly.
33172     switch (Reg) {
33173     case X86::EAX:
33174       assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
33175       return "__x86_indirect_thunk_eax";
33176     case X86::ECX:
33177       assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
33178       return "__x86_indirect_thunk_ecx";
33179     case X86::EDX:
33180       assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
33181       return "__x86_indirect_thunk_edx";
33182     case X86::EDI:
33183       assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
33184       return "__x86_indirect_thunk_edi";
33185     case X86::R11:
33186       assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
33187       return "__x86_indirect_thunk_r11";
33188     }
33189     llvm_unreachable("unexpected reg for external indirect thunk");
33190   }
33191 
33192   if (Subtarget.useRetpolineIndirectCalls() ||
33193       Subtarget.useRetpolineIndirectBranches()) {
33194     // When targeting an internal COMDAT thunk use an LLVM-specific name.
33195     switch (Reg) {
33196     case X86::EAX:
33197       assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
33198       return "__llvm_retpoline_eax";
33199     case X86::ECX:
33200       assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
33201       return "__llvm_retpoline_ecx";
33202     case X86::EDX:
33203       assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
33204       return "__llvm_retpoline_edx";
33205     case X86::EDI:
33206       assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
33207       return "__llvm_retpoline_edi";
33208     case X86::R11:
33209       assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
33210       return "__llvm_retpoline_r11";
33211     }
33212     llvm_unreachable("unexpected reg for retpoline");
33213   }
33214 
33215   if (Subtarget.useLVIControlFlowIntegrity()) {
33216     assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
33217     return "__llvm_lvi_thunk_r11";
33218   }
33219   llvm_unreachable("getIndirectThunkSymbol() invoked without thunk feature");
33220 }
33221 
33222 MachineBasicBlock *
33223 X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI,
33224                                             MachineBasicBlock *BB) const {
33225   // Copy the virtual register into the R11 physical register and
33226   // call the retpoline thunk.
33227   const DebugLoc &DL = MI.getDebugLoc();
33228   const X86InstrInfo *TII = Subtarget.getInstrInfo();
33229   Register CalleeVReg = MI.getOperand(0).getReg();
33230   unsigned Opc = getOpcodeForIndirectThunk(MI.getOpcode());
33231 
33232   // Find an available scratch register to hold the callee. On 64-bit, we can
33233   // just use R11, but we scan for uses anyway to ensure we don't generate
33234   // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
33235   // already a register use operand to the call to hold the callee. If none
33236   // are available, use EDI instead. EDI is chosen because EBX is the PIC base
33237   // register and ESI is the base pointer to realigned stack frames with VLAs.
33238   SmallVector<unsigned, 3> AvailableRegs;
33239   if (Subtarget.is64Bit())
33240     AvailableRegs.push_back(X86::R11);
33241   else
33242     AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
33243 
33244   // Zero out any registers that are already used.
33245   for (const auto &MO : MI.operands()) {
33246     if (MO.isReg() && MO.isUse())
33247       for (unsigned &Reg : AvailableRegs)
33248         if (Reg == MO.getReg())
33249           Reg = 0;
33250   }
33251 
33252   // Choose the first remaining non-zero available register.
33253   unsigned AvailableReg = 0;
33254   for (unsigned MaybeReg : AvailableRegs) {
33255     if (MaybeReg) {
33256       AvailableReg = MaybeReg;
33257       break;
33258     }
33259   }
33260   if (!AvailableReg)
33261     report_fatal_error("calling convention incompatible with retpoline, no "
33262                        "available registers");
33263 
33264   const char *Symbol = getIndirectThunkSymbol(Subtarget, AvailableReg);
33265 
33266   BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg)
33267       .addReg(CalleeVReg);
33268   MI.getOperand(0).ChangeToES(Symbol);
33269   MI.setDesc(TII->get(Opc));
33270   MachineInstrBuilder(*BB->getParent(), &MI)
33271       .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
33272   return BB;
33273 }
33274 
33275 /// SetJmp implies future control flow change upon calling the corresponding
33276 /// LongJmp.
33277 /// Instead of using the 'return' instruction, the long jump fixes the stack and
33278 /// performs an indirect branch. To do so it uses the registers that were stored
33279 /// in the jump buffer (when calling SetJmp).
33280 /// In case the shadow stack is enabled we need to fix it as well, because some
33281 /// return addresses will be skipped.
33282 /// The function will save the SSP for future fixing in the function
33283 /// emitLongJmpShadowStackFix.
33284 /// \sa emitLongJmpShadowStackFix
33285 /// \param [in] MI The temporary Machine Instruction for the builtin.
33286 /// \param [in] MBB The Machine Basic Block that will be modified.
33287 void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
33288                                                  MachineBasicBlock *MBB) const {
33289   const DebugLoc &DL = MI.getDebugLoc();
33290   MachineFunction *MF = MBB->getParent();
33291   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
33292   MachineRegisterInfo &MRI = MF->getRegInfo();
33293   MachineInstrBuilder MIB;
33294 
33295   // Memory Reference.
33296   SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
33297                                            MI.memoperands_end());
33298 
33299   // Initialize a register with zero.
33300   MVT PVT = getPointerTy(MF->getDataLayout());
33301   const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
33302   Register ZReg = MRI.createVirtualRegister(PtrRC);
33303   unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
33304   BuildMI(*MBB, MI, DL, TII->get(XorRROpc))
33305       .addDef(ZReg)
33306       .addReg(ZReg, RegState::Undef)
33307       .addReg(ZReg, RegState::Undef);
33308 
33309   // Read the current SSP Register value to the zeroed register.
33310   Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
33311   unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
33312   BuildMI(*MBB, MI, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
33313 
33314   // Write the SSP register value to slot 3 of the input memory buffer.
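  // Jump buffer layout assumed by this lowering (pointer-sized slots):
  //   slot 0: frame pointer, slot 1: resume IP, slot 2: stack pointer,
  //   slot 3: shadow stack pointer (SSP).
  // emitEHSjLjLongJmp and emitLongJmpShadowStackFix reload from the same
  // offsets.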
33315   unsigned PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
33316   MIB = BuildMI(*MBB, MI, DL, TII->get(PtrStoreOpc));
33317   const int64_t SSPOffset = 3 * PVT.getStoreSize();
33318   const unsigned MemOpndSlot = 1;
33319   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
33320     if (i == X86::AddrDisp)
33321       MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset);
33322     else
33323       MIB.add(MI.getOperand(MemOpndSlot + i));
33324   }
33325   MIB.addReg(SSPCopyReg);
33326   MIB.setMemRefs(MMOs);
33327 }
33328 
33329 MachineBasicBlock *
33330 X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
33331                                     MachineBasicBlock *MBB) const {
33332   const DebugLoc &DL = MI.getDebugLoc();
33333   MachineFunction *MF = MBB->getParent();
33334   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
33335   const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
33336   MachineRegisterInfo &MRI = MF->getRegInfo();
33337 
33338   const BasicBlock *BB = MBB->getBasicBlock();
33339   MachineFunction::iterator I = ++MBB->getIterator();
33340 
33341   // Memory Reference
33342   SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
33343                                            MI.memoperands_end());
33344 
33345   unsigned DstReg;
33346   unsigned MemOpndSlot = 0;
33347 
33348   unsigned CurOp = 0;
33349 
33350   DstReg = MI.getOperand(CurOp++).getReg();
33351   const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
33352   assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
33353   (void)TRI;
33354   Register mainDstReg = MRI.createVirtualRegister(RC);
33355   Register restoreDstReg = MRI.createVirtualRegister(RC);
33356 
33357   MemOpndSlot = CurOp;
33358 
33359   MVT PVT = getPointerTy(MF->getDataLayout());
33360   assert((PVT == MVT::i64 || PVT == MVT::i32) &&
33361          "Invalid Pointer Size!");
33362 
33363   // For v = setjmp(buf), we generate
33364   //
33365   // thisMBB:
33366   //  buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
33367   //  SjLjSetup restoreMBB
33368   //
33369   // mainMBB:
33370   //  v_main = 0
33371   //
33372   // sinkMBB:
33373   //  v = phi(main, restore)
33374   //
33375   // restoreMBB:
33376   //  if base pointer being used, load it from frame
33377   //  v_restore = 1
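  //
  // The PHI in sinkMBB produces the setjmp return value: 0 on the normal
  // path through mainMBB, and 1 when control re-enters via restoreMBB after
  // a longjmp.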
33378 
33379   MachineBasicBlock *thisMBB = MBB;
33380   MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
33381   MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
33382   MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
33383   MF->insert(I, mainMBB);
33384   MF->insert(I, sinkMBB);
33385   MF->push_back(restoreMBB);
33386   restoreMBB->setHasAddressTaken();
33387 
33388   MachineInstrBuilder MIB;
33389 
33390   // Transfer the remainder of BB and its successor edges to sinkMBB.
33391   sinkMBB->splice(sinkMBB->begin(), MBB,
33392                   std::next(MachineBasicBlock::iterator(MI)), MBB->end());
33393   sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
33394 
33395   // thisMBB:
33396   unsigned PtrStoreOpc = 0;
33397   unsigned LabelReg = 0;
33398   const int64_t LabelOffset = 1 * PVT.getStoreSize();
33399   bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
33400                      !isPositionIndependent();
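  // With the small code model and no PIC, the address of restoreMBB can be
  // stored directly as an immediate; otherwise it is materialized into
  // LabelReg with an LEA below.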
33401 
33402   // Prepare IP either in reg or imm.
33403   if (!UseImmLabel) {
33404     PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
33405     const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
33406     LabelReg = MRI.createVirtualRegister(PtrRC);
33407     if (Subtarget.is64Bit()) {
33408       MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
33409               .addReg(X86::RIP)
33410               .addImm(0)
33411               .addReg(0)
33412               .addMBB(restoreMBB)
33413               .addReg(0);
33414     } else {
33415       const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
33416       MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
33417               .addReg(XII->getGlobalBaseReg(MF))
33418               .addImm(0)
33419               .addReg(0)
33420               .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
33421               .addReg(0);
33422     }
33423   } else
33424     PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
33425   // Store IP
33426   MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
33427   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
33428     if (i == X86::AddrDisp)
33429       MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
33430     else
33431       MIB.add(MI.getOperand(MemOpndSlot + i));
33432   }
33433   if (!UseImmLabel)
33434     MIB.addReg(LabelReg);
33435   else
33436     MIB.addMBB(restoreMBB);
33437   MIB.setMemRefs(MMOs);
33438 
33439   if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
33440     emitSetJmpShadowStackFix(MI, thisMBB);
33441   }
33442 
33443   // Setup
33444   MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
33445           .addMBB(restoreMBB);
33446 
33447   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
33448   MIB.addRegMask(RegInfo->getNoPreservedMask());
33449   thisMBB->addSuccessor(mainMBB);
33450   thisMBB->addSuccessor(restoreMBB);
33451 
33452   // mainMBB:
33453   //  EAX = 0
33454   BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
33455   mainMBB->addSuccessor(sinkMBB);
33456 
33457   // sinkMBB:
33458   BuildMI(*sinkMBB, sinkMBB->begin(), DL,
33459           TII->get(X86::PHI), DstReg)
33460     .addReg(mainDstReg).addMBB(mainMBB)
33461     .addReg(restoreDstReg).addMBB(restoreMBB);
33462 
33463   // restoreMBB:
33464   if (RegInfo->hasBasePointer(*MF)) {
33465     const bool Uses64BitFramePtr =
33466         Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
33467     X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
33468     X86FI->setRestoreBasePointer(MF);
33469     Register FramePtr = RegInfo->getFrameRegister(*MF);
33470     Register BasePtr = RegInfo->getBaseRegister();
33471     unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
33472     addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
33473                  FramePtr, true, X86FI->getRestoreBasePointerOffset())
33474       .setMIFlag(MachineInstr::FrameSetup);
33475   }
33476   BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
33477   BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
33478   restoreMBB->addSuccessor(sinkMBB);
33479 
33480   MI.eraseFromParent();
33481   return sinkMBB;
33482 }
33483 
33484 /// Fix the shadow stack using the previously saved SSP pointer.
33485 /// \sa emitSetJmpShadowStackFix
33486 /// \param [in] MI The temporary Machine Instruction for the builtin.
33487 /// \param [in] MBB The Machine Basic Block that will be modified.
33488 /// \return The sink MBB that will perform the future indirect branch.
33489 MachineBasicBlock *
33490 X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
33491                                              MachineBasicBlock *MBB) const {
33492   const DebugLoc &DL = MI.getDebugLoc();
33493   MachineFunction *MF = MBB->getParent();
33494   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
33495   MachineRegisterInfo &MRI = MF->getRegInfo();
33496 
33497   // Memory Reference
33498   SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
33499                                            MI.memoperands_end());
33500 
33501   MVT PVT = getPointerTy(MF->getDataLayout());
33502   const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
33503 
33504   // checkSspMBB:
33505   //         xor vreg1, vreg1
33506   //         rdssp vreg1
33507   //         test vreg1, vreg1
33508   //         je sinkMBB   # Jump if Shadow Stack is not supported
33509   // fallMBB:
33510   //         mov buf+24/12(%rip), vreg2
33511   //         sub vreg1, vreg2
33512   //         jbe sinkMBB  # No need to fix the Shadow Stack
33513   // fixShadowMBB:
33514   //         shr 3/2, vreg2
33515   //         incssp vreg2  # fix the SSP according to the lower 8 bits
33516   //         shr 8, vreg2
33517   //         je sinkMBB
33518   // fixShadowLoopPrepareMBB:
33519   //         shl vreg2
33520   //         mov 128, vreg3
33521   // fixShadowLoopMBB:
33522   //         incssp vreg3
33523   //         dec vreg2
33524   //         jne fixShadowLoopMBB # Iterate until you finish fixing
33525   //                              # the Shadow Stack
33526   // sinkMBB:
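  //
  // Worked example (illustrative only): if the saved SSP lies 0x305 entries
  // above the current SSP on a 64-bit target, vreg2 holds 0x305 after the
  // first shr by 3. incssp consumes the low 8 bits (5 entries), the shr by 8
  // leaves 3, the shl doubles it to 6, and the loop then executes incssp 128
  // six times to cover the remaining 0x300 entries.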
33527 
33528   MachineFunction::iterator I = ++MBB->getIterator();
33529   const BasicBlock *BB = MBB->getBasicBlock();
33530 
33531   MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);
33532   MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
33533   MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);
33534   MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);
33535   MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);
33536   MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
33537   MF->insert(I, checkSspMBB);
33538   MF->insert(I, fallMBB);
33539   MF->insert(I, fixShadowMBB);
33540   MF->insert(I, fixShadowLoopPrepareMBB);
33541   MF->insert(I, fixShadowLoopMBB);
33542   MF->insert(I, sinkMBB);
33543 
33544   // Transfer the remainder of BB and its successor edges to sinkMBB.
33545   sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
33546                   MBB->end());
33547   sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
33548 
33549   MBB->addSuccessor(checkSspMBB);
33550 
33551   // Initialize a register with zero.
33552   Register ZReg = MRI.createVirtualRegister(&X86::GR32RegClass);
33553   BuildMI(checkSspMBB, DL, TII->get(X86::MOV32r0), ZReg);
33554 
33555   if (PVT == MVT::i64) {
33556     Register TmpZReg = MRI.createVirtualRegister(PtrRC);
33557     BuildMI(checkSspMBB, DL, TII->get(X86::SUBREG_TO_REG), TmpZReg)
33558       .addImm(0)
33559       .addReg(ZReg)
33560       .addImm(X86::sub_32bit);
33561     ZReg = TmpZReg;
33562   }
33563 
33564   // Read the current SSP Register value to the zeroed register.
33565   Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
33566   unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
33567   BuildMI(checkSspMBB, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
33568 
33569   // Check whether the value read from the SSP register is zero and jump
33570   // directly to the sink (shadow stack not supported).
33571   unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr;
33572   BuildMI(checkSspMBB, DL, TII->get(TestRROpc))
33573       .addReg(SSPCopyReg)
33574       .addReg(SSPCopyReg);
33575   BuildMI(checkSspMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
33576   checkSspMBB->addSuccessor(sinkMBB);
33577   checkSspMBB->addSuccessor(fallMBB);
33578 
33579   // Reload the previously saved SSP register value.
33580   Register PrevSSPReg = MRI.createVirtualRegister(PtrRC);
33581   unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
33582   const int64_t SSPOffset = 3 * PVT.getStoreSize();
33583   MachineInstrBuilder MIB =
33584       BuildMI(fallMBB, DL, TII->get(PtrLoadOpc), PrevSSPReg);
33585   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
33586     const MachineOperand &MO = MI.getOperand(i);
33587     if (i == X86::AddrDisp)
33588       MIB.addDisp(MO, SSPOffset);
33589     else if (MO.isReg()) // Don't add the whole operand, we don't want to
33590                          // preserve kill flags.
33591       MIB.addReg(MO.getReg());
33592     else
33593       MIB.add(MO);
33594   }
33595   MIB.setMemRefs(MMOs);
33596 
33597   // Subtract the current SSP from the previous SSP.
33598   Register SspSubReg = MRI.createVirtualRegister(PtrRC);
33599   unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr;
33600   BuildMI(fallMBB, DL, TII->get(SubRROpc), SspSubReg)
33601       .addReg(PrevSSPReg)
33602       .addReg(SSPCopyReg);
33603 
33604   // Jump to sink in case PrevSSPReg <= SSPCopyReg.
33605   BuildMI(fallMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_BE);
33606   fallMBB->addSuccessor(sinkMBB);
33607   fallMBB->addSuccessor(fixShadowMBB);
33608 
33609   // Shift right by 2/3 for 32/64 because incssp multiplies the argument by 4/8.
33610   unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri;
33611   unsigned Offset = (PVT == MVT::i64) ? 3 : 2;
33612   Register SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
33613   BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspFirstShrReg)
33614       .addReg(SspSubReg)
33615       .addImm(Offset);
33616 
33617   // Increment the SSP using only the lower 8 bits of the delta.
33618   unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD;
33619   BuildMI(fixShadowMBB, DL, TII->get(IncsspOpc)).addReg(SspFirstShrReg);
33620 
33621   // Reset the lower 8 bits.
33622   Register SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
33623   BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspSecondShrReg)
33624       .addReg(SspFirstShrReg)
33625       .addImm(8);
33626 
33627   // Jump if the result of the shift is zero.
33628   BuildMI(fixShadowMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
33629   fixShadowMBB->addSuccessor(sinkMBB);
33630   fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);
33631 
33632   // Do a single shift left.
33633   unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64r1 : X86::SHL32r1;
33634   Register SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
33635   BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(ShlR1Opc), SspAfterShlReg)
33636       .addReg(SspSecondShrReg);
33637 
33638   // Save the value 128 to a register (will be used next with incssp).
33639   Register Value128InReg = MRI.createVirtualRegister(PtrRC);
33640   unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri;
33641   BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(MovRIOpc), Value128InReg)
33642       .addImm(128);
33643   fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB);
33644 
33645   // Since incssp only looks at the lower 8 bits, we might need to do several
33646   // iterations of incssp until we finish fixing the shadow stack.
33647   Register DecReg = MRI.createVirtualRegister(PtrRC);
33648   Register CounterReg = MRI.createVirtualRegister(PtrRC);
33649   BuildMI(fixShadowLoopMBB, DL, TII->get(X86::PHI), CounterReg)
33650       .addReg(SspAfterShlReg)
33651       .addMBB(fixShadowLoopPrepareMBB)
33652       .addReg(DecReg)
33653       .addMBB(fixShadowLoopMBB);
33654 
33655   // Every iteration we increase the SSP by 128.
33656   BuildMI(fixShadowLoopMBB, DL, TII->get(IncsspOpc)).addReg(Value128InReg);
33657 
33658   // Every iteration we decrement the counter by 1.
33659   unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r;
33660   BuildMI(fixShadowLoopMBB, DL, TII->get(DecROpc), DecReg).addReg(CounterReg);
33661 
33662   // Jump if the counter is not zero yet.
33663   BuildMI(fixShadowLoopMBB, DL, TII->get(X86::JCC_1)).addMBB(fixShadowLoopMBB).addImm(X86::COND_NE);
33664   fixShadowLoopMBB->addSuccessor(sinkMBB);
33665   fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);
33666 
33667   return sinkMBB;
33668 }
33669 
33670 MachineBasicBlock *
33671 X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
33672                                      MachineBasicBlock *MBB) const {
33673   const DebugLoc &DL = MI.getDebugLoc();
33674   MachineFunction *MF = MBB->getParent();
33675   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
33676   MachineRegisterInfo &MRI = MF->getRegInfo();
33677 
33678   // Memory Reference
33679   SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
33680                                            MI.memoperands_end());
33681 
33682   MVT PVT = getPointerTy(MF->getDataLayout());
33683   assert((PVT == MVT::i64 || PVT == MVT::i32) &&
33684          "Invalid Pointer Size!");
33685 
33686   const TargetRegisterClass *RC =
33687     (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
33688   Register Tmp = MRI.createVirtualRegister(RC);
33689   // Since FP is only updated here but NOT referenced, it's treated as a GPR.
33690   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
33691   Register FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
33692   Register SP = RegInfo->getStackRegister();
33693 
33694   MachineInstrBuilder MIB;
33695 
33696   const int64_t LabelOffset = 1 * PVT.getStoreSize();
33697   const int64_t SPOffset = 2 * PVT.getStoreSize();
33698 
33699   unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
33700   unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
33701 
33702   MachineBasicBlock *thisMBB = MBB;
33703 
33704   // When CET and the shadow stack are enabled, we need to fix the Shadow Stack.
33705   if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
33706     thisMBB = emitLongJmpShadowStackFix(MI, thisMBB);
33707   }
33708 
33709   // Reload FP
33710   MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), FP);
33711   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
33712     const MachineOperand &MO = MI.getOperand(i);
33713     if (MO.isReg()) // Don't add the whole operand, we don't want to
33714                     // preserve kill flags.
33715       MIB.addReg(MO.getReg());
33716     else
33717       MIB.add(MO);
33718   }
33719   MIB.setMemRefs(MMOs);
33720 
33721   // Reload IP
33722   MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
33723   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
33724     const MachineOperand &MO = MI.getOperand(i);
33725     if (i == X86::AddrDisp)
33726       MIB.addDisp(MO, LabelOffset);
33727     else if (MO.isReg()) // Don't add the whole operand, we don't want to
33728                          // preserve kill flags.
33729       MIB.addReg(MO.getReg());
33730     else
33731       MIB.add(MO);
33732   }
33733   MIB.setMemRefs(MMOs);
33734 
33735   // Reload SP
33736   MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), SP);
33737   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
33738     if (i == X86::AddrDisp)
33739       MIB.addDisp(MI.getOperand(i), SPOffset);
33740     else
33741       MIB.add(MI.getOperand(i)); // We can preserve the kill flags here, it's
33742                                  // the last instruction of the expansion.
33743   }
33744   MIB.setMemRefs(MMOs);
33745 
33746   // Jump
33747   BuildMI(*thisMBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
33748 
33749   MI.eraseFromParent();
33750   return thisMBB;
33751 }
33752 
33753 void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
33754                                                MachineBasicBlock *MBB,
33755                                                MachineBasicBlock *DispatchBB,
33756                                                int FI) const {
33757   const DebugLoc &DL = MI.getDebugLoc();
33758   MachineFunction *MF = MBB->getParent();
33759   MachineRegisterInfo *MRI = &MF->getRegInfo();
33760   const X86InstrInfo *TII = Subtarget.getInstrInfo();
33761 
33762   MVT PVT = getPointerTy(MF->getDataLayout());
33763   assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
33764 
33765   unsigned Op = 0;
33766   unsigned VR = 0;
33767 
33768   bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
33769                      !isPositionIndependent();
33770 
33771   if (UseImmLabel) {
33772     Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
33773   } else {
33774     const TargetRegisterClass *TRC =
33775         (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
33776     VR = MRI->createVirtualRegister(TRC);
33777     Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
33778 
33779     if (Subtarget.is64Bit())
33780       BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
33781           .addReg(X86::RIP)
33782           .addImm(1)
33783           .addReg(0)
33784           .addMBB(DispatchBB)
33785           .addReg(0);
33786     else
33787       BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
33788           .addReg(0) /* TII->getGlobalBaseReg(MF) */
33789           .addImm(1)
33790           .addReg(0)
33791           .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
33792           .addReg(0);
33793   }
33794 
33795   MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
33796   addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
33797   if (UseImmLabel)
33798     MIB.addMBB(DispatchBB);
33799   else
33800     MIB.addReg(VR);
33801 }
33802 
33803 MachineBasicBlock *
33804 X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
33805                                          MachineBasicBlock *BB) const {
33806   const DebugLoc &DL = MI.getDebugLoc();
33807   MachineFunction *MF = BB->getParent();
33808   MachineRegisterInfo *MRI = &MF->getRegInfo();
33809   const X86InstrInfo *TII = Subtarget.getInstrInfo();
33810   int FI = MF->getFrameInfo().getFunctionContextIndex();
33811 
33812   // Get a mapping of the call site numbers to all of the landing pads they're
33813   // associated with.
33814   DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
33815   unsigned MaxCSNum = 0;
33816   for (auto &MBB : *MF) {
33817     if (!MBB.isEHPad())
33818       continue;
33819 
33820     MCSymbol *Sym = nullptr;
33821     for (const auto &MI : MBB) {
33822       if (MI.isDebugInstr())
33823         continue;
33824 
33825       assert(MI.isEHLabel() && "expected EH_LABEL");
33826       Sym = MI.getOperand(0).getMCSymbol();
33827       break;
33828     }
33829 
33830     if (!MF->hasCallSiteLandingPad(Sym))
33831       continue;
33832 
33833     for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
33834       CallSiteNumToLPad[CSI].push_back(&MBB);
33835       MaxCSNum = std::max(MaxCSNum, CSI);
33836     }
33837   }
33838 
33839   // Get an ordered list of the machine basic blocks for the jump table.
33840   std::vector<MachineBasicBlock *> LPadList;
33841   SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
33842   LPadList.reserve(CallSiteNumToLPad.size());
33843 
33844   for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
33845     for (auto &LP : CallSiteNumToLPad[CSI]) {
33846       LPadList.push_back(LP);
33847       InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
33848     }
33849   }
33850 
33851   assert(!LPadList.empty() &&
33852          "No landing pad destinations for the dispatch jump table!");
33853 
33854   // Create the MBBs for the dispatch code.
33855 
33856   // Shove the dispatch's address into the return slot in the function context.
33857   MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
33858   DispatchBB->setIsEHPad(true);
33859 
33860   MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
33861   BuildMI(TrapBB, DL, TII->get(X86::TRAP));
33862   DispatchBB->addSuccessor(TrapBB);
33863 
33864   MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
33865   DispatchBB->addSuccessor(DispContBB);
33866 
33867   // Insert MBBs.
33868   MF->push_back(DispatchBB);
33869   MF->push_back(DispContBB);
33870   MF->push_back(TrapBB);
33871 
33872   // Insert code into the entry block that creates and registers the function
33873   // context.
33874   SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
33875 
33876   // Create the jump table and associated information
33877   unsigned JTE = getJumpTableEncoding();
33878   MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
33879   unsigned MJTI = JTI->createJumpTableIndex(LPadList);
33880 
33881   const X86RegisterInfo &RI = TII->getRegisterInfo();
33882   // Add a register mask with no preserved registers.  This results in all
33883   // registers being marked as clobbered.
33884   if (RI.hasBasePointer(*MF)) {
33885     const bool FPIs64Bit =
33886         Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
33887     X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
33888     MFI->setRestoreBasePointer(MF);
33889 
33890     Register FP = RI.getFrameRegister(*MF);
33891     Register BP = RI.getBaseRegister();
33892     unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
33893     addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
33894                  MFI->getRestoreBasePointerOffset())
33895         .addRegMask(RI.getNoPreservedMask());
33896   } else {
33897     BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
33898         .addRegMask(RI.getNoPreservedMask());
33899   }
33900 
33901   // IReg is used as an index in a memory operand and therefore can't be SP
33902   Register IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
33903   addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
33904                     Subtarget.is64Bit() ? 8 : 4);
33905   BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
33906       .addReg(IReg)
33907       .addImm(LPadList.size());
33908   BuildMI(DispatchBB, DL, TII->get(X86::JCC_1)).addMBB(TrapBB).addImm(X86::COND_AE);
33909 
33910   if (Subtarget.is64Bit()) {
33911     Register BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
33912     Register IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
33913 
33914     // leaq .LJTI0_0(%rip), BReg
33915     BuildMI(DispContBB, DL, TII->get(X86::LEA64r), BReg)
33916         .addReg(X86::RIP)
33917         .addImm(1)
33918         .addReg(0)
33919         .addJumpTableIndex(MJTI)
33920         .addReg(0);
33921     // movzx IReg64, IReg
33922     BuildMI(DispContBB, DL, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
33923         .addImm(0)
33924         .addReg(IReg)
33925         .addImm(X86::sub_32bit);
33926 
33927     switch (JTE) {
33928     case MachineJumpTableInfo::EK_BlockAddress:
33929       // jmpq *(BReg,IReg64,8)
33930       BuildMI(DispContBB, DL, TII->get(X86::JMP64m))
33931           .addReg(BReg)
33932           .addImm(8)
33933           .addReg(IReg64)
33934           .addImm(0)
33935           .addReg(0);
33936       break;
33937     case MachineJumpTableInfo::EK_LabelDifference32: {
33938       Register OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
33939       Register OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
33940       Register TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
33941 
33942       // movl (BReg,IReg64,4), OReg
33943       BuildMI(DispContBB, DL, TII->get(X86::MOV32rm), OReg)
33944           .addReg(BReg)
33945           .addImm(4)
33946           .addReg(IReg64)
33947           .addImm(0)
33948           .addReg(0);
33949       // movsx OReg64, OReg
33950       BuildMI(DispContBB, DL, TII->get(X86::MOVSX64rr32), OReg64).addReg(OReg);
33951       // addq BReg, OReg64, TReg
33952       BuildMI(DispContBB, DL, TII->get(X86::ADD64rr), TReg)
33953           .addReg(OReg64)
33954           .addReg(BReg);
33955       // jmpq *TReg
33956       BuildMI(DispContBB, DL, TII->get(X86::JMP64r)).addReg(TReg);
33957       break;
33958     }
33959     default:
33960       llvm_unreachable("Unexpected jump table encoding");
33961     }
33962   } else {
33963     // jmpl *.LJTI0_0(,IReg,4)
33964     BuildMI(DispContBB, DL, TII->get(X86::JMP32m))
33965         .addReg(0)
33966         .addImm(4)
33967         .addReg(IReg)
33968         .addJumpTableIndex(MJTI)
33969         .addReg(0);
33970   }
33971 
33972   // Add the jump table entries as successors to the MBB.
33973   SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
33974   for (auto &LP : LPadList)
33975     if (SeenMBBs.insert(LP).second)
33976       DispContBB->addSuccessor(LP);
33977 
33978   // N.B. the order the invoke BBs are processed in doesn't matter here.
33979   SmallVector<MachineBasicBlock *, 64> MBBLPads;
33980   const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
33981   for (MachineBasicBlock *MBB : InvokeBBs) {
33982     // Remove the landing pad successor from the invoke block and replace it
33983     // with the new dispatch block.
33984     // Keep a copy of Successors since it's modified inside the loop.
33985     SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
33986                                                    MBB->succ_rend());
33987     // FIXME: Avoid quadratic complexity.
33988     for (auto MBBS : Successors) {
33989       if (MBBS->isEHPad()) {
33990         MBB->removeSuccessor(MBBS);
33991         MBBLPads.push_back(MBBS);
33992       }
33993     }
33994 
33995     MBB->addSuccessor(DispatchBB);
33996 
33997     // Find the invoke call and mark all of the callee-saved registers as
33998     // 'implicit defined' so that they're spilled.  This prevents code from
33999     // moving instructions to before the EH block, where they will never be
34000     // executed.
34001     for (auto &II : reverse(*MBB)) {
34002       if (!II.isCall())
34003         continue;
34004 
34005       DenseMap<unsigned, bool> DefRegs;
34006       for (auto &MOp : II.operands())
34007         if (MOp.isReg())
34008           DefRegs[MOp.getReg()] = true;
34009 
34010       MachineInstrBuilder MIB(*MF, &II);
34011       for (unsigned RegIdx = 0; SavedRegs[RegIdx]; ++RegIdx) {
34012         unsigned Reg = SavedRegs[RegIdx];
34013         if (!DefRegs[Reg])
34014           MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
34015       }
34016 
34017       break;
34018     }
34019   }
34020 
34021   // Mark all former landing pads as non-landing pads.  The dispatch is the only
34022   // landing pad now.
34023   for (auto &LP : MBBLPads)
34024     LP->setIsEHPad(false);
34025 
34026   // The instruction is gone now.
34027   MI.eraseFromParent();
34028   return BB;
34029 }
34030 
34031 MachineBasicBlock *
34032 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
34033                                                MachineBasicBlock *BB) const {
34034   MachineFunction *MF = BB->getParent();
34035   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
34036   const DebugLoc &DL = MI.getDebugLoc();
34037 
34038   auto TMMImmToTMMReg = [](unsigned Imm) {
34039     assert (Imm < 8 && "Illegal tmm index");
34040     return X86::TMM0 + Imm;
34041   };
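  // The AMX pseudos handled below (PTDPB*, PTILEZERO, PTILELOADD, ...) carry
  // their tile operands as immediates; TMMImmToTMMReg maps such an immediate
  // to the corresponding physical tile register, e.g. 2 -> X86::TMM2.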
34042   switch (MI.getOpcode()) {
34043   default: llvm_unreachable("Unexpected instr type to insert");
34044   case X86::TLS_addr32:
34045   case X86::TLS_addr64:
34046   case X86::TLS_addrX32:
34047   case X86::TLS_base_addr32:
34048   case X86::TLS_base_addr64:
34049   case X86::TLS_base_addrX32:
34050     return EmitLoweredTLSAddr(MI, BB);
34051   case X86::INDIRECT_THUNK_CALL32:
34052   case X86::INDIRECT_THUNK_CALL64:
34053   case X86::INDIRECT_THUNK_TCRETURN32:
34054   case X86::INDIRECT_THUNK_TCRETURN64:
34055     return EmitLoweredIndirectThunk(MI, BB);
34056   case X86::CATCHRET:
34057     return EmitLoweredCatchRet(MI, BB);
34058   case X86::SEG_ALLOCA_32:
34059   case X86::SEG_ALLOCA_64:
34060     return EmitLoweredSegAlloca(MI, BB);
34061   case X86::PROBED_ALLOCA_32:
34062   case X86::PROBED_ALLOCA_64:
34063     return EmitLoweredProbedAlloca(MI, BB);
34064   case X86::TLSCall_32:
34065   case X86::TLSCall_64:
34066     return EmitLoweredTLSCall(MI, BB);
34067   case X86::CMOV_FR32:
34068   case X86::CMOV_FR32X:
34069   case X86::CMOV_FR64:
34070   case X86::CMOV_FR64X:
34071   case X86::CMOV_GR8:
34072   case X86::CMOV_GR16:
34073   case X86::CMOV_GR32:
34074   case X86::CMOV_RFP32:
34075   case X86::CMOV_RFP64:
34076   case X86::CMOV_RFP80:
34077   case X86::CMOV_VR64:
34078   case X86::CMOV_VR128:
34079   case X86::CMOV_VR128X:
34080   case X86::CMOV_VR256:
34081   case X86::CMOV_VR256X:
34082   case X86::CMOV_VR512:
34083   case X86::CMOV_VK1:
34084   case X86::CMOV_VK2:
34085   case X86::CMOV_VK4:
34086   case X86::CMOV_VK8:
34087   case X86::CMOV_VK16:
34088   case X86::CMOV_VK32:
34089   case X86::CMOV_VK64:
34090     return EmitLoweredSelect(MI, BB);
34091 
34092   case X86::RDFLAGS32:
34093   case X86::RDFLAGS64: {
34094     unsigned PushF =
34095         MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
34096     unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
34097     MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
34098     // Permit reads of the EFLAGS and DF registers without them being defined.
34099     // This intrinsic exists to read external processor state in flags, such as
34100     // the trap flag, interrupt flag, and direction flag, none of which are
34101     // modeled by the backend.
34102     assert(Push->getOperand(2).getReg() == X86::EFLAGS &&
34103            "Unexpected register in operand!");
34104     Push->getOperand(2).setIsUndef();
34105     assert(Push->getOperand(3).getReg() == X86::DF &&
34106            "Unexpected register in operand!");
34107     Push->getOperand(3).setIsUndef();
34108     BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());
34109 
34110     MI.eraseFromParent(); // The pseudo is gone now.
34111     return BB;
34112   }
34113 
34114   case X86::WRFLAGS32:
34115   case X86::WRFLAGS64: {
34116     unsigned Push =
34117         MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
34118     unsigned PopF =
34119         MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
34120     BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
34121     BuildMI(*BB, MI, DL, TII->get(PopF));
34122 
34123     MI.eraseFromParent(); // The pseudo is gone now.
34124     return BB;
34125   }
34126 
34127   case X86::FP32_TO_INT16_IN_MEM:
34128   case X86::FP32_TO_INT32_IN_MEM:
34129   case X86::FP32_TO_INT64_IN_MEM:
34130   case X86::FP64_TO_INT16_IN_MEM:
34131   case X86::FP64_TO_INT32_IN_MEM:
34132   case X86::FP64_TO_INT64_IN_MEM:
34133   case X86::FP80_TO_INT16_IN_MEM:
34134   case X86::FP80_TO_INT32_IN_MEM:
34135   case X86::FP80_TO_INT64_IN_MEM: {
34136     // Change the floating point control register to use "round towards zero"
34137     // mode when truncating to an integer value.
34138     int OrigCWFrameIdx =
34139         MF->getFrameInfo().CreateStackObject(2, Align(2), false);
34140     addFrameReference(BuildMI(*BB, MI, DL,
34141                               TII->get(X86::FNSTCW16m)), OrigCWFrameIdx);
34142 
34143     // Load the old value of the control word...
34144     Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
34145     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOVZX32rm16), OldCW),
34146                       OrigCWFrameIdx);
34147 
34148     // OR 0b11 into bits 10 and 11. 0b11 is the encoding for round toward zero.
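    // For example, the x87 default control word 0x037F becomes 0x0F7F
    // (illustrative only; just bits 10-11, the rounding control, change).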
34149     Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
34150     BuildMI(*BB, MI, DL, TII->get(X86::OR32ri), NewCW)
34151       .addReg(OldCW, RegState::Kill).addImm(0xC00);
34152 
34153     // Extract to 16 bits.
34154     Register NewCW16 =
34155         MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
34156     BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), NewCW16)
34157       .addReg(NewCW, RegState::Kill, X86::sub_16bit);
34158 
34159     // Prepare memory for FLDCW.
34160     int NewCWFrameIdx =
34161         MF->getFrameInfo().CreateStackObject(2, Align(2), false);
34162     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)),
34163                       NewCWFrameIdx)
34164       .addReg(NewCW16, RegState::Kill);
34165 
34166     // Reload the modified control word now...
34167     addFrameReference(BuildMI(*BB, MI, DL,
34168                               TII->get(X86::FLDCW16m)), NewCWFrameIdx);
34169 
34170     // Get the X86 opcode to use.
34171     unsigned Opc;
34172     switch (MI.getOpcode()) {
34173     default: llvm_unreachable("illegal opcode!");
34174     case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
34175     case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
34176     case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
34177     case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
34178     case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
34179     case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
34180     case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
34181     case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
34182     case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
34183     }
34184 
34185     X86AddressMode AM = getAddressFromInstr(&MI, 0);
34186     addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
34187         .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
34188 
34189     // Reload the original control word now.
34190     addFrameReference(BuildMI(*BB, MI, DL,
34191                               TII->get(X86::FLDCW16m)), OrigCWFrameIdx);
34192 
34193     MI.eraseFromParent(); // The pseudo instruction is gone now.
34194     return BB;
34195   }
34196 
34197   // xbegin
34198   case X86::XBEGIN:
34199     return emitXBegin(MI, BB, Subtarget.getInstrInfo());
34200 
34201   case X86::VAARG_64:
34202   case X86::VAARG_X32:
34203     return EmitVAARGWithCustomInserter(MI, BB);
34204 
34205   case X86::EH_SjLj_SetJmp32:
34206   case X86::EH_SjLj_SetJmp64:
34207     return emitEHSjLjSetJmp(MI, BB);
34208 
34209   case X86::EH_SjLj_LongJmp32:
34210   case X86::EH_SjLj_LongJmp64:
34211     return emitEHSjLjLongJmp(MI, BB);
34212 
34213   case X86::Int_eh_sjlj_setup_dispatch:
34214     return EmitSjLjDispatchBlock(MI, BB);
34215 
34216   case TargetOpcode::STATEPOINT:
34217     // As an implementation detail, STATEPOINT shares the STACKMAP format at
34218     // this point in the process.  We diverge later.
34219     return emitPatchPoint(MI, BB);
34220 
34221   case TargetOpcode::STACKMAP:
34222   case TargetOpcode::PATCHPOINT:
34223     return emitPatchPoint(MI, BB);
34224 
34225   case TargetOpcode::PATCHABLE_EVENT_CALL:
34226   case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
34227     return BB;
34228 
34229   case X86::LCMPXCHG8B: {
34230     const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
34231     // In addition to the four E[ABCD] registers implied by its encoding,
34232     // CMPXCHG8B requires a memory operand. If the current architecture is
34233     // i686 and the current function needs a base pointer (which is ESI on
34234     // i686), the register allocator would not be able to allocate registers
34235     // for an address of the form X(%reg, %reg, Y): there would never be
34236     // enough unreserved registers during regalloc (without the need for a
34237     // base pointer the only option would be X(%edi, %esi, Y)).
34238     // We give the register allocator a hand by precomputing the address in
34239     // a new vreg using LEA.
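    //
    // Illustrative sketch (register names are placeholders): a memory
    // operand of the form disp(%base, %index, scale) is rewritten as
    //   leal disp(%base, %index, scale), %vreg_addr
    //   lock cmpxchg8b (%vreg_addr)
    // so the allocator only needs to find one register for the address.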
34240 
34241     // If it is not i686 or there is no base pointer - nothing to do here.
34242     if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
34243       return BB;
34244 
34245     // Even though this code does not strictly need the base pointer to
34246     // be ESI, we check for that. The reason: if this assert fails, the
34247     // compiler's base pointer handling has changed and most probably has
34248     // to be addressed somehow here as well.
34249     assert(TRI->getBaseRegister() == X86::ESI &&
34250            "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
34251            "base pointer in mind");
34252 
34253     MachineRegisterInfo &MRI = MF->getRegInfo();
34254     MVT SPTy = getPointerTy(MF->getDataLayout());
34255     const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
34256     Register computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
34257 
34258     X86AddressMode AM = getAddressFromInstr(&MI, 0);
34259     // Regalloc does not need any help when the memory operand of CMPXCHG8B
34260     // does not use index register.
34261     if (AM.IndexReg == X86::NoRegister)
34262       return BB;
34263 
34264     // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
34265     // four operand definitions that are E[ABCD] registers. We skip them and
34266     // then insert the LEA.
34267     MachineBasicBlock::reverse_iterator RMBBI(MI.getReverseIterator());
34268     while (RMBBI != BB->rend() && (RMBBI->definesRegister(X86::EAX) ||
34269                                    RMBBI->definesRegister(X86::EBX) ||
34270                                    RMBBI->definesRegister(X86::ECX) ||
34271                                    RMBBI->definesRegister(X86::EDX))) {
34272       ++RMBBI;
34273     }
34274     MachineBasicBlock::iterator MBBI(RMBBI);
34275     addFullAddress(
34276         BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);
34277 
34278     setDirectAddressInInstr(&MI, 0, computedAddrVReg);
34279 
34280     return BB;
34281   }
34282   case X86::LCMPXCHG16B_NO_RBX: {
34283     const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
34284     Register BasePtr = TRI->getBaseRegister();
34285     if (TRI->hasBasePointer(*MF) &&
34286         (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
34287       if (!BB->isLiveIn(BasePtr))
34288         BB->addLiveIn(BasePtr);
34289       // Save RBX into a virtual register.
34290       Register SaveRBX =
34291           MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
34292       BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), SaveRBX)
34293           .addReg(X86::RBX);
34294       Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
34295       MachineInstrBuilder MIB =
34296           BuildMI(*BB, MI, DL, TII->get(X86::LCMPXCHG16B_SAVE_RBX), Dst);
34297       for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
34298         MIB.add(MI.getOperand(Idx));
34299       MIB.add(MI.getOperand(X86::AddrNumOperands));
34300       MIB.addReg(SaveRBX);
34301     } else {
34302       // Simple case, just copy the virtual register to RBX.
34303       BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::RBX)
34304           .add(MI.getOperand(X86::AddrNumOperands));
34305       MachineInstrBuilder MIB =
34306           BuildMI(*BB, MI, DL, TII->get(X86::LCMPXCHG16B));
34307       for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
34308         MIB.add(MI.getOperand(Idx));
34309     }
34310     MI.eraseFromParent();
34311     return BB;
34312   }
34313   case X86::MWAITX: {
34314     const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
34315     Register BasePtr = TRI->getBaseRegister();
34316     bool IsRBX = (BasePtr == X86::RBX || BasePtr == X86::EBX);
34317     // If there is no need to save the base pointer, we generate MWAITXrrr;
34318     // otherwise we generate the MWAITX_SAVE_RBX pseudo.
34319     if (!IsRBX || !TRI->hasBasePointer(*MF)) {
34320       BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::ECX)
34321           .addReg(MI.getOperand(0).getReg());
34322       BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EAX)
34323           .addReg(MI.getOperand(1).getReg());
34324       BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EBX)
34325           .addReg(MI.getOperand(2).getReg());
34326       BuildMI(*BB, MI, DL, TII->get(X86::MWAITXrrr));
34327       MI.eraseFromParent();
34328     } else {
34329       if (!BB->isLiveIn(BasePtr)) {
34330         BB->addLiveIn(BasePtr);
34331       }
34332       // Parameters can be copied into ECX and EAX but not EBX yet.
34333       BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::ECX)
34334           .addReg(MI.getOperand(0).getReg());
34335       BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EAX)
34336           .addReg(MI.getOperand(1).getReg());
34337       assert(Subtarget.is64Bit() && "Expected 64-bit mode!");
34338       // Save RBX into a virtual register.
34339       Register SaveRBX =
34340           MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
34341       BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), SaveRBX)
34342           .addReg(X86::RBX);
34343       // Generate mwaitx pseudo.
34344       Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
34345       BuildMI(*BB, MI, DL, TII->get(X86::MWAITX_SAVE_RBX))
34346           .addDef(Dst) // Destination tied in with SaveRBX.
34347           .addReg(MI.getOperand(2).getReg()) // input value of EBX.
34348           .addUse(SaveRBX);                  // Save of base pointer.
34349       MI.eraseFromParent();
34350     }
34351     return BB;
34352   }
34353   case TargetOpcode::PREALLOCATED_SETUP: {
34354     assert(Subtarget.is32Bit() && "preallocated only used in 32-bit");
34355     auto MFI = MF->getInfo<X86MachineFunctionInfo>();
34356     MFI->setHasPreallocatedCall(true);
34357     int64_t PreallocatedId = MI.getOperand(0).getImm();
34358     size_t StackAdjustment = MFI->getPreallocatedStackSize(PreallocatedId);
34359     assert(StackAdjustment != 0 && "0 stack adjustment");
34360     LLVM_DEBUG(dbgs() << "PREALLOCATED_SETUP stack adjustment "
34361                       << StackAdjustment << "\n");
34362     BuildMI(*BB, MI, DL, TII->get(X86::SUB32ri), X86::ESP)
34363         .addReg(X86::ESP)
34364         .addImm(StackAdjustment);
34365     MI.eraseFromParent();
34366     return BB;
34367   }
34368   case TargetOpcode::PREALLOCATED_ARG: {
34369     assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit");
34370     int64_t PreallocatedId = MI.getOperand(1).getImm();
34371     int64_t ArgIdx = MI.getOperand(2).getImm();
34372     auto MFI = MF->getInfo<X86MachineFunctionInfo>();
34373     size_t ArgOffset = MFI->getPreallocatedArgOffsets(PreallocatedId)[ArgIdx];
34374     LLVM_DEBUG(dbgs() << "PREALLOCATED_ARG arg index " << ArgIdx
34375                       << ", arg offset " << ArgOffset << "\n");
34376     // stack pointer + offset
34377     addRegOffset(
34378         BuildMI(*BB, MI, DL, TII->get(X86::LEA32r), MI.getOperand(0).getReg()),
34379         X86::ESP, false, ArgOffset);
34380     MI.eraseFromParent();
34381     return BB;
34382   }
34383   case X86::PTDPBSSD:
34384   case X86::PTDPBSUD:
34385   case X86::PTDPBUSD:
34386   case X86::PTDPBUUD:
34387   case X86::PTDPBF16PS: {
34388     unsigned Opc;
34389     switch (MI.getOpcode()) {
34390     case X86::PTDPBSSD: Opc = X86::TDPBSSD; break;
34391     case X86::PTDPBSUD: Opc = X86::TDPBSUD; break;
34392     case X86::PTDPBUSD: Opc = X86::TDPBUSD; break;
34393     case X86::PTDPBUUD: Opc = X86::TDPBUUD; break;
34394     case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break;
34395     }
34396 
34397     MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
34398     MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
34399     MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
34400     MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
34401     MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
34402 
34403     MI.eraseFromParent(); // The pseudo is gone now.
34404     return BB;
34405   }
34406   case X86::PTILEZERO: {
34407     unsigned Imm = MI.getOperand(0).getImm();
34408     BuildMI(*BB, MI, DL, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm));
34409     MI.eraseFromParent(); // The pseudo is gone now.
34410     return BB;
34411   }
34412   case X86::PTILELOADD:
34413   case X86::PTILELOADDT1:
34414   case X86::PTILESTORED: {
34415     unsigned Opc;
34416     switch (MI.getOpcode()) {
34417     case X86::PTILELOADD:   Opc = X86::TILELOADD;   break;
34418     case X86::PTILELOADDT1: Opc = X86::TILELOADDT1; break;
34419     case X86::PTILESTORED:  Opc = X86::TILESTORED;  break;
34420     }
34421 
34422     MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
34423     unsigned CurOp = 0;
34424     if (Opc != X86::TILESTORED)
34425       MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
34426                  RegState::Define);
34427 
34428     MIB.add(MI.getOperand(CurOp++)); // base
34429     MIB.add(MI.getOperand(CurOp++)); // scale
34430     MIB.add(MI.getOperand(CurOp++)); // index -- stride
34431     MIB.add(MI.getOperand(CurOp++)); // displacement
34432     MIB.add(MI.getOperand(CurOp++)); // segment
34433 
34434     if (Opc == X86::TILESTORED)
34435       MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
34436                  RegState::Undef);
34437 
34438     MI.eraseFromParent(); // The pseudo is gone now.
34439     return BB;
34440   }
34441   }
34442 }
34443 
34444 //===----------------------------------------------------------------------===//
34445 //                           X86 Optimization Hooks
34446 //===----------------------------------------------------------------------===//
34447 
34448 bool
34449 X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
34450                                                 const APInt &DemandedBits,
34451                                                 const APInt &DemandedElts,
34452                                                 TargetLoweringOpt &TLO) const {
34453   EVT VT = Op.getValueType();
34454   unsigned Opcode = Op.getOpcode();
34455   unsigned EltSize = VT.getScalarSizeInBits();
34456 
34457   if (VT.isVector()) {
34458     // If the constant is all sign bits within the active bits, then we should
34459     // extend it to the entire constant to allow it to act as a boolean
34460     // constant vector.
34461     auto NeedsSignExtension = [&](SDValue V, unsigned ActiveBits) {
34462       if (!ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
34463         return false;
34464       for (unsigned i = 0, e = V.getNumOperands(); i != e; ++i) {
34465         if (!DemandedElts[i] || V.getOperand(i).isUndef())
34466           continue;
34467         const APInt &Val = V.getConstantOperandAPInt(i);
34468         if (Val.getBitWidth() > Val.getNumSignBits() &&
34469             Val.trunc(ActiveBits).getNumSignBits() == ActiveBits)
34470           return true;
34471       }
34472       return false;
34473     };
34474     // For vectors - if we have a constant, then try to sign extend.
34475     // TODO: Handle AND/ANDN cases.
34476     unsigned ActiveBits = DemandedBits.getActiveBits();
34477     if (EltSize > ActiveBits && EltSize > 1 && isTypeLegal(VT) &&
34478         (Opcode == ISD::OR || Opcode == ISD::XOR) &&
34479         NeedsSignExtension(Op.getOperand(1), ActiveBits)) {
34480       EVT ExtSVT = EVT::getIntegerVT(*TLO.DAG.getContext(), ActiveBits);
34481       EVT ExtVT = EVT::getVectorVT(*TLO.DAG.getContext(), ExtSVT,
34482                                     VT.getVectorNumElements());
34483       SDValue NewC =
34484           TLO.DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(Op), VT,
34485                           Op.getOperand(1), TLO.DAG.getValueType(ExtVT));
34486       SDValue NewOp =
34487           TLO.DAG.getNode(Opcode, SDLoc(Op), VT, Op.getOperand(0), NewC);
34488       return TLO.CombineTo(Op, NewOp);
34489     }
34490     return false;
34491   }
34492 
34493   // Only optimize Ands to prevent shrinking a constant that could be
34494   // matched by movzx.
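  // Worked example (illustrative): with DemandedBits covering only the low
  // 12 bits, an AND mask of 0x1FFF first shrinks to 0xFFF; the code below
  // then widens it back to the byte-rounded mask 0xFFFF, which remains
  // matchable as a 16-bit zero-extend, and uses it only if it is covered by
  // the original mask plus the non-demanded bits.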
34495   if (Opcode != ISD::AND)
34496     return false;
34497 
34498   // Make sure the RHS really is a constant.
34499   ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
34500   if (!C)
34501     return false;
34502 
34503   const APInt &Mask = C->getAPIntValue();
34504 
34505   // Clear all non-demanded bits initially.
34506   APInt ShrunkMask = Mask & DemandedBits;
34507 
34508   // Find the width of the shrunk mask.
34509   unsigned Width = ShrunkMask.getActiveBits();
34510 
34511   // If the mask is all 0s there's nothing to do here.
34512   if (Width == 0)
34513     return false;
34514 
34515   // Find the next power of 2 width, rounding up to a byte.
34516   Width = PowerOf2Ceil(std::max(Width, 8U));
34517   // Truncate the width to size to handle illegal types.
34518   Width = std::min(Width, EltSize);
34519 
34520   // Calculate a possible zero extend mask for this constant.
34521   APInt ZeroExtendMask = APInt::getLowBitsSet(EltSize, Width);
34522 
34523   // If we aren't changing the mask, just return true to keep it and prevent
34524   // the caller from optimizing.
34525   if (ZeroExtendMask == Mask)
34526     return true;
34527 
34528   // Make sure the new mask can be represented by a combination of mask bits
34529   // and non-demanded bits.
34530   if (!ZeroExtendMask.isSubsetOf(Mask | ~DemandedBits))
34531     return false;
34532 
34533   // Replace the constant with the zero extend mask.
34534   SDLoc DL(Op);
34535   SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);
34536   SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
34537   return TLO.CombineTo(Op, NewOp);
34538 }
34539 
34540 void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
34541                                                       KnownBits &Known,
34542                                                       const APInt &DemandedElts,
34543                                                       const SelectionDAG &DAG,
34544                                                       unsigned Depth) const {
34545   unsigned BitWidth = Known.getBitWidth();
34546   unsigned NumElts = DemandedElts.getBitWidth();
34547   unsigned Opc = Op.getOpcode();
34548   EVT VT = Op.getValueType();
34549   assert((Opc >= ISD::BUILTIN_OP_END ||
34550           Opc == ISD::INTRINSIC_WO_CHAIN ||
34551           Opc == ISD::INTRINSIC_W_CHAIN ||
34552           Opc == ISD::INTRINSIC_VOID) &&
34553          "Should use MaskedValueIsZero if you don't know whether Op"
34554          " is a target node!");
34555 
34556   Known.resetAll();
34557   switch (Opc) {
34558   default: break;
34559   case X86ISD::SETCC:
34560     Known.Zero.setBitsFrom(1);
34561     break;
34562   case X86ISD::MOVMSK: {
34563     unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
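          // MOVMSK packs one sign bit per source element into the low bits of
          // the result, so every bit at or above NumLoBits is known zero.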
34564     Known.Zero.setBitsFrom(NumLoBits);
34565     break;
34566   }
34567   case X86ISD::PEXTRB:
34568   case X86ISD::PEXTRW: {
34569     SDValue Src = Op.getOperand(0);
34570     EVT SrcVT = Src.getValueType();
34571     APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
34572                                             Op.getConstantOperandVal(1));
34573     Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1);
34574     Known = Known.anyextOrTrunc(BitWidth);
34575     Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
34576     break;
34577   }
34578   case X86ISD::VSRAI:
34579   case X86ISD::VSHLI:
34580   case X86ISD::VSRLI: {
34581     unsigned ShAmt = Op.getConstantOperandVal(1);
34582     if (ShAmt >= VT.getScalarSizeInBits()) {
34583       Known.setAllZero();
34584       break;
34585     }
34586 
34587     Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
34588     if (Opc == X86ISD::VSHLI) {
34589       Known.Zero <<= ShAmt;
34590       Known.One <<= ShAmt;
34591       // Low bits are known zero.
34592       Known.Zero.setLowBits(ShAmt);
34593     } else if (Opc == X86ISD::VSRLI) {
34594       Known.Zero.lshrInPlace(ShAmt);
34595       Known.One.lshrInPlace(ShAmt);
34596       // High bits are known zero.
34597       Known.Zero.setHighBits(ShAmt);
34598     } else {
34599       Known.Zero.ashrInPlace(ShAmt);
34600       Known.One.ashrInPlace(ShAmt);
34601     }
34602     break;
34603   }
34604   case X86ISD::PACKUS: {
34605     // PACKUS is just a truncation if the upper half is zero.
34606     APInt DemandedLHS, DemandedRHS;
34607     getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
34608 
34609     Known.One = APInt::getAllOnesValue(BitWidth * 2);
34610     Known.Zero = APInt::getAllOnesValue(BitWidth * 2);
34611 
34612     KnownBits Known2;
34613     if (!!DemandedLHS) {
34614       Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedLHS, Depth + 1);
34615       Known = KnownBits::commonBits(Known, Known2);
34616     }
34617     if (!!DemandedRHS) {
34618       Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedRHS, Depth + 1);
34619       Known = KnownBits::commonBits(Known, Known2);
34620     }
34621 
34622     if (Known.countMinLeadingZeros() < BitWidth)
34623       Known.resetAll();
34624     Known = Known.trunc(BitWidth);
34625     break;
34626   }
34627   case X86ISD::VBROADCAST: {
34628     SDValue Src = Op.getOperand(0);
34629     if (!Src.getSimpleValueType().isVector()) {
34630       Known = DAG.computeKnownBits(Src, Depth + 1);
34631       return;
34632     }
34633     break;
34634   }
34635   case X86ISD::ANDNP: {
34636     KnownBits Known2;
34637     Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
34638     Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
34639 
34640     // ANDNP = (~X & Y);
34641     Known.One &= Known2.Zero;
34642     Known.Zero |= Known2.One;
34643     break;
34644   }
34645   case X86ISD::FOR: {
34646     KnownBits Known2;
34647     Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
34648     Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
34649 
34650     Known |= Known2;
34651     break;
34652   }
34653   case X86ISD::PSADBW: {
34654     assert(VT.getScalarType() == MVT::i64 &&
34655            Op.getOperand(0).getValueType().getScalarType() == MVT::i8 &&
34656            "Unexpected PSADBW types");
34657 
34658     // PSADBW - fills low 16 bits and zeros upper 48 bits of each i64 result.
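          // Each i64 element is a sum of eight absolute byte differences, so
          // its maximum value is 8 * 255 = 2040 < 2^16.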
34659     Known.Zero.setBitsFrom(16);
34660     break;
34661   }
34662   case X86ISD::PMULUDQ: {
34663     KnownBits Known2;
34664     Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
34665     Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
34666 
34667     Known = Known.trunc(BitWidth / 2).zext(BitWidth);
34668     Known2 = Known2.trunc(BitWidth / 2).zext(BitWidth);
34669     Known = KnownBits::mul(Known, Known2);
34670     break;
34671   }
34672   case X86ISD::CMOV: {
34673     Known = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
34674     // If we don't know any bits, early out.
34675     if (Known.isUnknown())
34676       break;
34677     KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
34678 
34679     // Only known if known in both the LHS and RHS.
34680     Known = KnownBits::commonBits(Known, Known2);
34681     break;
34682   }
34683   case X86ISD::BEXTR:
34684   case X86ISD::BEXTRI: {
34685     SDValue Op0 = Op.getOperand(0);
34686     SDValue Op1 = Op.getOperand(1);
34687 
34688     if (auto* Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
34689       unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
34690       unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
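            // The control operand packs the start bit in bits [7:0] and the
            // length in bits [15:8]; e.g. a control of 0x0804 extracts
            // (Op0 >> 4) & 0xFF.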
34691 
34692       // If the length is 0, the result is 0.
34693       if (Length == 0) {
34694         Known.setAllZero();
34695         break;
34696       }
34697 
34698       if ((Shift + Length) <= BitWidth) {
34699         Known = DAG.computeKnownBits(Op0, Depth + 1);
34700         Known = Known.extractBits(Length, Shift);
34701         Known = Known.zextOrTrunc(BitWidth);
34702       }
34703     }
34704     break;
34705   }
34706   case X86ISD::PDEP: {
34707     KnownBits Known2;
34708     Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
34709     Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
34710     // Zeros are retained from the mask operand, but ones are not.
34711     Known.One.clearAllBits();
34712     // The result will have at least as many trailing zeros as the non-mask
34713     // operand since bits can only map to the same or higher bit position.
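          // e.g. if the non-mask operand has three trailing zeros, the lowest
          // three set mask positions receive zeros, and the i-th set position
          // sits at bit index >= i, so at least three trailing zeros remain.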
34714     Known.Zero.setLowBits(Known2.countMinTrailingZeros());
34715     break;
34716   }
34717   case X86ISD::PEXT: {
34718     Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
34719     // The result has as many leading zeros as the number of zeroes in the mask.
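          // e.g. a 32-bit mask with only 5 set bits compresses the source into
          // the low 5 bits, leaving at least 27 known-zero high bits.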
34720     unsigned Count = Known.Zero.countPopulation();
34721     Known.Zero = APInt::getHighBitsSet(BitWidth, Count);
34722     Known.One.clearAllBits();
34723     break;
34724   }
34725   case X86ISD::VTRUNC:
34726   case X86ISD::VTRUNCS:
34727   case X86ISD::VTRUNCUS:
34728   case X86ISD::CVTSI2P:
34729   case X86ISD::CVTUI2P:
34730   case X86ISD::CVTP2SI:
34731   case X86ISD::CVTP2UI:
34732   case X86ISD::MCVTP2SI:
34733   case X86ISD::MCVTP2UI:
34734   case X86ISD::CVTTP2SI:
34735   case X86ISD::CVTTP2UI:
34736   case X86ISD::MCVTTP2SI:
34737   case X86ISD::MCVTTP2UI:
34738   case X86ISD::MCVTSI2P:
34739   case X86ISD::MCVTUI2P:
34740   case X86ISD::VFPROUND:
34741   case X86ISD::VMFPROUND:
34742   case X86ISD::CVTPS2PH:
34743   case X86ISD::MCVTPS2PH: {
34744     // Truncations/Conversions - upper elements are known zero.
34745     EVT SrcVT = Op.getOperand(0).getValueType();
34746     if (SrcVT.isVector()) {
34747       unsigned NumSrcElts = SrcVT.getVectorNumElements();
34748       if (NumElts > NumSrcElts &&
34749           DemandedElts.countTrailingZeros() >= NumSrcElts)
34750         Known.setAllZero();
34751     }
34752     break;
34753   }
34754   case X86ISD::STRICT_CVTTP2SI:
34755   case X86ISD::STRICT_CVTTP2UI:
34756   case X86ISD::STRICT_CVTSI2P:
34757   case X86ISD::STRICT_CVTUI2P:
34758   case X86ISD::STRICT_VFPROUND:
34759   case X86ISD::STRICT_CVTPS2PH: {
34760     // Strict Conversions - upper elements are known zero.
34761     EVT SrcVT = Op.getOperand(1).getValueType();
34762     if (SrcVT.isVector()) {
34763       unsigned NumSrcElts = SrcVT.getVectorNumElements();
34764       if (NumElts > NumSrcElts &&
34765           DemandedElts.countTrailingZeros() >= NumSrcElts)
34766         Known.setAllZero();
34767     }
34768     break;
34769   }
34770   case X86ISD::MOVQ2DQ: {
34771     // Move from MMX to XMM. Upper half of XMM should be 0.
34772     if (DemandedElts.countTrailingZeros() >= (NumElts / 2))
34773       Known.setAllZero();
34774     break;
34775   }
34776   }
34777 
34778   // Handle target shuffles.
34779   // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
34780   if (isTargetShuffle(Opc)) {
34781     SmallVector<int, 64> Mask;
34782     SmallVector<SDValue, 2> Ops;
34783     if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask)) {
34784       unsigned NumOps = Ops.size();
34785       unsigned NumElts = VT.getVectorNumElements();
34786       if (Mask.size() == NumElts) {
34787         SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
34788         Known.Zero.setAllBits(); Known.One.setAllBits();
34789         for (unsigned i = 0; i != NumElts; ++i) {
34790           if (!DemandedElts[i])
34791             continue;
34792           int M = Mask[i];
34793           if (M == SM_SentinelUndef) {
34794             // For UNDEF elements, we don't know anything about the common state
34795             // of the shuffle result.
34796             Known.resetAll();
34797             break;
34798           }
34799           if (M == SM_SentinelZero) {
34800             Known.One.clearAllBits();
34801             continue;
34802           }
34803           assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
34804                  "Shuffle index out of range");
34805 
34806           unsigned OpIdx = (unsigned)M / NumElts;
34807           unsigned EltIdx = (unsigned)M % NumElts;
34808           if (Ops[OpIdx].getValueType() != VT) {
34809             // TODO - handle target shuffle ops with different value types.
34810             Known.resetAll();
34811             break;
34812           }
34813           DemandedOps[OpIdx].setBit(EltIdx);
34814         }
34815         // Known bits are the values that are shared by every demanded element.
34816         for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {
34817           if (!DemandedOps[i])
34818             continue;
34819           KnownBits Known2 =
34820               DAG.computeKnownBits(Ops[i], DemandedOps[i], Depth + 1);
34821           Known = KnownBits::commonBits(Known, Known2);
34822         }
34823       }
34824     }
34825   }
34826 }
34827 
34828 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
34829     SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
34830     unsigned Depth) const {
34831   EVT VT = Op.getValueType();
34832   unsigned VTBits = VT.getScalarSizeInBits();
34833   unsigned Opcode = Op.getOpcode();
34834   switch (Opcode) {
34835   case X86ISD::SETCC_CARRY:
34836     // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
34837     return VTBits;
34838 
34839   case X86ISD::VTRUNC: {
34840     SDValue Src = Op.getOperand(0);
34841     MVT SrcVT = Src.getSimpleValueType();
34842     unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
34843     assert(VTBits < NumSrcBits && "Illegal truncation input type");
34844     APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
34845     unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
34846     if (Tmp > (NumSrcBits - VTBits))
34847       return Tmp - (NumSrcBits - VTBits);
34848     return 1;
34849   }
34850 
34851   case X86ISD::PACKSS: {
34852     // PACKSS is just a truncation if the sign bits extend to the packed size.
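          // e.g. when packing i32 -> i16, an input with 20 known sign bits
          // yields at least 20 - (32 - 16) = 4 sign bits per i16 lane.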
34853     APInt DemandedLHS, DemandedRHS;
34854     getPackDemandedElts(Op.getValueType(), DemandedElts, DemandedLHS,
34855                         DemandedRHS);
34856 
34857     unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
34858     unsigned Tmp0 = SrcBits, Tmp1 = SrcBits;
34859     if (!!DemandedLHS)
34860       Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), DemandedLHS, Depth + 1);
34861     if (!!DemandedRHS)
34862       Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), DemandedRHS, Depth + 1);
34863     unsigned Tmp = std::min(Tmp0, Tmp1);
34864     if (Tmp > (SrcBits - VTBits))
34865       return Tmp - (SrcBits - VTBits);
34866     return 1;
34867   }
34868 
34869   case X86ISD::VBROADCAST: {
34870     SDValue Src = Op.getOperand(0);
34871     if (!Src.getSimpleValueType().isVector())
34872       return DAG.ComputeNumSignBits(Src, Depth + 1);
34873     break;
34874   }
34875 
34876   case X86ISD::VSHLI: {
34877     SDValue Src = Op.getOperand(0);
34878     const APInt &ShiftVal = Op.getConstantOperandAPInt(1);
34879     if (ShiftVal.uge(VTBits))
34880       return VTBits; // Shifted all bits out --> zero.
34881     unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
34882     if (ShiftVal.uge(Tmp))
34883       return 1; // Shifted all sign bits out --> unknown.
34884     return Tmp - ShiftVal.getZExtValue();
34885   }
34886 
34887   case X86ISD::VSRAI: {
34888     SDValue Src = Op.getOperand(0);
34889     APInt ShiftVal = Op.getConstantOperandAPInt(1);
34890     if (ShiftVal.uge(VTBits - 1))
34891       return VTBits; // Sign splat.
34892     unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
34893     ShiftVal += Tmp;
34894     return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
34895   }
34896 
34897   case X86ISD::PCMPGT:
34898   case X86ISD::PCMPEQ:
34899   case X86ISD::CMPP:
34900   case X86ISD::VPCOM:
34901   case X86ISD::VPCOMU:
34902     // Vector compares return zero/all-bits result values.
34903     return VTBits;
34904 
34905   case X86ISD::ANDNP: {
34906     unsigned Tmp0 =
34907         DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
34908     if (Tmp0 == 1) return 1; // Early out.
34909     unsigned Tmp1 =
34910         DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
34911     return std::min(Tmp0, Tmp1);
34912   }
34913 
34914   case X86ISD::CMOV: {
34915     unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
34916     if (Tmp0 == 1) return 1;  // Early out.
34917     unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
34918     return std::min(Tmp0, Tmp1);
34919   }
34920   }
34921 
34922   // Handle target shuffles.
34923   // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
34924   if (isTargetShuffle(Opcode)) {
34925     SmallVector<int, 64> Mask;
34926     SmallVector<SDValue, 2> Ops;
34927     if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask)) {
34928       unsigned NumOps = Ops.size();
34929       unsigned NumElts = VT.getVectorNumElements();
34930       if (Mask.size() == NumElts) {
34931         SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
34932         for (unsigned i = 0; i != NumElts; ++i) {
34933           if (!DemandedElts[i])
34934             continue;
34935           int M = Mask[i];
34936           if (M == SM_SentinelUndef) {
34937             // For UNDEF elements, we don't know anything about the common state
34938             // of the shuffle result.
34939             return 1;
34940           } else if (M == SM_SentinelZero) {
34941             // Zero = all sign bits.
34942             continue;
34943           }
34944           assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
34945                  "Shuffle index out of range");
34946 
34947           unsigned OpIdx = (unsigned)M / NumElts;
34948           unsigned EltIdx = (unsigned)M % NumElts;
34949           if (Ops[OpIdx].getValueType() != VT) {
34950             // TODO - handle target shuffle ops with different value types.
34951             return 1;
34952           }
34953           DemandedOps[OpIdx].setBit(EltIdx);
34954         }
34955         unsigned Tmp0 = VTBits;
34956         for (unsigned i = 0; i != NumOps && Tmp0 > 1; ++i) {
34957           if (!DemandedOps[i])
34958             continue;
34959           unsigned Tmp1 =
34960               DAG.ComputeNumSignBits(Ops[i], DemandedOps[i], Depth + 1);
34961           Tmp0 = std::min(Tmp0, Tmp1);
34962         }
34963         return Tmp0;
34964       }
34965     }
34966   }
34967 
34968   // Fallback case.
34969   return 1;
34970 }
34971 
34972 SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
34973   if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
34974     return N->getOperand(0);
34975   return N;
34976 }
34977 
34978 // Helper to look for a normal load that can be narrowed into a vzload with the
34979 // specified VT and memory VT. Returns SDValue() on failure.
34980 static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT,
34981                                   SelectionDAG &DAG) {
34982   // Can't if the load is volatile or atomic.
34983   if (!LN->isSimple())
34984     return SDValue();
34985 
34986   SDVTList Tys = DAG.getVTList(VT, MVT::Other);
34987   SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
34988   return DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, SDLoc(LN), Tys, Ops, MemVT,
34989                                  LN->getPointerInfo(), LN->getOriginalAlign(),
34990                                  LN->getMemOperand()->getFlags());
34991 }
34992 
34993 // Attempt to match a combined shuffle mask against supported unary shuffle
34994 // instructions.
34995 // TODO: Investigate sharing more of this with shuffle lowering.
34996 static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
34997                               bool AllowFloatDomain, bool AllowIntDomain,
34998                               SDValue &V1, const SDLoc &DL, SelectionDAG &DAG,
34999                               const X86Subtarget &Subtarget, unsigned &Shuffle,
35000                               MVT &SrcVT, MVT &DstVT) {
35001   unsigned NumMaskElts = Mask.size();
35002   unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
35003 
35004   // Match against a VZEXT_MOVL vXi32 zero-extending instruction.
35005   if (MaskEltSize == 32 && Mask[0] == 0) {
35006     if (isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) {
35007       Shuffle = X86ISD::VZEXT_MOVL;
35008       SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
35009       return true;
35010     }
35011     if (V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
35012         isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
35013       Shuffle = X86ISD::VZEXT_MOVL;
35014       SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
35015       return true;
35016     }
35017   }
35018 
35019   // Match against an ANY/ZERO_EXTEND_VECTOR_INREG instruction.
35020   // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
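        // e.g. with Scale == 2, a v16i8 mask of the form {0, Z, 1, Z, ..., 7, Z}
        // (Z = zero or undef) matches a ZERO_EXTEND_VECTOR_INREG of the low
        // eight bytes to v8i16.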
35021   if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
35022                          (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
35023     unsigned MaxScale = 64 / MaskEltSize;
35024     for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
35025       bool MatchAny = true;
35026       bool MatchZero = true;
35027       unsigned NumDstElts = NumMaskElts / Scale;
35028       for (unsigned i = 0; i != NumDstElts && (MatchAny || MatchZero); ++i) {
35029         if (!isUndefOrEqual(Mask[i * Scale], (int)i)) {
35030           MatchAny = MatchZero = false;
35031           break;
35032         }
35033         MatchAny &= isUndefInRange(Mask, (i * Scale) + 1, Scale - 1);
35034         MatchZero &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
35035       }
35036       if (MatchAny || MatchZero) {
35037         assert(MatchZero && "Failed to match zext but matched aext?");
35038         unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
35039         MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType() :
35040                                             MVT::getIntegerVT(MaskEltSize);
35041         SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
35042 
35043         if (SrcVT.getSizeInBits() != MaskVT.getSizeInBits())
35044           V1 = extractSubVector(V1, 0, DAG, DL, SrcSize);
35045 
35046         Shuffle = unsigned(MatchAny ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND);
35047         if (SrcVT.getVectorNumElements() != NumDstElts)
35048           Shuffle = getOpcode_EXTEND_VECTOR_INREG(Shuffle);
35049 
35050         DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
35051         DstVT = MVT::getVectorVT(DstVT, NumDstElts);
35052         return true;
35053       }
35054     }
35055   }
35056 
35057   // Match against a VZEXT_MOVL instruction; SSE1 only supports 32 bits (MOVSS).
35058   if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
35059       isUndefOrEqual(Mask[0], 0) &&
35060       isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
35061     Shuffle = X86ISD::VZEXT_MOVL;
35062     SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
35063     return true;
35064   }
35065 
35066   // Check if we have SSE3, which will let us use MOVDDUP etc. These
35067   // instructions are no slower than UNPCKLPD but have the option to
35068   // fold the input operand into even an unaligned memory load.
35069   if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
35070     if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, V1)) {
35071       Shuffle = X86ISD::MOVDDUP;
35072       SrcVT = DstVT = MVT::v2f64;
35073       return true;
35074     }
35075     if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, V1)) {
35076       Shuffle = X86ISD::MOVSLDUP;
35077       SrcVT = DstVT = MVT::v4f32;
35078       return true;
35079     }
35080     if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3}, V1)) {
35081       Shuffle = X86ISD::MOVSHDUP;
35082       SrcVT = DstVT = MVT::v4f32;
35083       return true;
35084     }
35085   }
35086 
35087   if (MaskVT.is256BitVector() && AllowFloatDomain) {
35088     assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
35089     if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, V1)) {
35090       Shuffle = X86ISD::MOVDDUP;
35091       SrcVT = DstVT = MVT::v4f64;
35092       return true;
35093     }
35094     if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1)) {
35095       Shuffle = X86ISD::MOVSLDUP;
35096       SrcVT = DstVT = MVT::v8f32;
35097       return true;
35098     }
35099     if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3, 5, 5, 7, 7}, V1)) {
35100       Shuffle = X86ISD::MOVSHDUP;
35101       SrcVT = DstVT = MVT::v8f32;
35102       return true;
35103     }
35104   }
35105 
35106   if (MaskVT.is512BitVector() && AllowFloatDomain) {
35107     assert(Subtarget.hasAVX512() &&
35108            "AVX512 required for 512-bit vector shuffles");
35109     if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1)) {
35110       Shuffle = X86ISD::MOVDDUP;
35111       SrcVT = DstVT = MVT::v8f64;
35112       return true;
35113     }
35114     if (isTargetShuffleEquivalent(
35115             MaskVT, Mask,
35116             {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}, V1)) {
35117       Shuffle = X86ISD::MOVSLDUP;
35118       SrcVT = DstVT = MVT::v16f32;
35119       return true;
35120     }
35121     if (isTargetShuffleEquivalent(
35122             MaskVT, Mask,
35123             {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}, V1)) {
35124       Shuffle = X86ISD::MOVSHDUP;
35125       SrcVT = DstVT = MVT::v16f32;
35126       return true;
35127     }
35128   }
35129 
35130   return false;
35131 }
35132 
35133 // Attempt to match a combined shuffle mask against supported unary immediate
35134 // permute instructions.
35135 // TODO: Investigate sharing more of this with shuffle lowering.
35136 static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
35137                                      const APInt &Zeroable,
35138                                      bool AllowFloatDomain, bool AllowIntDomain,
35139                                      const X86Subtarget &Subtarget,
35140                                      unsigned &Shuffle, MVT &ShuffleVT,
35141                                      unsigned &PermuteImm) {
35142   unsigned NumMaskElts = Mask.size();
35143   unsigned InputSizeInBits = MaskVT.getSizeInBits();
35144   unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
35145   MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
35146   bool ContainsZeros = isAnyZero(Mask);
35147 
35148   // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
35149   if (!ContainsZeros && MaskScalarSizeInBits == 64) {
35150     // Check for lane crossing permutes.
35151     if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
35152       // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
35153       if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
35154         Shuffle = X86ISD::VPERMI;
35155         ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
35156         PermuteImm = getV4X86ShuffleImm(Mask);
35157         return true;
35158       }
35159       if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
35160         SmallVector<int, 4> RepeatedMask;
35161         if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
35162           Shuffle = X86ISD::VPERMI;
35163           ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
35164           PermuteImm = getV4X86ShuffleImm(RepeatedMask);
35165           return true;
35166         }
35167       }
35168     } else if (AllowFloatDomain && Subtarget.hasAVX()) {
35169       // VPERMILPD can permute with a non-repeating shuffle.
35170       Shuffle = X86ISD::VPERMILPI;
35171       ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
35172       PermuteImm = 0;
35173       for (int i = 0, e = Mask.size(); i != e; ++i) {
35174         int M = Mask[i];
35175         if (M == SM_SentinelUndef)
35176           continue;
35177         assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
35178         PermuteImm |= (M & 1) << i;
35179       }
35180       return true;
35181     }
35182   }
35183 
35184   // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
35185   // AVX introduced the VPERMILPD/VPERMILPS float permutes; before then we
35186   // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
35187   if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
35188       !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
35189     SmallVector<int, 4> RepeatedMask;
35190     if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
35191       // Narrow the repeated mask to create 32-bit element permutes.
35192       SmallVector<int, 4> WordMask = RepeatedMask;
35193       if (MaskScalarSizeInBits == 64)
35194         narrowShuffleMaskElts(2, RepeatedMask, WordMask);
35195 
35196       Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
35197       ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
35198       ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
35199       PermuteImm = getV4X86ShuffleImm(WordMask);
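            // The immediate uses two bits per destination element; e.g. a
            // repeated mask of {3, 2, 1, 0} encodes as 0x1B and reverses each
            // 128-bit lane.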
35200       return true;
35201     }
35202   }
35203 
35204   // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
35205   if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16 &&
35206       ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
35207        (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
35208        (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
35209     SmallVector<int, 4> RepeatedMask;
35210     if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
35211       ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);
35212       ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4);
35213 
35214       // PSHUFLW: permute lower 4 elements only.
35215       if (isUndefOrInRange(LoMask, 0, 4) &&
35216           isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
35217         Shuffle = X86ISD::PSHUFLW;
35218         ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
35219         PermuteImm = getV4X86ShuffleImm(LoMask);
35220         return true;
35221       }
35222 
35223       // PSHUFHW: permute upper 4 elements only.
35224       if (isUndefOrInRange(HiMask, 4, 8) &&
35225           isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
35226         // Offset the HiMask so that we can create the shuffle immediate.
35227         int OffsetHiMask[4];
35228         for (int i = 0; i != 4; ++i)
35229           OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
35230 
35231         Shuffle = X86ISD::PSHUFHW;
35232         ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
35233         PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
35234         return true;
35235       }
35236     }
35237   }
35238 
35239   // Attempt to match against byte/bit shifts.
35240   if (AllowIntDomain &&
35241       ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
35242        (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
35243        (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
35244     int ShiftAmt = matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits,
35245                                        Mask, 0, Zeroable, Subtarget);
35246     if (0 < ShiftAmt && (!ShuffleVT.is512BitVector() || Subtarget.hasBWI() ||
35247                          32 <= ShuffleVT.getScalarSizeInBits())) {
35248       PermuteImm = (unsigned)ShiftAmt;
35249       return true;
35250     }
35251   }
35252 
35253   // Attempt to match against bit rotates.
35254   if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits < 64 &&
35255       ((MaskVT.is128BitVector() && Subtarget.hasXOP()) ||
35256        Subtarget.hasAVX512())) {
35257     int RotateAmt = matchShuffleAsBitRotate(ShuffleVT, MaskScalarSizeInBits,
35258                                             Subtarget, Mask);
35259     if (0 < RotateAmt) {
35260       Shuffle = X86ISD::VROTLI;
35261       PermuteImm = (unsigned)RotateAmt;
35262       return true;
35263     }
35264   }
35265 
35266   return false;
35267 }
35268 
35269 // Attempt to match a combined unary shuffle mask against supported binary
35270 // shuffle instructions.
35271 // TODO: Investigate sharing more of this with shuffle lowering.
35272 static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
35273                                bool AllowFloatDomain, bool AllowIntDomain,
35274                                SDValue &V1, SDValue &V2, const SDLoc &DL,
35275                                SelectionDAG &DAG, const X86Subtarget &Subtarget,
35276                                unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
35277                                bool IsUnary) {
35278   unsigned NumMaskElts = Mask.size();
35279   unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
35280 
35281   if (MaskVT.is128BitVector()) {
35282     if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}) && AllowFloatDomain) {
35283       V2 = V1;
35284       V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
35285       Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS;
35286       SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
35287       return true;
35288     }
35289     if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1}) && AllowFloatDomain) {
35290       V2 = V1;
35291       Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
35292       SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
35293       return true;
35294     }
35295     if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 3}) &&
35296         Subtarget.hasSSE2() && (AllowFloatDomain || !Subtarget.hasSSE41())) {
35297       std::swap(V1, V2);
35298       Shuffle = X86ISD::MOVSD;
35299       SrcVT = DstVT = MVT::v2f64;
35300       return true;
35301     }
35302     if (isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}) &&
35303         (AllowFloatDomain || !Subtarget.hasSSE41())) {
35304       Shuffle = X86ISD::MOVSS;
35305       SrcVT = DstVT = MVT::v4f32;
35306       return true;
35307     }
35308   }
35309 
35310   // Attempt to match against either a unary or binary PACKSS/PACKUS shuffle.
35311   if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) ||
35312       ((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) ||
35313       ((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) {
35314     if (matchShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
35315                              Subtarget)) {
35316       DstVT = MaskVT;
35317       return true;
35318     }
35319   }
35320 
35321   // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
35322   if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
35323       (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
35324       (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
35325       (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
35326       (MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
35327     if (matchShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, DAG,
35328                               Subtarget)) {
35329       SrcVT = DstVT = MaskVT;
35330       if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
35331         SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
35332       return true;
35333     }
35334   }
35335 
35336   // Attempt to match against an OR if we're performing a blend shuffle and the
35337   // non-blended source element is zero in each case.
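        // e.g. for a 4-element mask {0, 5, 2, 7} the result is V1 | V2 provided
        // elements 0 and 2 of V2 and elements 1 and 3 of V1 are known zero.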
35338   if ((EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
35339       (EltSizeInBits % V2.getScalarValueSizeInBits()) == 0) {
35340     bool IsBlend = true;
35341     unsigned NumV1Elts = V1.getValueType().getVectorNumElements();
35342     unsigned NumV2Elts = V2.getValueType().getVectorNumElements();
35343     unsigned Scale1 = NumV1Elts / NumMaskElts;
35344     unsigned Scale2 = NumV2Elts / NumMaskElts;
35345     APInt DemandedZeroV1 = APInt::getNullValue(NumV1Elts);
35346     APInt DemandedZeroV2 = APInt::getNullValue(NumV2Elts);
35347     for (unsigned i = 0; i != NumMaskElts; ++i) {
35348       int M = Mask[i];
35349       if (M == SM_SentinelUndef)
35350         continue;
35351       if (M == SM_SentinelZero) {
35352         DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
35353         DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
35354         continue;
35355       }
35356       if (M == (int)i) {
35357         DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
35358         continue;
35359       }
35360       if (M == (int)(i + NumMaskElts)) {
35361         DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
35362         continue;
35363       }
35364       IsBlend = false;
35365       break;
35366     }
35367     if (IsBlend &&
35368         DAG.computeKnownBits(V1, DemandedZeroV1).isZero() &&
35369         DAG.computeKnownBits(V2, DemandedZeroV2).isZero()) {
35370       Shuffle = ISD::OR;
35371       SrcVT = DstVT = MaskVT.changeTypeToInteger();
35372       return true;
35373     }
35374   }
35375 
35376   return false;
35377 }
35378 
35379 static bool matchBinaryPermuteShuffle(
35380     MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
35381     bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
35382     const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
35383     unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
35384   unsigned NumMaskElts = Mask.size();
35385   unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
35386 
35387   // Attempt to match against VALIGND/VALIGNQ rotate.
35388   if (AllowIntDomain && (EltSizeInBits == 64 || EltSizeInBits == 32) &&
35389       ((MaskVT.is128BitVector() && Subtarget.hasVLX()) ||
35390        (MaskVT.is256BitVector() && Subtarget.hasVLX()) ||
35391        (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
35392     if (!isAnyZero(Mask)) {
35393       int Rotation = matchShuffleAsElementRotate(V1, V2, Mask);
35394       if (0 < Rotation) {
35395         Shuffle = X86ISD::VALIGN;
35396         if (EltSizeInBits == 64)
35397           ShuffleVT = MVT::getVectorVT(MVT::i64, MaskVT.getSizeInBits() / 64);
35398         else
35399           ShuffleVT = MVT::getVectorVT(MVT::i32, MaskVT.getSizeInBits() / 32);
35400         PermuteImm = Rotation;
35401         return true;
35402       }
35403     }
35404   }
35405 
35406   // Attempt to match against PALIGNR byte rotate.
35407   if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
35408                          (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
35409                          (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
35410     int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask);
35411     if (0 < ByteRotation) {
35412       Shuffle = X86ISD::PALIGNR;
35413       ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
35414       PermuteImm = ByteRotation;
35415       return true;
35416     }
35417   }
35418 
35419   // Attempt to combine to X86ISD::BLENDI.
35420   if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
35421                             (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
35422       (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
35423     uint64_t BlendMask = 0;
35424     bool ForceV1Zero = false, ForceV2Zero = false;
35425     SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end());
35426     if (matchShuffleAsBlend(V1, V2, TargetMask, Zeroable, ForceV1Zero,
35427                             ForceV2Zero, BlendMask)) {
35428       if (MaskVT == MVT::v16i16) {
35429         // We can only use v16i16 PBLENDW if the lanes are repeated.
35430         SmallVector<int, 8> RepeatedMask;
35431         if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
35432                                         RepeatedMask)) {
35433           assert(RepeatedMask.size() == 8 &&
35434                  "Repeated mask size doesn't match!");
35435           PermuteImm = 0;
35436           for (int i = 0; i < 8; ++i)
35437             if (RepeatedMask[i] >= 8)
35438               PermuteImm |= 1 << i;
35439           V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
35440           V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
35441           Shuffle = X86ISD::BLENDI;
35442           ShuffleVT = MaskVT;
35443           return true;
35444         }
35445       } else {
35446         V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
35447         V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
35448         PermuteImm = (unsigned)BlendMask;
35449         Shuffle = X86ISD::BLENDI;
35450         ShuffleVT = MaskVT;
35451         return true;
35452       }
35453     }
35454   }
35455 
35456   // Attempt to combine to INSERTPS, but only if it has elements that need to
35457   // be set to zero.
35458   if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
35459       MaskVT.is128BitVector() && isAnyZero(Mask) &&
35460       matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
35461     Shuffle = X86ISD::INSERTPS;
35462     ShuffleVT = MVT::v4f32;
35463     return true;
35464   }
35465 
35466   // Attempt to combine to SHUFPD.
35467   if (AllowFloatDomain && EltSizeInBits == 64 &&
35468       ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
35469        (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
35470        (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
35471     bool ForceV1Zero = false, ForceV2Zero = false;
35472     if (matchShuffleWithSHUFPD(MaskVT, V1, V2, ForceV1Zero, ForceV2Zero,
35473                                PermuteImm, Mask, Zeroable)) {
35474       V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
35475       V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
35476       Shuffle = X86ISD::SHUFP;
35477       ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
35478       return true;
35479     }
35480   }
35481 
35482   // Attempt to combine to SHUFPS.
35483   if (AllowFloatDomain && EltSizeInBits == 32 &&
35484       ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
35485        (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
35486        (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
35487     SmallVector<int, 4> RepeatedMask;
35488     if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
35489       // Match each half of the repeated mask to determine if it's just
35490       // referencing one of the vectors, is zeroable, or is entirely undef.
35491       auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
35492         int M0 = RepeatedMask[Offset];
35493         int M1 = RepeatedMask[Offset + 1];
35494 
35495         if (isUndefInRange(RepeatedMask, Offset, 2)) {
35496           return DAG.getUNDEF(MaskVT);
35497         } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
35498           S0 = (SM_SentinelUndef == M0 ? -1 : 0);
35499           S1 = (SM_SentinelUndef == M1 ? -1 : 1);
35500           return getZeroVector(MaskVT, Subtarget, DAG, DL);
35501         } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
35502           S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
35503           S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
35504           return V1;
35505         } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
35506           S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
35507           S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
35508           return V2;
35509         }
35510 
35511         return SDValue();
35512       };
35513 
35514       int ShufMask[4] = {-1, -1, -1, -1};
35515       SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
35516       SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
35517 
35518       if (Lo && Hi) {
35519         V1 = Lo;
35520         V2 = Hi;
35521         Shuffle = X86ISD::SHUFP;
35522         ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
35523         PermuteImm = getV4X86ShuffleImm(ShufMask);
35524         return true;
35525       }
35526     }
35527   }
35528 
35529   // Attempt to combine to INSERTPS more generally if X86ISD::SHUFP failed.
35530   if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
35531       MaskVT.is128BitVector() &&
35532       matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
35533     Shuffle = X86ISD::INSERTPS;
35534     ShuffleVT = MVT::v4f32;
35535     return true;
35536   }
35537 
35538   return false;
35539 }
35540 
35541 static SDValue combineX86ShuffleChainWithExtract(
35542     ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
35543     bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG,
35544     const X86Subtarget &Subtarget);
35545 
35546 /// Combine an arbitrary chain of shuffles into a single instruction if
35547 /// possible.
35548 ///
35549 /// This is the leaf of the recursive combine below. When we have found some
35550 /// chain of single-use x86 shuffle instructions and accumulated the combined
35551 /// shuffle mask represented by them, this will try to pattern match that mask
35552 /// into either a single instruction if there is a special purpose instruction
35553 /// for this operation, or into a PSHUFB instruction which is a fully general
35554 /// instruction but should only be used to replace chains over a certain depth.
35555 static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
35556                                       ArrayRef<int> BaseMask, int Depth,
35557                                       bool HasVariableMask,
35558                                       bool AllowVariableMask, SelectionDAG &DAG,
35559                                       const X86Subtarget &Subtarget) {
35560   assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
35561   assert((Inputs.size() == 1 || Inputs.size() == 2) &&
35562          "Unexpected number of shuffle inputs!");
35563 
35564   MVT RootVT = Root.getSimpleValueType();
35565   unsigned RootSizeInBits = RootVT.getSizeInBits();
35566   unsigned NumRootElts = RootVT.getVectorNumElements();
35567 
35568   // Canonicalize shuffle input op to the requested type.
35569   // TODO: Support cases where Op is smaller than VT.
35570   auto CanonicalizeShuffleInput = [&](MVT VT, SDValue Op) {
35571     return DAG.getBitcast(VT, Op);
35572   };
35573 
35574   // Find the inputs that enter the chain. Note that multiple uses are OK
35575   // here; we're not going to remove the operands we find.
35576   bool UnaryShuffle = (Inputs.size() == 1);
35577   SDValue V1 = peekThroughBitcasts(Inputs[0]);
35578   SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
35579                              : peekThroughBitcasts(Inputs[1]));
35580 
35581   MVT VT1 = V1.getSimpleValueType();
35582   MVT VT2 = V2.getSimpleValueType();
35583   assert(VT1.getSizeInBits() == RootSizeInBits &&
35584          VT2.getSizeInBits() == RootSizeInBits && "Vector size mismatch");
35585 
35586   SDLoc DL(Root);
35587   SDValue Res;
35588 
35589   unsigned NumBaseMaskElts = BaseMask.size();
35590   if (NumBaseMaskElts == 1) {
35591     assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
35592     return CanonicalizeShuffleInput(RootVT, V1);
35593   }
35594 
35595   bool OptForSize = DAG.shouldOptForSize();
35596   unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
35597   bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
35598                      (RootVT.isFloatingPoint() && Depth >= 1) ||
35599                      (RootVT.is256BitVector() && !Subtarget.hasAVX2());
35600 
35601   // Don't combine if we are an AVX512/EVEX target and the mask element size
35602   // is different from the root element size - this would prevent writemasks
35603   // from being reused.
35604   bool IsMaskedShuffle = false;
35605   if (RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128)) {
35606     if (Root.hasOneUse() && Root->use_begin()->getOpcode() == ISD::VSELECT &&
35607         Root->use_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
35608       IsMaskedShuffle = true;
35609     }
35610   }
35611 
35612   // If we are shuffling a broadcast (and not introducing zeros) then
35613   // we can just use the broadcast directly. This works for smaller broadcast
35614   // elements too, as they already repeat across each mask element.
35615   if (UnaryShuffle && isTargetShuffleSplat(V1) && !isAnyZero(BaseMask) &&
35616       (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
35617       V1.getValueSizeInBits() >= RootSizeInBits) {
35618     return CanonicalizeShuffleInput(RootVT, V1);
35619   }
35620 
35621   // See if the shuffle is a hidden identity shuffle - repeated args in HOPs
35622   // etc. can be simplified.
35623   if (VT1 == VT2 && VT1.getSizeInBits() == RootSizeInBits) {
35624     SmallVector<int> ScaledMask, IdentityMask;
35625     unsigned NumElts = VT1.getVectorNumElements();
35626     if (BaseMask.size() <= NumElts &&
35627         scaleShuffleElements(BaseMask, NumElts, ScaledMask)) {
35628       for (unsigned i = 0; i != NumElts; ++i)
35629         IdentityMask.push_back(i);
35630       if (isTargetShuffleEquivalent(RootVT, ScaledMask, IdentityMask, V1, V2))
35631         return CanonicalizeShuffleInput(RootVT, V1);
35632     }
35633   }
35634 
35635   // Handle 128/256-bit lane shuffles of 512-bit vectors.
35636   if (RootVT.is512BitVector() &&
35637       (NumBaseMaskElts == 2 || NumBaseMaskElts == 4)) {
35638     // If the upper subvectors are zeroable, then an extract+insert is cheaper
35639     // than using X86ISD::SHUF128. The insertion is free, even if it has
35640     // to zero the upper subvectors.
35641     if (isUndefOrZeroInRange(BaseMask, 1, NumBaseMaskElts - 1)) {
35642       if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
35643         return SDValue(); // Nothing to do!
35644       assert(isInRange(BaseMask[0], 0, NumBaseMaskElts) &&
35645              "Unexpected lane shuffle");
35646       Res = CanonicalizeShuffleInput(RootVT, V1);
35647       unsigned SubIdx = BaseMask[0] * (NumRootElts / NumBaseMaskElts);
35648       bool UseZero = isAnyZero(BaseMask);
35649       Res = extractSubVector(Res, SubIdx, DAG, DL, BaseMaskEltSizeInBits);
35650       return widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits);
35651     }
35652 
35653     // Narrow shuffle mask to v4x128.
35654     SmallVector<int, 4> Mask;
35655     assert((BaseMaskEltSizeInBits % 128) == 0 && "Illegal mask size");
35656     narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, BaseMask, Mask);
35657 
35658     // Try to lower to vshuf64x2/vshuf32x4.
35659     auto MatchSHUF128 = [&](MVT ShuffleVT, const SDLoc &DL, ArrayRef<int> Mask,
35660                             SDValue V1, SDValue V2, SelectionDAG &DAG) {
35661       unsigned PermMask = 0;
35662       // Ensure elements came from the same Op.
35663       SDValue Ops[2] = {DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT)};
35664       for (int i = 0; i < 4; ++i) {
35665         assert(Mask[i] >= -1 && "Illegal shuffle sentinel value");
35666         if (Mask[i] < 0)
35667           continue;
35668 
35669         SDValue Op = Mask[i] >= 4 ? V2 : V1;
35670         unsigned OpIndex = i / 2;
35671         if (Ops[OpIndex].isUndef())
35672           Ops[OpIndex] = Op;
35673         else if (Ops[OpIndex] != Op)
35674           return SDValue();
35675 
35676         // Convert the 128-bit shuffle mask selection values into 128-bit
35677         // selection bits defined by a vshuf64x2 instruction's immediate control
35678         // byte.
35679         PermMask |= (Mask[i] % 4) << (i * 2);
35680       }
35681 
35682       return DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
35683                          CanonicalizeShuffleInput(ShuffleVT, Ops[0]),
35684                          CanonicalizeShuffleInput(ShuffleVT, Ops[1]),
35685                          DAG.getTargetConstant(PermMask, DL, MVT::i8));
35686     };
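          // e.g. a narrowed lane mask of {0, 1, 6, 7} selects Ops[0] = V1 and
          // Ops[1] = V2 and encodes PermMask = 0 | (1 << 2) | (2 << 4) |
          // (3 << 6) = 0xE4.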
35687 
35688     // FIXME: Is there a better way to do this? is256BitLaneRepeatedShuffleMask
35689     // doesn't work because our mask is for 128 bits and we don't have an MVT
35690     // to match that.
35691     bool PreferPERMQ =
35692         UnaryShuffle && isUndefOrInRange(Mask[0], 0, 2) &&
35693         isUndefOrInRange(Mask[1], 0, 2) && isUndefOrInRange(Mask[2], 2, 4) &&
35694         isUndefOrInRange(Mask[3], 2, 4) &&
35695         (Mask[0] < 0 || Mask[2] < 0 || Mask[0] == (Mask[2] % 2)) &&
35696         (Mask[1] < 0 || Mask[3] < 0 || Mask[1] == (Mask[3] % 2));
35697 
35698     if (!isAnyZero(Mask) && !PreferPERMQ) {
35699       if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
35700         return SDValue(); // Nothing to do!
35701       MVT ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
35702       if (SDValue V = MatchSHUF128(ShuffleVT, DL, Mask, V1, V2, DAG))
35703         return DAG.getBitcast(RootVT, V);
35704     }
35705   }
35706 
35707   // Handle 128-bit lane shuffles of 256-bit vectors.
35708   if (RootVT.is256BitVector() && NumBaseMaskElts == 2) {
35709     // If the upper half is zeroable, then an extract+insert is cheaper
35710     // than using X86ISD::VPERM2X128. The insertion is free, even if it has to
35711     // zero the upper half.
35712     if (isUndefOrZero(BaseMask[1])) {
35713       if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
35714         return SDValue(); // Nothing to do!
35715       assert(isInRange(BaseMask[0], 0, 2) && "Unexpected lane shuffle");
35716       Res = CanonicalizeShuffleInput(RootVT, V1);
35717       Res = extract128BitVector(Res, BaseMask[0] * (NumRootElts / 2), DAG, DL);
35718       return widenSubVector(Res, BaseMask[1] == SM_SentinelZero, Subtarget, DAG,
35719                             DL, 256);
35720     }
35721 
35722     if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128)
35723       return SDValue(); // Nothing to do!
35724 
35725     // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
35726     // we need to use the zeroing feature.
35727     // Prefer blends for sequential shuffles unless we are optimizing for size.
35728     if (UnaryShuffle &&
35729         !(Subtarget.hasAVX2() && isUndefOrInRange(BaseMask, 0, 2)) &&
35730         (OptForSize || !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0))) {
35731       unsigned PermMask = 0;
35732       PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
35733       PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
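            // Bits [1:0] and [5:4] of the immediate select the source 128-bit
            // lane for the low and high halves, and 0x8 in a field zeroes that
            // half; e.g. a BaseMask of {1, SM_SentinelZero} encodes as 0x81.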
35734       return DAG.getNode(
35735           X86ISD::VPERM2X128, DL, RootVT, CanonicalizeShuffleInput(RootVT, V1),
35736           DAG.getUNDEF(RootVT), DAG.getTargetConstant(PermMask, DL, MVT::i8));
35737     }
35738 
35739     if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
35740       return SDValue(); // Nothing to do!
35741 
35742     // TODO - handle AVX512VL cases with X86ISD::SHUF128.
35743     if (!UnaryShuffle && !IsMaskedShuffle) {
35744       assert(llvm::all_of(BaseMask, [](int M) { return 0 <= M && M < 4; }) &&
35745              "Unexpected shuffle sentinel value");
35746       // Prefer blends to X86ISD::VPERM2X128.
35747       if (!((BaseMask[0] == 0 && BaseMask[1] == 3) ||
35748             (BaseMask[0] == 2 && BaseMask[1] == 1))) {
35749         unsigned PermMask = 0;
35750         PermMask |= ((BaseMask[0] & 3) << 0);
35751         PermMask |= ((BaseMask[1] & 3) << 4);
35752         SDValue LHS = isInRange(BaseMask[0], 0, 2) ? V1 : V2;
35753         SDValue RHS = isInRange(BaseMask[1], 0, 2) ? V1 : V2;
35754         return DAG.getNode(X86ISD::VPERM2X128, DL, RootVT,
35755                           CanonicalizeShuffleInput(RootVT, LHS),
35756                           CanonicalizeShuffleInput(RootVT, RHS),
35757                           DAG.getTargetConstant(PermMask, DL, MVT::i8));
35758       }
35759     }
35760   }
35761 
35762   // For masks that have been widened to 128-bit elements or more,
35763   // narrow back down to 64-bit elements.
35764   SmallVector<int, 64> Mask;
35765   if (BaseMaskEltSizeInBits > 64) {
35766     assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
35767     int MaskScale = BaseMaskEltSizeInBits / 64;
35768     narrowShuffleMaskElts(MaskScale, BaseMask, Mask);
35769   } else {
35770     Mask.assign(BaseMask.begin(), BaseMask.end());
35771   }
35772 
35773   // For masked shuffles, we're trying to match the root width for better
35774   // writemask folding, attempt to scale the mask.
35775   // TODO - variable shuffles might need this to be widened again.
35776   if (IsMaskedShuffle && NumRootElts > Mask.size()) {
35777     assert((NumRootElts % Mask.size()) == 0 && "Illegal mask size");
35778     int MaskScale = NumRootElts / Mask.size();
35779     SmallVector<int, 64> ScaledMask;
35780     narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
35781     Mask = std::move(ScaledMask);
35782   }
35783 
35784   unsigned NumMaskElts = Mask.size();
35785   unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
35786 
35787   // Determine the effective mask value type.
35788   FloatDomain &= (32 <= MaskEltSizeInBits);
35789   MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
35790                            : MVT::getIntegerVT(MaskEltSizeInBits);
35791   MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
35792 
35793   // Only allow legal mask types.
35794   if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
35795     return SDValue();
35796 
35797   // Attempt to match the mask against known shuffle patterns.
35798   MVT ShuffleSrcVT, ShuffleVT;
35799   unsigned Shuffle, PermuteImm;
35800 
35801   // Which shuffle domains are permitted?
35802   // Permit domain crossing at higher combine depths.
35803   // TODO: Should we indicate which domain is preferred if both are allowed?
35804   bool AllowFloatDomain = FloatDomain || (Depth >= 3);
35805   bool AllowIntDomain = (!FloatDomain || (Depth >= 3)) && Subtarget.hasSSE2() &&
35806                         (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
35807 
35808   // Determine zeroable mask elements.
35809   APInt KnownUndef, KnownZero;
35810   resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
35811   APInt Zeroable = KnownUndef | KnownZero;
35812 
35813   if (UnaryShuffle) {
35814     // Attempt to match against broadcast-from-vector.
35815     // Limit AVX1 to cases where we're loading+broadcasting a scalar element.
35816     if ((Subtarget.hasAVX2() ||
35817          (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) &&
35818         (!IsMaskedShuffle || NumRootElts == NumMaskElts)) {
35819       if (isUndefOrEqual(Mask, 0)) {
35820         if (V1.getValueType() == MaskVT &&
35821             V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
35822             MayFoldLoad(V1.getOperand(0))) {
35823           if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
35824             return SDValue(); // Nothing to do!
35825           Res = V1.getOperand(0);
35826           Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
35827           return DAG.getBitcast(RootVT, Res);
35828         }
35829         if (Subtarget.hasAVX2()) {
35830           if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
35831             return SDValue(); // Nothing to do!
35832           Res = CanonicalizeShuffleInput(MaskVT, V1);
35833           Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
35834           return DAG.getBitcast(RootVT, Res);
35835         }
35836       }
35837     }
35838 
35839     // See if this is a blend with zero - in which case check if the zeroed
35840     // elements are already zero.
35841     if (isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0)) {
35842       assert(!KnownZero.isNullValue() && "Shuffle has no zero elements");
35843       SDValue NewV1 = CanonicalizeShuffleInput(MaskVT, V1);
35844       if (DAG.MaskedElementsAreZero(NewV1, KnownZero))
35845         return DAG.getBitcast(RootVT, NewV1);
35846     }
35847 
35848     SDValue NewV1 = V1; // Save operand in case early exit happens.
35849     if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
35850                           DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
35851                           ShuffleVT) &&
35852         (!IsMaskedShuffle ||
35853          (NumRootElts == ShuffleVT.getVectorNumElements()))) {
35854       if (Depth == 0 && Root.getOpcode() == Shuffle)
35855         return SDValue(); // Nothing to do!
35856       Res = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1);
35857       Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
35858       return DAG.getBitcast(RootVT, Res);
35859     }
35860 
35861     if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
35862                                  AllowIntDomain, Subtarget, Shuffle, ShuffleVT,
35863                                  PermuteImm) &&
35864         (!IsMaskedShuffle ||
35865          (NumRootElts == ShuffleVT.getVectorNumElements()))) {
35866       if (Depth == 0 && Root.getOpcode() == Shuffle)
35867         return SDValue(); // Nothing to do!
35868       Res = CanonicalizeShuffleInput(ShuffleVT, V1);
35869       Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
35870                         DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
35871       return DAG.getBitcast(RootVT, Res);
35872     }
35873   }
35874 
35875   // Attempt to combine to INSERTPS, but only if the inserted element has come
35876   // from a scalar.
35877   // TODO: Handle other insertions here as well?
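  // INSERTPS imm8 layout: bits[7:6] select the source element, bits[5:4] the
  // destination slot, and bits[3:0] zero-mask destination elements.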
35878   if (!UnaryShuffle && AllowFloatDomain && RootSizeInBits == 128 &&
35879       Subtarget.hasSSE41() &&
35880       !isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3})) {
35881     if (MaskEltSizeInBits == 32) {
35882       SDValue SrcV1 = V1, SrcV2 = V2;
35883       if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask,
35884                                  DAG) &&
35885           SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) {
35886         if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
35887           return SDValue(); // Nothing to do!
35888         Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
35889                           CanonicalizeShuffleInput(MVT::v4f32, SrcV1),
35890                           CanonicalizeShuffleInput(MVT::v4f32, SrcV2),
35891                           DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
35892         return DAG.getBitcast(RootVT, Res);
35893       }
35894     }
35895     if (MaskEltSizeInBits == 64 &&
35896         isTargetShuffleEquivalent(MaskVT, Mask, {0, 2}) &&
35897         V2.getOpcode() == ISD::SCALAR_TO_VECTOR &&
35898         V2.getScalarValueSizeInBits() <= 32) {
35899       if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
35900         return SDValue(); // Nothing to do!
35901       PermuteImm = (/*DstIdx*/2 << 4) | (/*SrcIdx*/0 << 0);
35902       Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
35903                         CanonicalizeShuffleInput(MVT::v4f32, V1),
35904                         CanonicalizeShuffleInput(MVT::v4f32, V2),
35905                         DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
35906       return DAG.getBitcast(RootVT, Res);
35907     }
35908   }
35909 
35910   SDValue NewV1 = V1; // Save operands in case early exit happens.
35911   SDValue NewV2 = V2;
35912   if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
35913                          NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
35914                          ShuffleVT, UnaryShuffle) &&
35915       (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
35916     if (Depth == 0 && Root.getOpcode() == Shuffle)
35917       return SDValue(); // Nothing to do!
35918     NewV1 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1);
35919     NewV2 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV2);
35920     Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
35921     return DAG.getBitcast(RootVT, Res);
35922   }
35923 
35924   NewV1 = V1; // Save operands in case early exit happens.
35925   NewV2 = V2;
35926   if (matchBinaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
35927                                 AllowIntDomain, NewV1, NewV2, DL, DAG,
35928                                 Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
35929       (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
35930     if (Depth == 0 && Root.getOpcode() == Shuffle)
35931       return SDValue(); // Nothing to do!
35932     NewV1 = CanonicalizeShuffleInput(ShuffleVT, NewV1);
35933     NewV2 = CanonicalizeShuffleInput(ShuffleVT, NewV2);
35934     Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
35935                       DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
35936     return DAG.getBitcast(RootVT, Res);
35937   }
35938 
35939   // Typically from here on, we need an integer version of MaskVT.
35940   MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
35941   IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
35942 
35943   // Annoyingly, SSE4A instructions don't map into the above match helpers.
35944   if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
35945     uint64_t BitLen, BitIdx;
35946     if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
35947                             Zeroable)) {
35948       if (Depth == 0 && Root.getOpcode() == X86ISD::EXTRQI)
35949         return SDValue(); // Nothing to do!
35950       V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
35951       Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
35952                         DAG.getTargetConstant(BitLen, DL, MVT::i8),
35953                         DAG.getTargetConstant(BitIdx, DL, MVT::i8));
35954       return DAG.getBitcast(RootVT, Res);
35955     }
35956 
35957     if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
35958       if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTQI)
35959         return SDValue(); // Nothing to do!
35960       V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
35961       V2 = CanonicalizeShuffleInput(IntMaskVT, V2);
35962       Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
35963                         DAG.getTargetConstant(BitLen, DL, MVT::i8),
35964                         DAG.getTargetConstant(BitIdx, DL, MVT::i8));
35965       return DAG.getBitcast(RootVT, Res);
35966     }
35967   }
35968 
35969   // Match shuffle against TRUNCATE patterns.
35970   if (AllowIntDomain && MaskEltSizeInBits < 64 && Subtarget.hasAVX512()) {
35971     // Match against a VTRUNC instruction, accounting for src/dst sizes.
35972     if (matchShuffleAsVTRUNC(ShuffleSrcVT, ShuffleVT, IntMaskVT, Mask, Zeroable,
35973                              Subtarget)) {
35974       bool IsTRUNCATE = ShuffleVT.getVectorNumElements() ==
35975                         ShuffleSrcVT.getVectorNumElements();
35976       unsigned Opc =
35977           IsTRUNCATE ? (unsigned)ISD::TRUNCATE : (unsigned)X86ISD::VTRUNC;
35978       if (Depth == 0 && Root.getOpcode() == Opc)
35979         return SDValue(); // Nothing to do!
35980       V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
35981       Res = DAG.getNode(Opc, DL, ShuffleVT, V1);
35982       if (ShuffleVT.getSizeInBits() < RootSizeInBits)
35983         Res = widenSubVector(Res, true, Subtarget, DAG, DL, RootSizeInBits);
35984       return DAG.getBitcast(RootVT, Res);
35985     }
35986 
35987     // Do we need a more general binary truncation pattern?
35988     if (RootSizeInBits < 512 &&
35989         ((RootVT.is256BitVector() && Subtarget.useAVX512Regs()) ||
35990          (RootVT.is128BitVector() && Subtarget.hasVLX())) &&
35991         (MaskEltSizeInBits > 8 || Subtarget.hasBWI()) &&
35992         isSequentialOrUndefInRange(Mask, 0, NumMaskElts, 0, 2)) {
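      // E.g. a v16i8 mask {0,2,4,...,30} over two 128-bit inputs is the even
      // bytes of concat(V1,V2) viewed as v16i16, i.e. truncate(concat(V1,V2)).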
35993       if (Depth == 0 && Root.getOpcode() == ISD::TRUNCATE)
35994         return SDValue(); // Nothing to do!
35995       ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
35996       ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2);
35997       V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
35998       V2 = CanonicalizeShuffleInput(ShuffleSrcVT, V2);
35999       ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
36000       ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts);
36001       Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShuffleSrcVT, V1, V2);
36002       Res = DAG.getNode(ISD::TRUNCATE, DL, IntMaskVT, Res);
36003       return DAG.getBitcast(RootVT, Res);
36004     }
36005   }
36006 
36007   // Don't try to re-form single instruction chains under any circumstances now
36008   // that we've done encoding canonicalization for them.
36009   if (Depth < 1)
36010     return SDValue();
36011 
36012   // Depth threshold above which we can efficiently use variable mask shuffles.
36013   int VariableShuffleDepth = Subtarget.hasFastVariableShuffle() ? 1 : 2;
36014   AllowVariableMask &= (Depth >= VariableShuffleDepth) || HasVariableMask;
36015   // VPERMI2W/VPERMI2B are 3 uops on Skylake and Icelake so we require a
36016   // higher depth before combining them.
36017   bool AllowBWIVPERMV3 = (Depth >= 2 || HasVariableMask);
36018 
36019   bool MaskContainsZeros = isAnyZero(Mask);
36020 
36021   if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
36022     // If we have a single input lane-crossing shuffle then lower to VPERMV.
36023     if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros) {
36024       if (Subtarget.hasAVX2() &&
36025           (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) {
36026         SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
36027         Res = CanonicalizeShuffleInput(MaskVT, V1);
36028         Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
36029         return DAG.getBitcast(RootVT, Res);
36030       }
36031       // AVX512 variants (non-VLX will pad to 512-bit shuffles).
36032       if ((Subtarget.hasAVX512() &&
36033            (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
36034             MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
36035           (Subtarget.hasBWI() &&
36036            (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
36037           (Subtarget.hasVBMI() &&
36038            (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8))) {
36039         V1 = CanonicalizeShuffleInput(MaskVT, V1);
36040         V2 = DAG.getUNDEF(MaskVT);
36041         Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
36042         return DAG.getBitcast(RootVT, Res);
36043       }
36044     }
36045 
36046     // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
36047     // vector as the second source (non-VLX will pad to 512-bit shuffles).
36048     if (UnaryShuffle && AllowVariableMask &&
36049         ((Subtarget.hasAVX512() &&
36050           (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
36051            MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
36052            MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32 ||
36053            MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
36054          (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
36055           (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
36056          (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
36057           (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
36058       // Adjust shuffle mask - replace SM_SentinelZero with second source index.
36059       for (unsigned i = 0; i != NumMaskElts; ++i)
36060         if (Mask[i] == SM_SentinelZero)
36061           Mask[i] = NumMaskElts + i;
36062       V1 = CanonicalizeShuffleInput(MaskVT, V1);
36063       V2 = getZeroVector(MaskVT, Subtarget, DAG, DL);
36064       Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
36065       return DAG.getBitcast(RootVT, Res);
36066     }
36067 
36068     // If that failed and either input is extracted then try to combine as a
36069     // shuffle with the larger type.
36070     if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
36071             Inputs, Root, BaseMask, Depth, HasVariableMask, AllowVariableMask,
36072             DAG, Subtarget))
36073       return WideShuffle;
36074 
36075     // If we have a dual input lane-crossing shuffle then lower to VPERMV3
36076     // (non-VLX will pad to 512-bit shuffles).
36077     if (AllowVariableMask && !MaskContainsZeros &&
36078         ((Subtarget.hasAVX512() &&
36079           (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
36080            MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
36081            MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32 ||
36082            MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
36083          (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
36084           (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
36085          (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
36086           (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
36087       V1 = CanonicalizeShuffleInput(MaskVT, V1);
36088       V2 = CanonicalizeShuffleInput(MaskVT, V2);
36089       Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
36090       return DAG.getBitcast(RootVT, Res);
36091     }
36092     return SDValue();
36093   }
36094 
36095   // See if we can combine a single input shuffle with zeros to a bit-mask,
36096   // which is much simpler than any shuffle.
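  // E.g. a v4i32 mask {0, Zero, 2, Zero} becomes an AND with the constant
  // vector {-1, 0, -1, 0}.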
36097   if (UnaryShuffle && MaskContainsZeros && AllowVariableMask &&
36098       isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
36099       DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
36100     APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
36101     APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits);
36102     APInt UndefElts(NumMaskElts, 0);
36103     SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
36104     for (unsigned i = 0; i != NumMaskElts; ++i) {
36105       int M = Mask[i];
36106       if (M == SM_SentinelUndef) {
36107         UndefElts.setBit(i);
36108         continue;
36109       }
36110       if (M == SM_SentinelZero)
36111         continue;
36112       EltBits[i] = AllOnes;
36113     }
36114     SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
36115     Res = CanonicalizeShuffleInput(MaskVT, V1);
36116     unsigned AndOpcode =
36117         MaskVT.isFloatingPoint() ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
36118     Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
36119     return DAG.getBitcast(RootVT, Res);
36120   }
36121 
36122   // If we have a single input shuffle with different shuffle patterns in the
36123   // 128-bit lanes, use a variable-mask VPERMILPS.
36124   // TODO Combine other mask types at higher depths.
36125   if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
36126       ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
36127        (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
36128     SmallVector<SDValue, 16> VPermIdx;
36129     for (int M : Mask) {
36130       SDValue Idx =
36131           M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
36132       VPermIdx.push_back(Idx);
36133     }
36134     SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
36135     Res = CanonicalizeShuffleInput(MaskVT, V1);
36136     Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
36137     return DAG.getBitcast(RootVT, Res);
36138   }
36139 
36140   // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
36141   // to VPERMIL2PD/VPERMIL2PS.
36142   if (AllowVariableMask && Subtarget.hasXOP() &&
36143       (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
36144        MaskVT == MVT::v8f32)) {
36145     // VPERMIL2 Operation.
36146     // Bits[3] - Match Bit.
36147     // Bits[2:1] - (Per Lane) PD Shuffle Mask.
36148     // Bits[2:0] - (Per Lane) PS Shuffle Mask.
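    // E.g. for v4f32, mask element M = 5 (element 1 of V2) encodes as index
    // 0b101: bits[1:0] = element 1 within the lane, bit[2] = second source.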
36149     unsigned NumLanes = MaskVT.getSizeInBits() / 128;
36150     unsigned NumEltsPerLane = NumMaskElts / NumLanes;
36151     SmallVector<int, 8> VPerm2Idx;
36152     unsigned M2ZImm = 0;
36153     for (int M : Mask) {
36154       if (M == SM_SentinelUndef) {
36155         VPerm2Idx.push_back(-1);
36156         continue;
36157       }
36158       if (M == SM_SentinelZero) {
36159         M2ZImm = 2;
36160         VPerm2Idx.push_back(8);
36161         continue;
36162       }
36163       int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
36164       Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
36165       VPerm2Idx.push_back(Index);
36166     }
36167     V1 = CanonicalizeShuffleInput(MaskVT, V1);
36168     V2 = CanonicalizeShuffleInput(MaskVT, V2);
36169     SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
36170     Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
36171                       DAG.getTargetConstant(M2ZImm, DL, MVT::i8));
36172     return DAG.getBitcast(RootVT, Res);
36173   }
36174 
36175   // If we have 3 or more shuffle instructions or a chain involving a variable
36176   // mask, we can replace them with a single PSHUFB instruction profitably.
36177   // Intel's manuals suggest only using PSHUFB if doing so replaces 5
36178   // instructions, but in practice PSHUFB tends to be *very* fast so we're
36179   // more aggressive.
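  // Each PSHUFB mask byte either zeroes the result byte (MSB set, hence 0x80
  // below) or selects a source byte within the same 128-bit lane. A mask
  // element of width Ratio bytes expands to Ratio consecutive byte indices.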
36180   if (UnaryShuffle && AllowVariableMask &&
36181       ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
36182        (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
36183        (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
36184     SmallVector<SDValue, 16> PSHUFBMask;
36185     int NumBytes = RootVT.getSizeInBits() / 8;
36186     int Ratio = NumBytes / NumMaskElts;
36187     for (int i = 0; i < NumBytes; ++i) {
36188       int M = Mask[i / Ratio];
36189       if (M == SM_SentinelUndef) {
36190         PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
36191         continue;
36192       }
36193       if (M == SM_SentinelZero) {
36194         PSHUFBMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
36195         continue;
36196       }
36197       M = Ratio * M + i % Ratio;
36198       assert((M / 16) == (i / 16) && "Lane crossing detected");
36199       PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
36200     }
36201     MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
36202     Res = CanonicalizeShuffleInput(ByteVT, V1);
36203     SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
36204     Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
36205     return DAG.getBitcast(RootVT, Res);
36206   }
36207 
36208   // With XOP, if we have a 128-bit binary input shuffle we can always combine
36209   // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
36210   // slower than PSHUFB on targets that support both.
36211   if (AllowVariableMask && RootVT.is128BitVector() && Subtarget.hasXOP()) {
36212     // VPPERM Mask Operation
36213     // Bits[4:0] - Byte Index (0 - 31)
36214     // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
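    // E.g. mask byte 0x14 selects byte 4 of the second source, while 0x80
    // (permute op 4) zero fills that result byte.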
36215     SmallVector<SDValue, 16> VPPERMMask;
36216     int NumBytes = 16;
36217     int Ratio = NumBytes / NumMaskElts;
36218     for (int i = 0; i < NumBytes; ++i) {
36219       int M = Mask[i / Ratio];
36220       if (M == SM_SentinelUndef) {
36221         VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
36222         continue;
36223       }
36224       if (M == SM_SentinelZero) {
36225         VPPERMMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
36226         continue;
36227       }
36228       M = Ratio * M + i % Ratio;
36229       VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
36230     }
36231     MVT ByteVT = MVT::v16i8;
36232     V1 = CanonicalizeShuffleInput(ByteVT, V1);
36233     V2 = CanonicalizeShuffleInput(ByteVT, V2);
36234     SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
36235     Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
36236     return DAG.getBitcast(RootVT, Res);
36237   }
36238 
36239   // If that failed and either input is extracted then try to combine as a
36240   // shuffle with the larger type.
36241   if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
36242           Inputs, Root, BaseMask, Depth, HasVariableMask, AllowVariableMask,
36243           DAG, Subtarget))
36244     return WideShuffle;
36245 
36246   // If we have a dual input shuffle then lower to VPERMV3
36247   // (non-VLX will pad to 512-bit shuffles).
36248   if (!UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
36249       ((Subtarget.hasAVX512() &&
36250         (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v8f64 ||
36251          MaskVT == MVT::v2i64 || MaskVT == MVT::v4i64 || MaskVT == MVT::v8i64 ||
36252          MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 || MaskVT == MVT::v8f32 ||
36253          MaskVT == MVT::v8i32 || MaskVT == MVT::v16f32 ||
36254          MaskVT == MVT::v16i32)) ||
36255        (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
36256         (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
36257        (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
36258         (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
36259     V1 = CanonicalizeShuffleInput(MaskVT, V1);
36260     V2 = CanonicalizeShuffleInput(MaskVT, V2);
36261     Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
36262     return DAG.getBitcast(RootVT, Res);
36263   }
36264 
36265   // Failed to find any combines.
36266   return SDValue();
36267 }
36268 
36269 // Combine an arbitrary chain of shuffles + extract_subvectors into a single
36270 // instruction if possible.
36271 //
36272 // Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger
36273 // type size to attempt to combine:
36274 // shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)
36275 // -->
36276 // extract_subvector(shuffle(x,y,m2),0)
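//
// E.g. with x and y as v8f32 and both subvectors taken from element offset 4,
// a v4f32 mask {0,1,4,5} widens to the v8f32 mask {4,5,12,13,u,u,u,u} over
// {x,y}, and the original result is the low 128 bits of that wider shuffle.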
36277 static SDValue combineX86ShuffleChainWithExtract(
36278     ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
36279     bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG,
36280     const X86Subtarget &Subtarget) {
36281   unsigned NumMaskElts = BaseMask.size();
36282   unsigned NumInputs = Inputs.size();
36283   if (NumInputs == 0)
36284     return SDValue();
36285 
36286   EVT RootVT = Root.getValueType();
36287   unsigned RootSizeInBits = RootVT.getSizeInBits();
36288   assert((RootSizeInBits % NumMaskElts) == 0 && "Unexpected root shuffle mask");
36289 
36290   SmallVector<SDValue, 4> WideInputs(Inputs.begin(), Inputs.end());
36291   SmallVector<unsigned, 4> Offsets(NumInputs, 0);
36292 
36293   // Peek through subvectors.
36294   // TODO: Support inter-mixed EXTRACT_SUBVECTORs + BITCASTs?
36295   unsigned WideSizeInBits = RootSizeInBits;
36296   for (unsigned i = 0; i != NumInputs; ++i) {
36297     SDValue &Src = WideInputs[i];
36298     unsigned &Offset = Offsets[i];
36299     Src = peekThroughBitcasts(Src);
36300     EVT BaseVT = Src.getValueType();
36301     while (Src.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
36302       Offset += Src.getConstantOperandVal(1);
36303       Src = Src.getOperand(0);
36304     }
36305     WideSizeInBits = std::max(WideSizeInBits,
36306                               (unsigned)Src.getValueSizeInBits());
36307     assert((Offset % BaseVT.getVectorNumElements()) == 0 &&
36308            "Unexpected subvector extraction");
36309     Offset /= BaseVT.getVectorNumElements();
36310     Offset *= NumMaskElts;
36311   }
36312 
36313   // Bail if we're always extracting from the lowest subvectors;
36314   // combineX86ShuffleChain should match this for the current width.
36315   if (llvm::all_of(Offsets, [](unsigned Offset) { return Offset == 0; }))
36316     return SDValue();
36317 
36318   unsigned Scale = WideSizeInBits / RootSizeInBits;
36319   assert((WideSizeInBits % RootSizeInBits) == 0 &&
36320          "Unexpected subvector extraction");
36321 
36322   // If the src vector types aren't the same, see if we can extend
36323   // them to match each other.
36324   // TODO: Support different scalar types?
36325   EVT WideSVT = WideInputs[0].getValueType().getScalarType();
36326   if (llvm::any_of(WideInputs, [&WideSVT, &DAG](SDValue Op) {
36327         return !DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()) ||
36328                Op.getValueType().getScalarType() != WideSVT;
36329       }))
36330     return SDValue();
36331 
36332   for (SDValue &NewInput : WideInputs) {
36333     assert((WideSizeInBits % NewInput.getValueSizeInBits()) == 0 &&
36334            "Shuffle vector size mismatch");
36335     if (WideSizeInBits > NewInput.getValueSizeInBits())
36336       NewInput = widenSubVector(NewInput, false, Subtarget, DAG,
36337                                 SDLoc(NewInput), WideSizeInBits);
36338     assert(WideSizeInBits == NewInput.getValueSizeInBits() &&
36339            "Unexpected subvector extraction");
36340   }
36341 
36342   // Create new mask for larger type.
36343   for (unsigned i = 1; i != NumInputs; ++i)
36344     Offsets[i] += i * Scale * NumMaskElts;
36345 
36346   SmallVector<int, 64> WideMask(BaseMask.begin(), BaseMask.end());
36347   for (int &M : WideMask) {
36348     if (M < 0)
36349       continue;
36350     M = (M % NumMaskElts) + Offsets[M / NumMaskElts];
36351   }
36352   WideMask.append((Scale - 1) * NumMaskElts, SM_SentinelUndef);
36353 
36354   // Remove unused/repeated shuffle source ops.
36355   resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
36356   assert(!WideInputs.empty() && "Shuffle with no inputs detected");
36357 
36358   if (WideInputs.size() > 2)
36359     return SDValue();
36360 
36361   // Increase depth for every upper subvector we've peeked through.
36362   Depth += count_if(Offsets, [](unsigned Offset) { return Offset > 0; });
36363 
36364   // Attempt to combine wider chain.
36365   // TODO: Can we use a better Root?
36366   SDValue WideRoot = WideInputs[0];
36367   if (SDValue WideShuffle = combineX86ShuffleChain(
36368           WideInputs, WideRoot, WideMask, Depth, HasVariableMask,
36369           AllowVariableMask, DAG, Subtarget)) {
36370     WideShuffle =
36371         extractSubVector(WideShuffle, 0, DAG, SDLoc(Root), RootSizeInBits);
36372     return DAG.getBitcast(RootVT, WideShuffle);
36373   }
36374   return SDValue();
36375 }
36376 
36377 // Canonicalize the combined shuffle mask chain with horizontal ops.
36378 // NOTE: This may update the Ops and Mask.
36379 static SDValue canonicalizeShuffleMaskWithHorizOp(
36380     MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
36381     unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
36382     const X86Subtarget &Subtarget) {
36383   if (Mask.empty() || Ops.empty())
36384     return SDValue();
36385 
36386   SmallVector<SDValue> BC;
36387   for (SDValue Op : Ops)
36388     BC.push_back(peekThroughBitcasts(Op));
36389 
36390   // All ops must be the same horizop + type.
36391   SDValue BC0 = BC[0];
36392   EVT VT0 = BC0.getValueType();
36393   unsigned Opcode0 = BC0.getOpcode();
36394   if (VT0.getSizeInBits() != RootSizeInBits || llvm::any_of(BC, [&](SDValue V) {
36395         return V.getOpcode() != Opcode0 || V.getValueType() != VT0;
36396       }))
36397     return SDValue();
36398 
36399   bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
36400                   Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB);
36401   bool isPack = (Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS);
36402   if (!isHoriz && !isPack)
36403     return SDValue();
36404 
36405   // Do all ops have a single use?
36406   bool OneUseOps = llvm::all_of(Ops, [](SDValue Op) {
36407     return Op.hasOneUse() &&
36408            peekThroughBitcasts(Op) == peekThroughOneUseBitcasts(Op);
36409   });
36410 
36411   int NumElts = VT0.getVectorNumElements();
36412   int NumLanes = VT0.getSizeInBits() / 128;
36413   int NumEltsPerLane = NumElts / NumLanes;
36414   int NumHalfEltsPerLane = NumEltsPerLane / 2;
36415   MVT SrcVT = BC0.getOperand(0).getSimpleValueType();
36416   unsigned EltSizeInBits = RootSizeInBits / Mask.size();
36417 
36418   if (NumEltsPerLane >= 4 &&
36419       (isPack || shouldUseHorizontalOp(Ops.size() == 1, DAG, Subtarget))) {
36420     SmallVector<int> LaneMask, ScaledMask;
36421     if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, LaneMask) &&
36422         scaleShuffleElements(LaneMask, 4, ScaledMask)) {
36423       // See if we can remove the shuffle by re-sorting the HOP chain so that
36424       // the HOP args are pre-shuffled.
36425       // TODO: Generalize to any sized/depth chain.
36426       // TODO: Add support for PACKSS/PACKUS.
36427       if (isHoriz) {
36428         // Attempt to find a HOP(HOP(X,Y),HOP(Z,W)) source operand.
36429         auto GetHOpSrc = [&](int M) {
36430           if (M == SM_SentinelUndef)
36431             return DAG.getUNDEF(VT0);
36432           if (M == SM_SentinelZero)
36433             return getZeroVector(VT0.getSimpleVT(), Subtarget, DAG, DL);
36434           SDValue Src0 = BC[M / 4];
36435           SDValue Src1 = Src0.getOperand((M % 4) >= 2);
36436           if (Src1.getOpcode() == Opcode0 && Src0->isOnlyUserOf(Src1.getNode()))
36437             return Src1.getOperand(M % 2);
36438           return SDValue();
36439         };
36440         SDValue M0 = GetHOpSrc(ScaledMask[0]);
36441         SDValue M1 = GetHOpSrc(ScaledMask[1]);
36442         SDValue M2 = GetHOpSrc(ScaledMask[2]);
36443         SDValue M3 = GetHOpSrc(ScaledMask[3]);
36444         if (M0 && M1 && M2 && M3) {
36445           SDValue LHS = DAG.getNode(Opcode0, DL, SrcVT, M0, M1);
36446           SDValue RHS = DAG.getNode(Opcode0, DL, SrcVT, M2, M3);
36447           return DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
36448         }
36449       }
36450       // shuffle(hop(x,y),hop(z,w)) -> permute(hop(x,z)) etc.
36451       if (Ops.size() >= 2) {
36452         SDValue LHS, RHS;
36453         auto GetHOpSrc = [&](int M, int &OutM) {
36454           // TODO: Support SM_SentinelZero
36455           if (M < 0)
36456             return M == SM_SentinelUndef;
36457           SDValue Src = BC[M / 4].getOperand((M % 4) >= 2);
36458           if (!LHS || LHS == Src) {
36459             LHS = Src;
36460             OutM = (M % 2);
36461             return true;
36462           }
36463           if (!RHS || RHS == Src) {
36464             RHS = Src;
36465             OutM = (M % 2) + 2;
36466             return true;
36467           }
36468           return false;
36469         };
36470         int PostMask[4] = {-1, -1, -1, -1};
36471         if (GetHOpSrc(ScaledMask[0], PostMask[0]) &&
36472             GetHOpSrc(ScaledMask[1], PostMask[1]) &&
36473             GetHOpSrc(ScaledMask[2], PostMask[2]) &&
36474             GetHOpSrc(ScaledMask[3], PostMask[3])) {
36475           LHS = DAG.getBitcast(SrcVT, LHS);
36476           RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
36477           SDValue Res = DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
36478           // Use SHUFPS for the permute so this will work on SSE3 targets,
36479           // shuffle combining and domain handling will simplify this later on.
36480           MVT ShuffleVT = MVT::getVectorVT(MVT::f32, RootSizeInBits / 32);
36481           Res = DAG.getBitcast(ShuffleVT, Res);
36482           return DAG.getNode(X86ISD::SHUFP, DL, ShuffleVT, Res, Res,
36483                              getV4X86ShuffleImm8ForMask(PostMask, DL, DAG));
36484         }
36485       }
36486     }
36487   }
36488 
36489   if (2 < Ops.size())
36490     return SDValue();
36491 
36492   SDValue BC1 = BC[BC.size() - 1];
36493   if (Mask.size() == VT0.getVectorNumElements()) {
36494     // Canonicalize binary shuffles of horizontal ops that use the
36495     // same sources to a unary shuffle.
36496     // TODO: Try to perform this fold even if the shuffle remains.
36497     if (Ops.size() == 2) {
36498       auto ContainsOps = [](SDValue HOp, SDValue Op) {
36499         return Op == HOp.getOperand(0) || Op == HOp.getOperand(1);
36500       };
36501       // Commute if all BC0's ops are contained in BC1.
36502       if (ContainsOps(BC1, BC0.getOperand(0)) &&
36503           ContainsOps(BC1, BC0.getOperand(1))) {
36504         ShuffleVectorSDNode::commuteMask(Mask);
36505         std::swap(Ops[0], Ops[1]);
36506         std::swap(BC0, BC1);
36507       }
36508 
36509       // If BC1 can be represented by BC0, then convert to unary shuffle.
36510       if (ContainsOps(BC0, BC1.getOperand(0)) &&
36511           ContainsOps(BC0, BC1.getOperand(1))) {
36512         for (int &M : Mask) {
36513           if (M < NumElts) // BC0 element or UNDEF/Zero sentinel.
36514             continue;
36515           int SubLane = ((M % NumEltsPerLane) >= NumHalfEltsPerLane) ? 1 : 0;
36516           M -= NumElts + (SubLane * NumHalfEltsPerLane);
36517           if (BC1.getOperand(SubLane) != BC0.getOperand(0))
36518             M += NumHalfEltsPerLane;
36519         }
36520       }
36521     }
36522 
36523     // Canonicalize unary horizontal ops to only refer to lower halves.
36524     for (int i = 0; i != NumElts; ++i) {
36525       int &M = Mask[i];
36526       if (isUndefOrZero(M))
36527         continue;
36528       if (M < NumElts && BC0.getOperand(0) == BC0.getOperand(1) &&
36529           (M % NumEltsPerLane) >= NumHalfEltsPerLane)
36530         M -= NumHalfEltsPerLane;
36531       if (NumElts <= M && BC1.getOperand(0) == BC1.getOperand(1) &&
36532           (M % NumEltsPerLane) >= NumHalfEltsPerLane)
36533         M -= NumHalfEltsPerLane;
36534     }
36535   }
36536 
36537   // Combine binary shuffle of 2 similar 'Horizontal' instructions into a
36538   // single instruction. Attempt to match a v2X64 repeating shuffle pattern that
36539   // represents the LHS/RHS inputs for the lower/upper halves.
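  // E.g. for v4f32, selecting the low half of HADD(a,b) and the high half of
  // HADD(c,d) is equivalent to a single HADD(a,d).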
36540   SmallVector<int, 16> TargetMask128, WideMask128;
36541   if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, TargetMask128) &&
36542       scaleShuffleElements(TargetMask128, 2, WideMask128)) {
36543     assert(isUndefOrZeroOrInRange(WideMask128, 0, 4) && "Illegal shuffle");
36544     bool SingleOp = (Ops.size() == 1);
36545     if (isPack || OneUseOps ||
36546         shouldUseHorizontalOp(SingleOp, DAG, Subtarget)) {
36547       SDValue Lo = isInRange(WideMask128[0], 0, 2) ? BC0 : BC1;
36548       SDValue Hi = isInRange(WideMask128[1], 0, 2) ? BC0 : BC1;
36549       Lo = Lo.getOperand(WideMask128[0] & 1);
36550       Hi = Hi.getOperand(WideMask128[1] & 1);
36551       if (SingleOp) {
36552         SDValue Undef = DAG.getUNDEF(SrcVT);
36553         SDValue Zero = getZeroVector(SrcVT, Subtarget, DAG, DL);
36554         Lo = (WideMask128[0] == SM_SentinelZero ? Zero : Lo);
36555         Hi = (WideMask128[1] == SM_SentinelZero ? Zero : Hi);
36556         Lo = (WideMask128[0] == SM_SentinelUndef ? Undef : Lo);
36557         Hi = (WideMask128[1] == SM_SentinelUndef ? Undef : Hi);
36558       }
36559       return DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
36560     }
36561   }
36562 
36563   return SDValue();
36564 }
36565 
36566 // Attempt to constant fold all of the constant source ops.
36567 // Returns true if the entire shuffle is folded to a constant.
36568 // TODO: Extend this to merge multiple constant Ops and update the mask.
36569 static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,
36570                                            ArrayRef<int> Mask, SDValue Root,
36571                                            bool HasVariableMask,
36572                                            SelectionDAG &DAG,
36573                                            const X86Subtarget &Subtarget) {
36574   MVT VT = Root.getSimpleValueType();
36575 
36576   unsigned SizeInBits = VT.getSizeInBits();
36577   unsigned NumMaskElts = Mask.size();
36578   unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
36579   unsigned NumOps = Ops.size();
36580 
36581   // Extract constant bits from each source op.
36582   bool OneUseConstantOp = false;
36583   SmallVector<APInt, 16> UndefEltsOps(NumOps);
36584   SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
36585   for (unsigned i = 0; i != NumOps; ++i) {
36586     SDValue SrcOp = Ops[i];
36587     OneUseConstantOp |= SrcOp.hasOneUse();
36588     if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],
36589                                        RawBitsOps[i]))
36590       return SDValue();
36591   }
36592 
36593   // Only fold if at least one of the constants is used only once or the
36594   // combined shuffle has included a variable mask shuffle; this avoids
36595   // constant pool bloat.
36596   if (!OneUseConstantOp && !HasVariableMask)
36597     return SDValue();
36598 
36599   // Shuffle the constant bits according to the mask.
36600   SDLoc DL(Root);
36601   APInt UndefElts(NumMaskElts, 0);
36602   APInt ZeroElts(NumMaskElts, 0);
36603   APInt ConstantElts(NumMaskElts, 0);
36604   SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
36605                                         APInt::getNullValue(MaskSizeInBits));
36606   for (unsigned i = 0; i != NumMaskElts; ++i) {
36607     int M = Mask[i];
36608     if (M == SM_SentinelUndef) {
36609       UndefElts.setBit(i);
36610       continue;
36611     } else if (M == SM_SentinelZero) {
36612       ZeroElts.setBit(i);
36613       continue;
36614     }
36615     assert(0 <= M && M < (int)(NumMaskElts * NumOps));
36616 
36617     unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
36618     unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
36619 
36620     auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
36621     if (SrcUndefElts[SrcMaskIdx]) {
36622       UndefElts.setBit(i);
36623       continue;
36624     }
36625 
36626     auto &SrcEltBits = RawBitsOps[SrcOpIdx];
36627     APInt &Bits = SrcEltBits[SrcMaskIdx];
36628     if (!Bits) {
36629       ZeroElts.setBit(i);
36630       continue;
36631     }
36632 
36633     ConstantElts.setBit(i);
36634     ConstantBitData[i] = Bits;
36635   }
36636   assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue());
36637 
36638   // Attempt to create a zero vector.
36639   if ((UndefElts | ZeroElts).isAllOnesValue())
36640     return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG, DL);
36641 
36642   // Create the constant data.
36643   MVT MaskSVT;
36644   if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
36645     MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
36646   else
36647     MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
36648 
36649   MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
36650   if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
36651     return SDValue();
36652 
36653   SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
36654   return DAG.getBitcast(VT, CstOp);
36655 }
36656 
36657 namespace llvm {
36658   namespace X86 {
36659     enum {
36660       MaxShuffleCombineDepth = 8
36661     };
36662   }
36663 } // namespace llvm
36664 
36665 /// Fully generic combining of x86 shuffle instructions.
36666 ///
36667 /// This should be the last combine run over the x86 shuffle instructions. Once
36668 /// they have been fully optimized, this will recursively consider all chains
36669 /// of single-use shuffle instructions, build a generic model of the cumulative
36670 /// shuffle operation, and check for simpler instructions which implement this
36671 /// operation. We use this primarily for two purposes:
36672 ///
36673 /// 1) Collapse generic shuffles to specialized single instructions when
36674 ///    equivalent. In most cases, this is just an encoding size win, but
36675 ///    sometimes we will collapse multiple generic shuffles into a single
36676 ///    special-purpose shuffle.
36677 /// 2) Look for sequences of shuffle instructions with 3 or more total
36678 ///    instructions, and replace them with the slightly more expensive SSSE3
36679 ///    PSHUFB instruction if available. We do this as the last combining step
36680 ///    to ensure we avoid using PSHUFB if we can implement the shuffle with
36681 ///    a suitable short sequence of other instructions. The PSHUFB will either
36682 ///    use a register or have to read from memory and so is slightly (but only
36683 ///    slightly) more expensive than the other shuffle instructions.
36684 ///
36685 /// Because this is inherently a quadratic operation (for each shuffle in
36686 /// a chain, we recurse up the chain), the depth is limited to 8 instructions.
36687 /// This should never be an issue in practice as the shuffle lowering doesn't
36688 /// produce sequences of more than 8 instructions.
36689 ///
36690 /// FIXME: We will currently miss some cases where the redundant shuffling
36691 /// would simplify under the threshold for PSHUFB formation because of
36692 /// combine-ordering. To fix this, we should do the redundant instruction
36693 /// combining in this recursive walk.
36694 static SDValue combineX86ShufflesRecursively(
36695     ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
36696     ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
36697     unsigned MaxDepth, bool HasVariableMask, bool AllowVariableMask,
36698     SelectionDAG &DAG, const X86Subtarget &Subtarget) {
36699   assert(RootMask.size() > 0 &&
36700          (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) &&
36701          "Illegal shuffle root mask");
36702   assert(Root.getSimpleValueType().isVector() &&
36703          "Shuffles operate on vector types!");
36704   unsigned RootSizeInBits = Root.getSimpleValueType().getSizeInBits();
36705 
36706   // Bound the depth of our recursive combine because this is ultimately
36707   // quadratic in nature.
36708   if (Depth >= MaxDepth)
36709     return SDValue();
36710 
36711   // Directly rip through bitcasts to find the underlying operand.
36712   SDValue Op = SrcOps[SrcOpIndex];
36713   Op = peekThroughOneUseBitcasts(Op);
36714 
36715   EVT VT = Op.getValueType();
36716   if (!VT.isVector() || !VT.isSimple())
36717     return SDValue(); // Bail if we hit a non-simple non-vector.
36718 
36719   assert((RootSizeInBits % VT.getSizeInBits()) == 0 &&
36720          "Can only combine shuffles up to the size of the root op.");
36721 
36722   // Extract target shuffle mask and resolve sentinels and inputs.
36723   // TODO - determine Op's demanded elts from RootMask.
36724   SmallVector<int, 64> OpMask;
36725   SmallVector<SDValue, 2> OpInputs;
36726   APInt OpUndef, OpZero;
36727   APInt OpDemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
36728   bool IsOpVariableMask = isTargetShuffleVariableMask(Op.getOpcode());
36729   if (!getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef,
36730                               OpZero, DAG, Depth, false))
36731     return SDValue();
36732 
36733   // Shuffle inputs must not be larger than the shuffle result.
36734   // TODO: Relax this for single input faux shuffles (trunc/extract_subvector).
36735   if (llvm::any_of(OpInputs, [VT](SDValue OpInput) {
36736         return OpInput.getValueSizeInBits() > VT.getSizeInBits();
36737       }))
36738     return SDValue();
36739 
36740   // If the shuffle result was smaller than the root, we need to adjust the
36741   // mask indices and pad the mask with undefs.
36742   if (RootSizeInBits > VT.getSizeInBits()) {
36743     unsigned NumSubVecs = RootSizeInBits / VT.getSizeInBits();
36744     unsigned OpMaskSize = OpMask.size();
36745     if (OpInputs.size() > 1) {
36746       unsigned PaddedMaskSize = NumSubVecs * OpMaskSize;
36747       for (int &M : OpMask) {
36748         if (M < 0)
36749           continue;
36750         int EltIdx = M % OpMaskSize;
36751         int OpIdx = M / OpMaskSize;
36752         M = (PaddedMaskSize * OpIdx) + EltIdx;
36753       }
36754     }
36755     OpZero = OpZero.zext(NumSubVecs * OpMaskSize);
36756     OpUndef = OpUndef.zext(NumSubVecs * OpMaskSize);
36757     OpMask.append((NumSubVecs - 1) * OpMaskSize, SM_SentinelUndef);
36758   }
36759 
36760   SmallVector<int, 64> Mask;
36761   SmallVector<SDValue, 16> Ops;
36762 
36763   // We don't need to merge masks if the root is empty.
36764   bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1);
36765   if (EmptyRoot) {
36766     // Only resolve zeros if it will remove an input; otherwise we might end
36767     // up in an infinite loop.
36768     bool ResolveKnownZeros = true;
36769     if (!OpZero.isNullValue()) {
36770       APInt UsedInputs = APInt::getNullValue(OpInputs.size());
36771       for (int i = 0, e = OpMask.size(); i != e; ++i) {
36772         int M = OpMask[i];
36773         if (OpUndef[i] || OpZero[i] || isUndefOrZero(M))
36774           continue;
36775         UsedInputs.setBit(M / OpMask.size());
36776         if (UsedInputs.isAllOnesValue()) {
36777           ResolveKnownZeros = false;
36778           break;
36779         }
36780       }
36781     }
36782     resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero,
36783                                       ResolveKnownZeros);
36784 
36785     Mask = OpMask;
36786     Ops.append(OpInputs.begin(), OpInputs.end());
36787   } else {
36788     resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
36789 
36790     // Add the inputs to the Ops list, avoiding duplicates.
36791     Ops.append(SrcOps.begin(), SrcOps.end());
36792 
36793     auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
36794       // Attempt to find an existing match.
36795       SDValue InputBC = peekThroughBitcasts(Input);
36796       for (int i = 0, e = Ops.size(); i < e; ++i)
36797         if (InputBC == peekThroughBitcasts(Ops[i]))
36798           return i;
36799       // Match failed - should we replace an existing Op?
36800       if (InsertionPoint >= 0) {
36801         Ops[InsertionPoint] = Input;
36802         return InsertionPoint;
36803       }
36804       // Add to the end of the Ops list.
36805       Ops.push_back(Input);
36806       return Ops.size() - 1;
36807     };
36808 
36809     SmallVector<int, 2> OpInputIdx;
36810     for (SDValue OpInput : OpInputs)
36811       OpInputIdx.push_back(
36812           AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
36813 
36814     assert(((RootMask.size() > OpMask.size() &&
36815              RootMask.size() % OpMask.size() == 0) ||
36816             (OpMask.size() > RootMask.size() &&
36817              OpMask.size() % RootMask.size() == 0) ||
36818             OpMask.size() == RootMask.size()) &&
36819            "The smaller number of elements must divide the larger.");
36820 
36821     // This function can be performance-critical, so we rely on the power-of-2
36822     // knowledge that we have about the mask sizes to replace div/rem ops with
36823     // bit-masks and shifts.
36824     assert(isPowerOf2_32(RootMask.size()) &&
36825            "Non-power-of-2 shuffle mask sizes");
36826     assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes");
36827     unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size());
36828     unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size());
36829 
36830     unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
36831     unsigned RootRatio =
36832         std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
36833     unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
36834     assert((RootRatio == 1 || OpRatio == 1) &&
36835            "Must not have a ratio for both incoming and op masks!");
36836 
36837     assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
36838     assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
36839     assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
36840     unsigned RootRatioLog2 = countTrailingZeros(RootRatio);
36841     unsigned OpRatioLog2 = countTrailingZeros(OpRatio);
36842 
36843     Mask.resize(MaskWidth, SM_SentinelUndef);
36844 
36845     // Merge this shuffle operation's mask into our accumulated mask. Note that
36846     // this shuffle's mask will be the first applied to the input, followed by
36847     // the root mask to get us all the way to the root value arrangement. The
36848     // reason for this order is that we are recursing up the operation chain.
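    // E.g. a 4-element RootMask over an 8-element OpMask gives MaskWidth = 8,
    // RootRatio = 2 and OpRatio = 1: output element i uses RootMask[i >> 1],
    // scaled by 2 and offset by (i & 1), before indexing into OpMask.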
36849     for (unsigned i = 0; i < MaskWidth; ++i) {
36850       unsigned RootIdx = i >> RootRatioLog2;
36851       if (RootMask[RootIdx] < 0) {
36852         // This is a zero or undef lane, we're done.
36853         Mask[i] = RootMask[RootIdx];
36854         continue;
36855       }
36856 
36857       unsigned RootMaskedIdx =
36858           RootRatio == 1
36859               ? RootMask[RootIdx]
36860               : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
36861 
36862       // Just insert the scaled root mask value if it references an input other
36863       // than the SrcOp we're currently inserting.
36864       if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
36865           (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
36866         Mask[i] = RootMaskedIdx;
36867         continue;
36868       }
36869 
36870       RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
36871       unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
36872       if (OpMask[OpIdx] < 0) {
36873         // The incoming lanes are zero or undef, it doesn't matter which ones we
36874         // are using.
36875         Mask[i] = OpMask[OpIdx];
36876         continue;
36877       }
36878 
36879       // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
36880       unsigned OpMaskedIdx = OpRatio == 1 ? OpMask[OpIdx]
36881                                           : (OpMask[OpIdx] << OpRatioLog2) +
36882                                                 (RootMaskedIdx & (OpRatio - 1));
36883 
36884       OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
36885       int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
36886       assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
36887       OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;
36888 
36889       Mask[i] = OpMaskedIdx;
36890     }
36891   }
36892 
36893   // Remove unused/repeated shuffle source ops.
36894   resolveTargetShuffleInputsAndMask(Ops, Mask);
36895 
36896   // Handle the all undef/zero/ones cases early.
36897   if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
36898     return DAG.getUNDEF(Root.getValueType());
36899   if (all_of(Mask, [](int Idx) { return Idx < 0; }))
36900     return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG,
36901                          SDLoc(Root));
36902   if (Ops.size() == 1 && ISD::isBuildVectorAllOnes(Ops[0].getNode()) &&
36903       none_of(Mask, [](int M) { return M == SM_SentinelZero; }))
36904     return getOnesVector(Root.getValueType(), DAG, SDLoc(Root));
36905 
36906   assert(!Ops.empty() && "Shuffle with no inputs detected");
36907   HasVariableMask |= IsOpVariableMask;
36908 
36909   // Update the list of shuffle nodes that have been combined so far.
36910   SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
36911                                                 SrcNodes.end());
36912   CombinedNodes.push_back(Op.getNode());
36913 
36914   // See if we can recurse into each shuffle source op (if it's a target
36915   // shuffle). The source op should generally only be combined if it either
36916   // has a single use (i.e. the current Op) or all its users have already been
36917   // combined; if not, we can still combine but should prevent generation of
36918   // variable shuffles to avoid constant pool bloat.
36919   // Don't recurse if we already have more source ops than we can combine in
36920   // the remaining recursion depth.
36921   if (Ops.size() < (MaxDepth - Depth)) {
36922     for (int i = 0, e = Ops.size(); i < e; ++i) {
36923       // For empty roots, we need to resolve zeroable elements before combining
36924       // them with other shuffles.
36925       SmallVector<int, 64> ResolvedMask = Mask;
36926       if (EmptyRoot)
36927         resolveTargetShuffleFromZeroables(ResolvedMask, OpUndef, OpZero);
36928       bool AllowVar = false;
36929       if (Ops[i].getNode()->hasOneUse() ||
36930           SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
36931         AllowVar = AllowVariableMask;
36932       if (SDValue Res = combineX86ShufflesRecursively(
36933               Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1, MaxDepth,
36934               HasVariableMask, AllowVar, DAG, Subtarget))
36935         return Res;
36936     }
36937   }
36938 
36939   // Attempt to constant fold all of the constant source ops.
36940   if (SDValue Cst = combineX86ShufflesConstants(
36941           Ops, Mask, Root, HasVariableMask, DAG, Subtarget))
36942     return Cst;
36943 
36944   // Canonicalize the combined shuffle mask chain with horizontal ops.
36945   // NOTE: This will update the Ops and Mask.
36946   if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
36947           Ops, Mask, RootSizeInBits, SDLoc(Root), DAG, Subtarget))
36948     return DAG.getBitcast(Root.getValueType(), HOp);
36949 
36950   // Widen any subvector shuffle inputs we've collected.
36951   if (any_of(Ops, [RootSizeInBits](SDValue Op) {
36952         return Op.getValueSizeInBits() < RootSizeInBits;
36953       })) {
36954     for (SDValue &Op : Ops)
36955       if (Op.getValueSizeInBits() < RootSizeInBits)
36956         Op = widenSubVector(Op, false, Subtarget, DAG, SDLoc(Op),
36957                             RootSizeInBits);
36958     // Reresolve - we might have repeated subvector sources.
36959     resolveTargetShuffleInputsAndMask(Ops, Mask);
36960   }
36961 
36962   // We can only combine unary and binary shuffle mask cases.
36963   if (Ops.size() <= 2) {
36964     // Minor canonicalization of the accumulated shuffle mask to make it easier
36965     // to match below. All this does is detect masks with sequential pairs of
36966     // elements, and shrink them to the half-width mask. It does this in a loop
36967     // so it will reduce the size of the mask to the minimal width mask which
36968     // performs an equivalent shuffle.
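    // For example, the v8 mask <0,1,2,3,8,9,10,11> widens to the v4 mask
    // <0,1,4,5>, and then again to the v2 mask <0,2>.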
36969     while (Mask.size() > 1) {
36970       SmallVector<int, 64> WidenedMask;
36971       if (!canWidenShuffleElements(Mask, WidenedMask))
36972         break;
36973       Mask = std::move(WidenedMask);
36974     }
36975 
36976     // Canonicalization of binary shuffle masks to improve pattern matching by
36977     // commuting the inputs.
36978     if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
36979       ShuffleVectorSDNode::commuteMask(Mask);
36980       std::swap(Ops[0], Ops[1]);
36981     }
36982 
36983     // Finally, try to combine into a single shuffle instruction.
36984     return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask,
36985                                   AllowVariableMask, DAG, Subtarget);
36986   }
36987 
36988   // If that failed and any input is extracted then try to combine as a
36989   // shuffle with the larger type.
36990   return combineX86ShuffleChainWithExtract(Ops, Root, Mask, Depth,
36991                                            HasVariableMask, AllowVariableMask,
36992                                            DAG, Subtarget);
36993 }
36994 
36995 /// Helper entry wrapper to combineX86ShufflesRecursively.
36996 static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,
36997                                              const X86Subtarget &Subtarget) {
36998   return combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 0,
36999                                        X86::MaxShuffleCombineDepth,
37000                                        /*HasVarMask*/ false,
37001                                        /*AllowVarMask*/ true, DAG, Subtarget);
37002 }
37003 
37004 /// Get the PSHUF-style mask from PSHUF node.
37005 ///
37006 /// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
37007 /// PSHUF-style masks that can be reused with such instructions.
37008 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
37009   MVT VT = N.getSimpleValueType();
37010   SmallVector<int, 4> Mask;
37011   SmallVector<SDValue, 2> Ops;
37012   bool HaveMask =
37013       getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask);
37014   (void)HaveMask;
37015   assert(HaveMask);
37016 
37017   // If we have more than 128 bits, only the low 128 bits of the shuffle mask
37018   // matter. Check that the upper masks are repeats and remove them.
37019   if (VT.getSizeInBits() > 128) {
37020     int LaneElts = 128 / VT.getScalarSizeInBits();
37021 #ifndef NDEBUG
37022     for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
37023       for (int j = 0; j < LaneElts; ++j)
37024         assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
37025                "Mask doesn't repeat in high 128-bit lanes!");
37026 #endif
37027     Mask.resize(LaneElts);
37028   }
37029 
37030   switch (N.getOpcode()) {
37031   case X86ISD::PSHUFD:
37032     return Mask;
37033   case X86ISD::PSHUFLW:
37034     Mask.resize(4);
37035     return Mask;
37036   case X86ISD::PSHUFHW:
37037     Mask.erase(Mask.begin(), Mask.begin() + 4);
37038     for (int &M : Mask)
37039       M -= 4;
37040     return Mask;
37041   default:
37042     llvm_unreachable("No valid shuffle instruction found!");
37043   }
37044 }
37045 
37046 /// Search for a combinable shuffle across a chain ending in pshufd.
37047 ///
37048 /// We walk up the chain and look for a combinable shuffle, skipping over
37049 /// shuffles that we could hoist this shuffle's transformation past without
37050 /// altering anything.
37051 static SDValue
37052 combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
37053                              SelectionDAG &DAG) {
37054   assert(N.getOpcode() == X86ISD::PSHUFD &&
37055          "Called with something other than an x86 128-bit half shuffle!");
37056   SDLoc DL(N);
37057 
37058   // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
37059   // of the shuffles in the chain so that we can form a fresh chain to replace
37060   // this one.
37061   SmallVector<SDValue, 8> Chain;
37062   SDValue V = N.getOperand(0);
37063   for (; V.hasOneUse(); V = V.getOperand(0)) {
37064     switch (V.getOpcode()) {
37065     default:
37066       return SDValue(); // Nothing combined!
37067 
37068     case ISD::BITCAST:
37069       // Skip bitcasts as we always know the type for the target specific
37070       // instructions.
37071       continue;
37072 
37073     case X86ISD::PSHUFD:
37074       // Found another dword shuffle.
37075       break;
37076 
37077     case X86ISD::PSHUFLW:
37078       // Check that the low words (being shuffled) are the identity in the
37079       // dword shuffle, and the high words are self-contained.
37080       if (Mask[0] != 0 || Mask[1] != 1 ||
37081           !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
37082         return SDValue();
37083 
37084       Chain.push_back(V);
37085       continue;
37086 
37087     case X86ISD::PSHUFHW:
37088       // Check that the high words (being shuffled) are the identity in the
37089       // dword shuffle, and the low words are self-contained.
37090       if (Mask[2] != 2 || Mask[3] != 3 ||
37091           !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
37092         return SDValue();
37093 
37094       Chain.push_back(V);
37095       continue;
37096 
37097     case X86ISD::UNPCKL:
37098     case X86ISD::UNPCKH:
37099       // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
37100       // shuffle into a preceding word shuffle.
37101       if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
37102           V.getSimpleValueType().getVectorElementType() != MVT::i16)
37103         return SDValue();
37104 
37105       // Search for a half-shuffle which we can combine with.
37106       unsigned CombineOp =
37107           V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
37108       if (V.getOperand(0) != V.getOperand(1) ||
37109           !V->isOnlyUserOf(V.getOperand(0).getNode()))
37110         return SDValue();
37111       Chain.push_back(V);
37112       V = V.getOperand(0);
37113       do {
37114         switch (V.getOpcode()) {
37115         default:
37116           return SDValue(); // Nothing to combine.
37117 
37118         case X86ISD::PSHUFLW:
37119         case X86ISD::PSHUFHW:
37120           if (V.getOpcode() == CombineOp)
37121             break;
37122 
37123           Chain.push_back(V);
37124 
37125           LLVM_FALLTHROUGH;
37126         case ISD::BITCAST:
37127           V = V.getOperand(0);
37128           continue;
37129         }
37130         break;
37131       } while (V.hasOneUse());
37132       break;
37133     }
37134     // Break out of the loop if we break out of the switch.
37135     break;
37136   }
37137 
37138   if (!V.hasOneUse())
37139     // We fell out of the loop without finding a viable combining instruction.
37140     return SDValue();
37141 
37142   // Merge this node's mask and our incoming mask.
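  // i.e. compose the shuffles: N(V(x))[i] = x[VMask[Mask[i]]], so the rebuilt
  // node applies the composed mask directly to V's input.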
37143   SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
37144   for (int &M : Mask)
37145     M = VMask[M];
37146   V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
37147                   getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
37148 
37149   // Rebuild the chain around this new shuffle.
37150   while (!Chain.empty()) {
37151     SDValue W = Chain.pop_back_val();
37152 
37153     if (V.getValueType() != W.getOperand(0).getValueType())
37154       V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
37155 
37156     switch (W.getOpcode()) {
37157     default:
37158       llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
37159 
37160     case X86ISD::UNPCKL:
37161     case X86ISD::UNPCKH:
37162       V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
37163       break;
37164 
37165     case X86ISD::PSHUFD:
37166     case X86ISD::PSHUFLW:
37167     case X86ISD::PSHUFHW:
37168       V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
37169       break;
37170     }
37171   }
37172   if (V.getValueType() != N.getValueType())
37173     V = DAG.getBitcast(N.getValueType(), V);
37174 
37175   // Return the new chain to replace N.
37176   return V;
37177 }
37178 
37179 // Attempt to commute shufps LHS loads:
37180 // permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
37181 static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
37182                                       SelectionDAG &DAG) {
37183   // TODO: Add vXf64 support.
37184   if (VT != MVT::v4f32 && VT != MVT::v8f32 && VT != MVT::v16f32)
37185     return SDValue();
37186 
37187   // SHUFP(LHS, RHS) -> SHUFP(RHS, LHS) iff LHS is foldable + RHS is not.
37188   auto commuteSHUFP = [&VT, &DL, &DAG](SDValue Parent, SDValue V) {
37189     if (V.getOpcode() != X86ISD::SHUFP || !Parent->isOnlyUserOf(V.getNode()))
37190       return SDValue();
37191     SDValue N0 = V.getOperand(0);
37192     SDValue N1 = V.getOperand(1);
37193     unsigned Imm = V.getConstantOperandVal(2);
37194     if (!MayFoldLoad(peekThroughOneUseBitcasts(N0)) ||
37195         MayFoldLoad(peekThroughOneUseBitcasts(N1)))
37196       return SDValue();
37197     Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
37198     return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
37199                        DAG.getTargetConstant(Imm, DL, MVT::i8));
37200   };
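  // Commuting SHUFP swaps which operand each half of the result reads from:
  // the low two 2-bit selectors of the immediate index into the first operand
  // and the high two into the second, so the immediate nibbles swap as well.
  // The callers below compensate for the resulting lo/hi 64-bit swap within
  // each lane by flipping bit 1 of the affected selectors (the 0xAA / 0x0A /
  // 0xA0 XORs).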
37201 
37202   switch (N.getOpcode()) {
37203   case X86ISD::VPERMILPI:
37204     if (SDValue NewSHUFP = commuteSHUFP(N, N.getOperand(0))) {
37205       unsigned Imm = N.getConstantOperandVal(1);
37206       return DAG.getNode(X86ISD::VPERMILPI, DL, VT, NewSHUFP,
37207                          DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
37208     }
37209     break;
37210   case X86ISD::SHUFP: {
37211     SDValue N0 = N.getOperand(0);
37212     SDValue N1 = N.getOperand(1);
37213     unsigned Imm = N.getConstantOperandVal(2);
37214     if (N0 == N1) {
37215       if (SDValue NewSHUFP = commuteSHUFP(N, N0))
37216         return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, NewSHUFP,
37217                            DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
37218     } else if (SDValue NewSHUFP = commuteSHUFP(N, N0)) {
37219       return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, N1,
37220                          DAG.getTargetConstant(Imm ^ 0x0A, DL, MVT::i8));
37221     } else if (SDValue NewSHUFP = commuteSHUFP(N, N1)) {
37222       return DAG.getNode(X86ISD::SHUFP, DL, VT, N0, NewSHUFP,
37223                          DAG.getTargetConstant(Imm ^ 0xA0, DL, MVT::i8));
37224     }
37225     break;
37226   }
37227   }
37228 
37229   return SDValue();
37230 }
37231 
37232 // Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
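// For example, PSHUFD(AND(X, C), imm) can become AND(PSHUFD(X, imm),
// PSHUFD(C, imm)) when C is a build-vector constant or another one-use target
// shuffle, exposing further shuffle combines on each operand.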
37233 static SDValue canonicalizeShuffleWithBinOps(SDValue N, SelectionDAG &DAG,
37234                                              const SDLoc &DL) {
37235   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
37236   EVT ShuffleVT = N.getValueType();
37237 
37238   auto IsMergeableWithShuffle = [](SDValue Op) {
37239     // AllZeros/AllOnes constants are freely shuffled and will peek through
37240     // bitcasts. Other constant build vectors do not peek through bitcasts. Only
37241     // merge with target shuffles if it has one use so shuffle combining is
37242     // likely to kick in.
37243     return ISD::isBuildVectorAllOnes(Op.getNode()) ||
37244            ISD::isBuildVectorAllZeros(Op.getNode()) ||
37245            ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) ||
37246            ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode()) ||
37247            (isTargetShuffle(Op.getOpcode()) && Op->hasOneUse());
37248   };
37249   auto IsSafeToMoveShuffle = [ShuffleVT](SDValue Op, unsigned BinOp) {
37250     // Ensure we only shuffle whole vector src elements, unless it's a logical
37251     // binop where we can more aggressively move shuffles from dst to src.
37252     return BinOp == ISD::AND || BinOp == ISD::OR || BinOp == ISD::XOR ||
37253            (Op.getScalarValueSizeInBits() <= ShuffleVT.getScalarSizeInBits());
37254   };
37255 
37256   unsigned Opc = N.getOpcode();
37257   switch (Opc) {
37258   // Unary and Unary+Permute Shuffles.
37259   case X86ISD::PSHUFB: {
37260     // Don't merge PSHUFB if it contains zero'd elements.
37261     SmallVector<int> Mask;
37262     SmallVector<SDValue> Ops;
37263     if (!getTargetShuffleMask(N.getNode(), ShuffleVT.getSimpleVT(), false, Ops,
37264                               Mask))
37265       break;
37266     LLVM_FALLTHROUGH;
37267   }
37268   case X86ISD::VBROADCAST:
37269   case X86ISD::MOVDDUP:
37270   case X86ISD::PSHUFD:
37271   case X86ISD::VPERMI:
37272   case X86ISD::VPERMILPI: {
37273     if (N.getOperand(0).getValueType() == ShuffleVT &&
37274         N->isOnlyUserOf(N.getOperand(0).getNode())) {
37275       SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
37276       unsigned SrcOpcode = N0.getOpcode();
37277       if (TLI.isBinOp(SrcOpcode) && IsSafeToMoveShuffle(N0, SrcOpcode)) {
37278         SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
37279         SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
37280         if (IsMergeableWithShuffle(Op00) || IsMergeableWithShuffle(Op01)) {
37281           SDValue LHS, RHS;
37282           Op00 = DAG.getBitcast(ShuffleVT, Op00);
37283           Op01 = DAG.getBitcast(ShuffleVT, Op01);
37284           if (N.getNumOperands() == 2) {
37285             LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, N.getOperand(1));
37286             RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, N.getOperand(1));
37287           } else {
37288             LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00);
37289             RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01);
37290           }
37291           EVT OpVT = N0.getValueType();
37292           return DAG.getBitcast(ShuffleVT,
37293                                 DAG.getNode(SrcOpcode, DL, OpVT,
37294                                             DAG.getBitcast(OpVT, LHS),
37295                                             DAG.getBitcast(OpVT, RHS)));
37296         }
37297       }
37298     }
37299     break;
37300   }
37301   // Binary and Binary+Permute Shuffles.
37302   case X86ISD::INSERTPS: {
37303     // Don't merge INSERTPS if it contains zero'd elements.
37304     unsigned InsertPSMask = N.getConstantOperandVal(2);
37305     unsigned ZeroMask = InsertPSMask & 0xF;
37306     if (ZeroMask != 0)
37307       break;
37308     LLVM_FALLTHROUGH;
37309   }
37310   case X86ISD::MOVSD:
37311   case X86ISD::MOVSS:
37312   case X86ISD::BLENDI:
37313   case X86ISD::SHUFP:
37314   case X86ISD::UNPCKH:
37315   case X86ISD::UNPCKL: {
37316     if (N->isOnlyUserOf(N.getOperand(0).getNode()) &&
37317         N->isOnlyUserOf(N.getOperand(1).getNode())) {
37318       SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
37319       SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1));
37320       unsigned SrcOpcode = N0.getOpcode();
37321       if (TLI.isBinOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
37322           IsSafeToMoveShuffle(N0, SrcOpcode) &&
37323           IsSafeToMoveShuffle(N1, SrcOpcode)) {
37324         SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
37325         SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0));
37326         SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
37327         SDValue Op11 = peekThroughOneUseBitcasts(N1.getOperand(1));
37328         // Ensure the total number of shuffles doesn't increase by folding this
37329         // shuffle through to the source ops.
37330         if (((IsMergeableWithShuffle(Op00) && IsMergeableWithShuffle(Op10)) ||
37331              (IsMergeableWithShuffle(Op01) && IsMergeableWithShuffle(Op11))) ||
37332             ((IsMergeableWithShuffle(Op00) || IsMergeableWithShuffle(Op10)) &&
37333              (IsMergeableWithShuffle(Op01) || IsMergeableWithShuffle(Op11)))) {
37334           SDValue LHS, RHS;
37335           Op00 = DAG.getBitcast(ShuffleVT, Op00);
37336           Op10 = DAG.getBitcast(ShuffleVT, Op10);
37337           Op01 = DAG.getBitcast(ShuffleVT, Op01);
37338           Op11 = DAG.getBitcast(ShuffleVT, Op11);
37339           if (N.getNumOperands() == 3) {
37340             LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
37341             RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11, N.getOperand(2));
37342           } else {
37343             LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
37344             RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11);
37345           }
37346           EVT OpVT = N0.getValueType();
37347           return DAG.getBitcast(ShuffleVT,
37348                                 DAG.getNode(SrcOpcode, DL, OpVT,
37349                                             DAG.getBitcast(OpVT, LHS),
37350                                             DAG.getBitcast(OpVT, RHS)));
37351         }
37352       }
37353     }
37354     break;
37355   }
37356   }
37357   return SDValue();
37358 }
37359 
37360 /// Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
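/// For example, vperm2f128(movddup(X), movddup(Y), imm) can be rewritten as
/// movddup(vperm2f128(X, Y, imm)), since MOVDDUP operates within 128-bit
/// lanes and therefore commutes with the lane permute.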
37361 static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
37362                                                       SelectionDAG &DAG,
37363                                                       const SDLoc &DL) {
37364   assert(V.getOpcode() == X86ISD::VPERM2X128 && "Unknown lane shuffle");
37365 
37366   MVT VT = V.getSimpleValueType();
37367   SDValue Src0 = peekThroughBitcasts(V.getOperand(0));
37368   SDValue Src1 = peekThroughBitcasts(V.getOperand(1));
37369   unsigned SrcOpc0 = Src0.getOpcode();
37370   unsigned SrcOpc1 = Src1.getOpcode();
37371   EVT SrcVT0 = Src0.getValueType();
37372   EVT SrcVT1 = Src1.getValueType();
37373 
37374   if (!Src1.isUndef() && (SrcVT0 != SrcVT1 || SrcOpc0 != SrcOpc1))
37375     return SDValue();
37376 
37377   switch (SrcOpc0) {
37378   case X86ISD::MOVDDUP: {
37379     SDValue LHS = Src0.getOperand(0);
37380     SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
37381     SDValue Res =
37382         DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS, V.getOperand(2));
37383     Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res);
37384     return DAG.getBitcast(VT, Res);
37385   }
37386   case X86ISD::VPERMILPI:
37387     // TODO: Handle v4f64 permutes with different low/high lane masks.
37388     if (SrcVT0 == MVT::v4f64) {
37389       uint64_t Mask = Src0.getConstantOperandVal(1);
37390       if ((Mask & 0x3) != ((Mask >> 2) & 0x3))
37391         break;
37392     }
37393     LLVM_FALLTHROUGH;
37394   case X86ISD::VSHLI:
37395   case X86ISD::VSRLI:
37396   case X86ISD::VSRAI:
37397   case X86ISD::PSHUFD:
37398     if (Src1.isUndef() || Src0.getOperand(1) == Src1.getOperand(1)) {
37399       SDValue LHS = Src0.getOperand(0);
37400       SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
37401       SDValue Res = DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS,
37402                                 V.getOperand(2));
37403       Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res, Src0.getOperand(1));
37404       return DAG.getBitcast(VT, Res);
37405     }
37406     break;
37407   }
37408 
37409   return SDValue();
37410 }
37411 
37412 /// Try to combine x86 target specific shuffles.
37413 static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
37414                                     TargetLowering::DAGCombinerInfo &DCI,
37415                                     const X86Subtarget &Subtarget) {
37416   SDLoc DL(N);
37417   MVT VT = N.getSimpleValueType();
37418   SmallVector<int, 4> Mask;
37419   unsigned Opcode = N.getOpcode();
37420 
37421   if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
37422     return R;
37423 
37424   if (SDValue R = canonicalizeShuffleWithBinOps(N, DAG, DL))
37425     return R;
37426 
37427   // Handle specific target shuffles.
37428   switch (Opcode) {
37429   case X86ISD::MOVDDUP: {
37430     SDValue Src = N.getOperand(0);
37431     // Turn a 128-bit MOVDDUP of a full vector load into movddup+vzload.
37432     if (VT == MVT::v2f64 && Src.hasOneUse() &&
37433         ISD::isNormalLoad(Src.getNode())) {
37434       LoadSDNode *LN = cast<LoadSDNode>(Src);
37435       if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::f64, MVT::v2f64, DAG)) {
37436         SDValue Movddup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, VZLoad);
37437         DCI.CombineTo(N.getNode(), Movddup);
37438         DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
37439         DCI.recursivelyDeleteUnusedNodes(LN);
37440         return N; // Return N so it doesn't get rechecked!
37441       }
37442     }
37443 
37444     return SDValue();
37445   }
37446   case X86ISD::VBROADCAST: {
37447     SDValue Src = N.getOperand(0);
37448     SDValue BC = peekThroughBitcasts(Src);
37449     EVT SrcVT = Src.getValueType();
37450     EVT BCVT = BC.getValueType();
37451 
37452     // If broadcasting from another shuffle, attempt to simplify it.
37453     // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
37454     if (isTargetShuffle(BC.getOpcode()) &&
37455         VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
37456       unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
37457       SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
37458                                         SM_SentinelUndef);
37459       for (unsigned i = 0; i != Scale; ++i)
37460         DemandedMask[i] = i;
37461       if (SDValue Res = combineX86ShufflesRecursively(
37462               {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 0,
37463               X86::MaxShuffleCombineDepth,
37464               /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
37465         return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
37466                            DAG.getBitcast(SrcVT, Res));
37467     }
37468 
37469     // broadcast(bitcast(src)) -> bitcast(broadcast(src))
37470     // 32-bit targets have to bitcast i64 to f64, so better to bitcast upward.
37471     if (Src.getOpcode() == ISD::BITCAST &&
37472         SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits() &&
37473         DAG.getTargetLoweringInfo().isTypeLegal(BCVT)) {
37474       EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(),
37475                                    VT.getVectorNumElements());
37476       return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
37477     }
37478 
37479     // Reduce broadcast source vector to lowest 128-bits.
37480     if (SrcVT.getSizeInBits() > 128)
37481       return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
37482                          extract128BitVector(Src, 0, DAG, DL));
37483 
37484     // broadcast(scalar_to_vector(x)) -> broadcast(x).
37485     if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR)
37486       return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
37487 
37488     // Share broadcast with the longest vector and extract low subvector (free).
37489     // Ensure the same SDValue from the SDNode use is being used.
37490     for (SDNode *User : Src->uses())
37491       if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&
37492           Src == User->getOperand(0) &&
37493           User->getValueSizeInBits(0).getFixedSize() >
37494               VT.getFixedSizeInBits()) {
37495         return extractSubVector(SDValue(User, 0), 0, DAG, DL,
37496                                 VT.getSizeInBits());
37497       }
37498 
37499     // vbroadcast(scalarload X) -> vbroadcast_load X
37500     // For float loads, extract other uses of the scalar from the broadcast.
37501     if (!SrcVT.isVector() && (Src.hasOneUse() || VT.isFloatingPoint()) &&
37502         ISD::isNormalLoad(Src.getNode())) {
37503       LoadSDNode *LN = cast<LoadSDNode>(Src);
37504       SDVTList Tys = DAG.getVTList(VT, MVT::Other);
37505       SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
37506       SDValue BcastLd =
37507           DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
37508                                   LN->getMemoryVT(), LN->getMemOperand());
37509       // If the load value is used only by N, replace it via CombineTo N.
37510       bool NoReplaceExtract = Src.hasOneUse();
37511       DCI.CombineTo(N.getNode(), BcastLd);
37512       if (NoReplaceExtract) {
37513         DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
37514         DCI.recursivelyDeleteUnusedNodes(LN);
37515       } else {
37516         SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcVT, BcastLd,
37517                                   DAG.getIntPtrConstant(0, DL));
37518         DCI.CombineTo(LN, Scl, BcastLd.getValue(1));
37519       }
37520       return N; // Return N so it doesn't get rechecked!
37521     }
37522 
37523     // Due to isTypeDesirableForOp, we won't always shrink a load truncated to
37524     // i16. So shrink it ourselves if we can make a broadcast_load.
37525     if (SrcVT == MVT::i16 && Src.getOpcode() == ISD::TRUNCATE &&
37526         Src.hasOneUse() && Src.getOperand(0).hasOneUse()) {
37527       assert(Subtarget.hasAVX2() && "Expected AVX2");
37528       SDValue TruncIn = Src.getOperand(0);
37529 
37530       // If this is a truncate of a non-extending load, we can just narrow it
37531       // to use a broadcast_load.
37532       if (ISD::isNormalLoad(TruncIn.getNode())) {
37533         LoadSDNode *LN = cast<LoadSDNode>(TruncIn);
37534         // Unless it's volatile or atomic.
37535         if (LN->isSimple()) {
37536           SDVTList Tys = DAG.getVTList(VT, MVT::Other);
37537           SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
37538           SDValue BcastLd = DAG.getMemIntrinsicNode(
37539               X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
37540               LN->getPointerInfo(), LN->getOriginalAlign(),
37541               LN->getMemOperand()->getFlags());
37542           DCI.CombineTo(N.getNode(), BcastLd);
37543           DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
37544           DCI.recursivelyDeleteUnusedNodes(Src.getNode());
37545           return N; // Return N so it doesn't get rechecked!
37546         }
37547       }
37548 
37549       // If this is a truncate of an i16 extload, we can directly replace it.
37550       if (ISD::isUNINDEXEDLoad(Src.getOperand(0).getNode()) &&
37551           ISD::isEXTLoad(Src.getOperand(0).getNode())) {
37552         LoadSDNode *LN = cast<LoadSDNode>(Src.getOperand(0));
37553         if (LN->getMemoryVT().getSizeInBits() == 16) {
37554           SDVTList Tys = DAG.getVTList(VT, MVT::Other);
37555           SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
37556           SDValue BcastLd =
37557               DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
37558                                       LN->getMemoryVT(), LN->getMemOperand());
37559           DCI.CombineTo(N.getNode(), BcastLd);
37560           DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
37561           DCI.recursivelyDeleteUnusedNodes(Src.getNode());
37562           return N; // Return N so it doesn't get rechecked!
37563         }
37564       }
37565 
37566       // If this is a truncate of a load that has been shifted right, we can
37567       // offset the pointer and use a narrower load.
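      // For example, broadcasting trunc(srl(i64 load, 16)) to i16 can instead
      // broadcast-load the 16 bits at byte offset 2 from the original pointer.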
37568       if (TruncIn.getOpcode() == ISD::SRL &&
37569           TruncIn.getOperand(0).hasOneUse() &&
37570           isa<ConstantSDNode>(TruncIn.getOperand(1)) &&
37571           ISD::isNormalLoad(TruncIn.getOperand(0).getNode())) {
37572         LoadSDNode *LN = cast<LoadSDNode>(TruncIn.getOperand(0));
37573         unsigned ShiftAmt = TruncIn.getConstantOperandVal(1);
37574         // Make sure the shift amount and the load size are divisible by 16.
37575         // Don't do this if the load is volatile or atomic.
37576         if (ShiftAmt % 16 == 0 && TruncIn.getValueSizeInBits() % 16 == 0 &&
37577             LN->isSimple()) {
37578           unsigned Offset = ShiftAmt / 8;
37579           SDVTList Tys = DAG.getVTList(VT, MVT::Other);
37580           SDValue Ptr = DAG.getMemBasePlusOffset(LN->getBasePtr(),
37581                                                  TypeSize::Fixed(Offset), DL);
37582           SDValue Ops[] = { LN->getChain(), Ptr };
37583           SDValue BcastLd = DAG.getMemIntrinsicNode(
37584               X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
37585               LN->getPointerInfo().getWithOffset(Offset),
37586               LN->getOriginalAlign(),
37587               LN->getMemOperand()->getFlags());
37588           DCI.CombineTo(N.getNode(), BcastLd);
37589           DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
37590           DCI.recursivelyDeleteUnusedNodes(Src.getNode());
37591           return N; // Return N so it doesn't get rechecked!
37592         }
37593       }
37594     }
37595 
37596     // vbroadcast(vzload X) -> vbroadcast_load X
37597     if (Src.getOpcode() == X86ISD::VZEXT_LOAD && Src.hasOneUse()) {
37598       MemSDNode *LN = cast<MemIntrinsicSDNode>(Src);
37599       if (LN->getMemoryVT().getSizeInBits() == VT.getScalarSizeInBits()) {
37600         SDVTList Tys = DAG.getVTList(VT, MVT::Other);
37601         SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
37602         SDValue BcastLd =
37603             DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
37604                                     LN->getMemoryVT(), LN->getMemOperand());
37605         DCI.CombineTo(N.getNode(), BcastLd);
37606         DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
37607         DCI.recursivelyDeleteUnusedNodes(LN);
37608         return N; // Return N so it doesn't get rechecked!
37609       }
37610     }
37611 
37612     // vbroadcast(vector load X) -> vbroadcast_load
37613     if ((SrcVT == MVT::v2f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v2i64 ||
37614          SrcVT == MVT::v4i32) &&
37615         Src.hasOneUse() && ISD::isNormalLoad(Src.getNode())) {
37616       LoadSDNode *LN = cast<LoadSDNode>(Src);
37617       // Unless the load is volatile or atomic.
37618       if (LN->isSimple()) {
37619         SDVTList Tys = DAG.getVTList(VT, MVT::Other);
37620         SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
37621         SDValue BcastLd = DAG.getMemIntrinsicNode(
37622             X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SrcVT.getScalarType(),
37623             LN->getPointerInfo(), LN->getOriginalAlign(),
37624             LN->getMemOperand()->getFlags());
37625         DCI.CombineTo(N.getNode(), BcastLd);
37626         DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
37627         DCI.recursivelyDeleteUnusedNodes(LN);
37628         return N; // Return N so it doesn't get rechecked!
37629       }
37630     }
37631 
37632     return SDValue();
37633   }
37634   case X86ISD::VZEXT_MOVL: {
37635     SDValue N0 = N.getOperand(0);
37636 
37637     // If this is a vzmovl of a full vector load, replace it with a vzload,
37638     // unless the load is volatile.
37639     if (N0.hasOneUse() && ISD::isNormalLoad(N0.getNode())) {
37640       auto *LN = cast<LoadSDNode>(N0);
37641       if (SDValue VZLoad =
37642               narrowLoadToVZLoad(LN, VT.getVectorElementType(), VT, DAG)) {
37643         DCI.CombineTo(N.getNode(), VZLoad);
37644         DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
37645         DCI.recursivelyDeleteUnusedNodes(LN);
37646         return N;
37647       }
37648     }
37649 
37650     // If this is a VZEXT_MOVL of a VBROADCAST_LOAD, we don't need the
37651     // broadcast and can just use a VZEXT_LOAD.
37652     // FIXME: Is there some way to do this with SimplifyDemandedVectorElts?
37653     if (N0.hasOneUse() && N0.getOpcode() == X86ISD::VBROADCAST_LOAD) {
37654       auto *LN = cast<MemSDNode>(N0);
37655       if (VT.getScalarSizeInBits() == LN->getMemoryVT().getSizeInBits()) {
37656         SDVTList Tys = DAG.getVTList(VT, MVT::Other);
37657         SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
37658         SDValue VZLoad =
37659             DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops,
37660                                     LN->getMemoryVT(), LN->getMemOperand());
37661         DCI.CombineTo(N.getNode(), VZLoad);
37662         DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
37663         DCI.recursivelyDeleteUnusedNodes(LN);
37664         return N;
37665       }
37666     }
37667 
37668     // Turn (v2i64 (vzext_movl (scalar_to_vector (i64 X)))) into
37669     // (v2i64 (bitcast (v4i32 (vzext_movl (scalar_to_vector (i32 (trunc X)))))))
37670     // if the upper bits of the i64 are zero.
37671     if (N0.hasOneUse() && N0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
37672         N0.getOperand(0).hasOneUse() &&
37673         N0.getOperand(0).getValueType() == MVT::i64) {
37674       SDValue In = N0.getOperand(0);
37675       APInt Mask = APInt::getHighBitsSet(64, 32);
37676       if (DAG.MaskedValueIsZero(In, Mask)) {
37677         SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, In);
37678         MVT VecVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
37679         SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Trunc);
37680         SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, SclVec);
37681         return DAG.getBitcast(VT, Movl);
37682       }
37683     }
37684 
37685     // Load a scalar integer constant directly to XMM instead of transferring an
37686     // immediate value from GPR.
37687     // vzext_movl (scalar_to_vector C) --> load [C,0...]
37688     if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR) {
37689       if (auto *C = dyn_cast<ConstantSDNode>(N0.getOperand(0))) {
37690         // Create a vector constant - scalar constant followed by zeros.
37691         EVT ScalarVT = N0.getOperand(0).getValueType();
37692         Type *ScalarTy = ScalarVT.getTypeForEVT(*DAG.getContext());
37693         unsigned NumElts = VT.getVectorNumElements();
37694         Constant *Zero = ConstantInt::getNullValue(ScalarTy);
37695         SmallVector<Constant *, 32> ConstantVec(NumElts, Zero);
37696         ConstantVec[0] = const_cast<ConstantInt *>(C->getConstantIntValue());
37697 
37698         // Load the vector constant from constant pool.
37699         MVT PVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
37700         SDValue CP = DAG.getConstantPool(ConstantVector::get(ConstantVec), PVT);
37701         MachinePointerInfo MPI =
37702             MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
37703         Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
37704         return DAG.getLoad(VT, DL, DAG.getEntryNode(), CP, MPI, Alignment,
37705                            MachineMemOperand::MOLoad);
37706       }
37707     }
37708 
37709     // Pull subvector inserts into undef through VZEXT_MOVL by making it an
37710     // insert into a zero vector. This helps get VZEXT_MOVL closer to
37711     // scalar_to_vectors where 256/512 are canonicalized to an insert and a
37712     // 128-bit scalar_to_vector. This reduces the number of isel patterns.
37713     if (!DCI.isBeforeLegalizeOps() && N0.hasOneUse()) {
37714       SDValue V = peekThroughOneUseBitcasts(N0);
37715 
37716       if (V.getOpcode() == ISD::INSERT_SUBVECTOR && V.getOperand(0).isUndef() &&
37717           isNullConstant(V.getOperand(2))) {
37718         SDValue In = V.getOperand(1);
37719         MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
37720                                      In.getValueSizeInBits() /
37721                                          VT.getScalarSizeInBits());
37722         In = DAG.getBitcast(SubVT, In);
37723         SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, SubVT, In);
37724         return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
37725                            getZeroVector(VT, Subtarget, DAG, DL), Movl,
37726                            V.getOperand(2));
37727       }
37728     }
37729 
37730     return SDValue();
37731   }
37732   case X86ISD::BLENDI: {
37733     SDValue N0 = N.getOperand(0);
37734     SDValue N1 = N.getOperand(1);
37735 
37736     // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
37737     // TODO: Handle MVT::v16i16 repeated blend mask.
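    // For example, a v4i64 blend with mask 0b0101 becomes a v8i32 blend with
    // mask 0b00110011 (each mask bit repeated Scale times).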
37738     if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
37739         N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
37740       MVT SrcVT = N0.getOperand(0).getSimpleValueType();
37741       if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0 &&
37742           SrcVT.getScalarSizeInBits() >= 32) {
37743         unsigned BlendMask = N.getConstantOperandVal(2);
37744         unsigned Size = VT.getVectorNumElements();
37745         unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
37746         BlendMask = scaleVectorShuffleBlendMask(BlendMask, Size, Scale);
37747         return DAG.getBitcast(
37748             VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
37749                             N1.getOperand(0),
37750                             DAG.getTargetConstant(BlendMask, DL, MVT::i8)));
37751       }
37752     }
37753     return SDValue();
37754   }
37755   case X86ISD::VPERMI: {
37756     // vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
37757     // TODO: Remove when we have preferred domains in combineX86ShuffleChain.
37758     SDValue N0 = N.getOperand(0);
37759     SDValue N1 = N.getOperand(1);
37760     unsigned EltSizeInBits = VT.getScalarSizeInBits();
37761     if (N0.getOpcode() == ISD::BITCAST &&
37762         N0.getOperand(0).getScalarValueSizeInBits() == EltSizeInBits) {
37763       SDValue Src = N0.getOperand(0);
37764       EVT SrcVT = Src.getValueType();
37765       SDValue Res = DAG.getNode(X86ISD::VPERMI, DL, SrcVT, Src, N1);
37766       return DAG.getBitcast(VT, Res);
37767     }
37768     return SDValue();
37769   }
37770   case X86ISD::VPERM2X128: {
37771     // Fold vperm2x128(bitcast(x),bitcast(y),c) -> bitcast(vperm2x128(x,y,c)).
37772     SDValue LHS = N->getOperand(0);
37773     SDValue RHS = N->getOperand(1);
37774     if (LHS.getOpcode() == ISD::BITCAST &&
37775         (RHS.getOpcode() == ISD::BITCAST || RHS.isUndef())) {
37776       EVT SrcVT = LHS.getOperand(0).getValueType();
37777       if (RHS.isUndef() || SrcVT == RHS.getOperand(0).getValueType()) {
37778         return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT,
37779                                               DAG.getBitcast(SrcVT, LHS),
37780                                               DAG.getBitcast(SrcVT, RHS),
37781                                               N->getOperand(2)));
37782       }
37783     }
37784 
37785     // Fold vperm2x128(op(),op()) -> op(vperm2x128(),vperm2x128()).
37786     if (SDValue Res = canonicalizeLaneShuffleWithRepeatedOps(N, DAG, DL))
37787       return Res;
37788 
37789     // Fold vperm2x128 subvector shuffle with an inner concat pattern.
37790     // vperm2x128(concat(X,Y),concat(Z,W)) --> concat X,Y etc.
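    // For example, vperm2x128(concat(A,B), concat(C,D), 0x31) selects the high
    // 128-bit halves of both sources and so becomes concat(B,D).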
37791     auto FindSubVector128 = [&](unsigned Idx) {
37792       if (Idx > 3)
37793         return SDValue();
37794       SDValue Src = peekThroughBitcasts(N.getOperand(Idx < 2 ? 0 : 1));
37795       SmallVector<SDValue> SubOps;
37796       if (collectConcatOps(Src.getNode(), SubOps) && SubOps.size() == 2)
37797         return SubOps[Idx & 1];
37798       unsigned NumElts = Src.getValueType().getVectorNumElements();
37799       if ((Idx & 1) == 1 && Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
37800           Src.getOperand(1).getValueSizeInBits() == 128 &&
37801           Src.getConstantOperandAPInt(2) == (NumElts / 2)) {
37802         return Src.getOperand(1);
37803       }
37804       return SDValue();
37805     };
37806     unsigned Imm = N.getConstantOperandVal(2);
37807     if (SDValue SubLo = FindSubVector128(Imm & 0x0F)) {
37808       if (SDValue SubHi = FindSubVector128((Imm & 0xF0) >> 4)) {
37809         MVT SubVT = VT.getHalfNumVectorElementsVT();
37810         SubLo = DAG.getBitcast(SubVT, SubLo);
37811         SubHi = DAG.getBitcast(SubVT, SubHi);
37812         return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, SubLo, SubHi);
37813       }
37814     }
37815     return SDValue();
37816   }
37817   case X86ISD::PSHUFD:
37818   case X86ISD::PSHUFLW:
37819   case X86ISD::PSHUFHW:
37820     Mask = getPSHUFShuffleMask(N);
37821     assert(Mask.size() == 4);
37822     break;
37823   case X86ISD::MOVSD:
37824   case X86ISD::MOVSS: {
37825     SDValue N0 = N.getOperand(0);
37826     SDValue N1 = N.getOperand(1);
37827 
37828     // Canonicalize scalar FPOps:
37829     // MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))
37830     // If commutable, allow OP(N1[0], N0[0]).
37831     unsigned Opcode1 = N1.getOpcode();
37832     if (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL || Opcode1 == ISD::FSUB ||
37833         Opcode1 == ISD::FDIV) {
37834       SDValue N10 = N1.getOperand(0);
37835       SDValue N11 = N1.getOperand(1);
37836       if (N10 == N0 ||
37837           (N11 == N0 && (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL))) {
37838         if (N10 != N0)
37839           std::swap(N10, N11);
37840         MVT SVT = VT.getVectorElementType();
37841         SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
37842         N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx);
37843         N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx);
37844         SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11);
37845         SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
37846         return DAG.getNode(Opcode, DL, VT, N0, SclVec);
37847       }
37848     }
37849 
37850     return SDValue();
37851   }
37852   case X86ISD::INSERTPS: {
37853     assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
37854     SDValue Op0 = N.getOperand(0);
37855     SDValue Op1 = N.getOperand(1);
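    // The INSERTPS immediate encodes: bits [7:6] select the source element of
    // Op1, bits [5:4] select the destination element, and bits [3:0] zero the
    // corresponding destination elements.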
37856     unsigned InsertPSMask = N.getConstantOperandVal(2);
37857     unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
37858     unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
37859     unsigned ZeroMask = InsertPSMask & 0xF;
37860 
37861     // If we zero out all elements from Op0 then we don't need to reference it.
37862     if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
37863       return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
37864                          DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
37865 
37866     // If we zero out the element from Op1 then we don't need to reference it.
37867     if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
37868       return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
37869                          DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
37870 
37871     // Attempt to merge insertps Op1 with an inner target shuffle node.
37872     SmallVector<int, 8> TargetMask1;
37873     SmallVector<SDValue, 2> Ops1;
37874     APInt KnownUndef1, KnownZero1;
37875     if (getTargetShuffleAndZeroables(Op1, TargetMask1, Ops1, KnownUndef1,
37876                                      KnownZero1)) {
37877       if (KnownUndef1[SrcIdx] || KnownZero1[SrcIdx]) {
37878         // Zero/UNDEF insertion - zero out element and remove dependency.
37879         InsertPSMask |= (1u << DstIdx);
37880         return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
37881                            DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
37882       }
37883       // Update insertps mask srcidx and reference the source input directly.
37884       int M = TargetMask1[SrcIdx];
37885       assert(0 <= M && M < 8 && "Shuffle index out of range");
37886       InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
37887       Op1 = Ops1[M < 4 ? 0 : 1];
37888       return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
37889                          DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
37890     }
37891 
37892     // Attempt to merge insertps Op0 with an inner target shuffle node.
37893     SmallVector<int, 8> TargetMask0;
37894     SmallVector<SDValue, 2> Ops0;
37895     APInt KnownUndef0, KnownZero0;
37896     if (getTargetShuffleAndZeroables(Op0, TargetMask0, Ops0, KnownUndef0,
37897                                      KnownZero0)) {
37898       bool Updated = false;
37899       bool UseInput00 = false;
37900       bool UseInput01 = false;
37901       for (int i = 0; i != 4; ++i) {
37902         if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
37903           // No change if element is already zero or the inserted element.
37904           continue;
37905         } else if (KnownUndef0[i] || KnownZero0[i]) {
37906           // If the target mask is undef/zero then we must zero the element.
37907           InsertPSMask |= (1u << i);
37908           Updated = true;
37909           continue;
37910         }
37911 
37912         // The input vector element must be inline.
37913         int M = TargetMask0[i];
37914         if (M != i && M != (i + 4))
37915           return SDValue();
37916 
37917         // Determine which inputs of the target shuffle we're using.
37918         UseInput00 |= (0 <= M && M < 4);
37919         UseInput01 |= (4 <= M);
37920       }
37921 
37922       // If we're not using both inputs of the target shuffle then use the
37923       // referenced input directly.
37924       if (UseInput00 && !UseInput01) {
37925         Updated = true;
37926         Op0 = Ops0[0];
37927       } else if (!UseInput00 && UseInput01) {
37928         Updated = true;
37929         Op0 = Ops0[1];
37930       }
37931 
37932       if (Updated)
37933         return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
37934                            DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
37935     }
37936 
37937     // If we're inserting an element from a vbroadcast load, fold the
37938     // load into the X86insertps instruction. We need to convert the scalar
37939     // load to a vector and clear the source lane of the INSERTPS control.
37940     if (Op1.getOpcode() == X86ISD::VBROADCAST_LOAD && Op1.hasOneUse()) {
37941       auto *MemIntr = cast<MemIntrinsicSDNode>(Op1);
37942       if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) {
37943         SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(),
37944                                    MemIntr->getBasePtr(),
37945                                    MemIntr->getMemOperand());
37946         SDValue Insert = DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0,
37947                            DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT,
37948                                        Load),
37949                            DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8));
37950         DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
37951         return Insert;
37952       }
37953     }
37954 
37955     return SDValue();
37956   }
37957   default:
37958     return SDValue();
37959   }
37960 
37961   // Nuke no-op shuffles that show up after combining.
37962   if (isNoopShuffleMask(Mask))
37963     return N.getOperand(0);
37964 
37965   // Look for simplifications involving one or two shuffle instructions.
37966   SDValue V = N.getOperand(0);
37967   switch (N.getOpcode()) {
37968   default:
37969     break;
37970   case X86ISD::PSHUFLW:
37971   case X86ISD::PSHUFHW:
37972     assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
37973 
37974     // See if this reduces to a PSHUFD which is no more expensive and can
37975     // combine with more operations. Note that it has to at least flip the
37976     // dwords as otherwise it would have been removed as a no-op.
37977     if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
37978       int DMask[] = {0, 1, 2, 3};
37979       int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
37980       DMask[DOffset + 0] = DOffset + 1;
37981       DMask[DOffset + 1] = DOffset + 0;
37982       MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
37983       V = DAG.getBitcast(DVT, V);
37984       V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
37985                       getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
37986       return DAG.getBitcast(VT, V);
37987     }
37988 
37989     // Look for shuffle patterns which can be implemented as a single unpack.
37990     // FIXME: This doesn't handle the location of the PSHUFD generically, and
37991     // only works when we have a PSHUFD followed by two half-shuffles.
37992     if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
37993         (V.getOpcode() == X86ISD::PSHUFLW ||
37994          V.getOpcode() == X86ISD::PSHUFHW) &&
37995         V.getOpcode() != N.getOpcode() &&
37996         V.hasOneUse() && V.getOperand(0).hasOneUse()) {
37997       SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
37998       if (D.getOpcode() == X86ISD::PSHUFD) {
37999         SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
38000         SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
38001         int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
38002         int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
38003         int WordMask[8];
38004         for (int i = 0; i < 4; ++i) {
38005           WordMask[i + NOffset] = Mask[i] + NOffset;
38006           WordMask[i + VOffset] = VMask[i] + VOffset;
38007         }
38008         // Map the word mask through the DWord mask.
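        // Word w lives in dword w/2; after the dword shuffle it comes from
        // dword DMask[w/2], i.e. from word 2*DMask[w/2] + (w % 2).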
38009         int MappedMask[8];
38010         for (int i = 0; i < 8; ++i)
38011           MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
38012         if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
38013             makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
38014           // We can replace all three shuffles with an unpack.
38015           V = DAG.getBitcast(VT, D.getOperand(0));
38016           return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
38017                                                 : X86ISD::UNPCKH,
38018                              DL, VT, V, V);
38019         }
38020       }
38021     }
38022 
38023     break;
38024 
38025   case X86ISD::PSHUFD:
38026     if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
38027       return NewN;
38028 
38029     break;
38030   }
38031 
38032   return SDValue();
38033 }
38034 
38035 /// Checks if the shuffle mask takes successive elements
38036 /// alternately from two vectors.
38037 /// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct.
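/// In other words, every even result element comes from one source and every
/// odd result element from the other, with element i always taken from
/// position i of whichever source supplies it.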
38038 static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
38039 
38040   int ParitySrc[2] = {-1, -1};
38041   unsigned Size = Mask.size();
38042   for (unsigned i = 0; i != Size; ++i) {
38043     int M = Mask[i];
38044     if (M < 0)
38045       continue;
38046 
38047     // Make sure we are using the matching element from the input.
38048     if ((M % Size) != i)
38049       return false;
38050 
38051     // Make sure we use the same input for all elements of the same parity.
38052     int Src = M / Size;
38053     if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
38054       return false;
38055     ParitySrc[i % 2] = Src;
38056   }
38057 
38058   // Make sure each input is used.
38059   if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
38060     return false;
38061 
38062   Op0Even = ParitySrc[0] == 0;
38063   return true;
38064 }
38065 
38066 /// Returns true iff the shuffle node \p N can be replaced with an
38067 /// ADDSUB(SUBADD) operation. If true is returned then the operands of the
38068 /// ADDSUB(SUBADD) operation are written to the parameters \p Opnd0 and \p Opnd1.
38069 ///
38070 /// We combine shuffle to ADDSUB(SUBADD) directly on the abstract vector
38071 /// shuffle nodes so it is easier to generically match. We also insert dummy
38072 /// vector shuffle nodes for the operands which explicitly discard the lanes
38073 /// which are unused by this operation, so that the fact that they're unused
38074 /// flows through the rest of the combiner.
38075 static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
38076                              SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
38077                              bool &IsSubAdd) {
38078 
38079   EVT VT = N->getValueType(0);
38080   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
38081   if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) ||
38082       !VT.getSimpleVT().isFloatingPoint())
38083     return false;
38084 
38085   // We only handle target-independent shuffles.
38086   // FIXME: It would be easy and harmless to use the target shuffle mask
38087   // extraction tool to support more.
38088   if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
38089     return false;
38090 
38091   SDValue V1 = N->getOperand(0);
38092   SDValue V2 = N->getOperand(1);
38093 
38094   // Make sure we have an FADD and an FSUB.
38095   if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) ||
38096       (V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) ||
38097       V1.getOpcode() == V2.getOpcode())
38098     return false;
38099 
38100   // If there are other uses of these operations we can't fold them.
38101   if (!V1->hasOneUse() || !V2->hasOneUse())
38102     return false;
38103 
38104   // Ensure that both operations have the same operands. Note that we can
38105   // commute the FADD operands.
38106   SDValue LHS, RHS;
38107   if (V1.getOpcode() == ISD::FSUB) {
38108     LHS = V1->getOperand(0); RHS = V1->getOperand(1);
38109     if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
38110         (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
38111       return false;
38112   } else {
38113     assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode");
38114     LHS = V2->getOperand(0); RHS = V2->getOperand(1);
38115     if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
38116         (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
38117       return false;
38118   }
38119 
38120   ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
38121   bool Op0Even;
38122   if (!isAddSubOrSubAddMask(Mask, Op0Even))
38123     return false;
38124 
38125   // It's a subadd if the vector in the even parity is an FADD.
38126   IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
38127                      : V2->getOpcode() == ISD::FADD;
38128 
38129   Opnd0 = LHS;
38130   Opnd1 = RHS;
38131   return true;
38132 }
38133 
38134 /// Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
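///
/// Illustrative example (assuming the usual FMADDSUB lane semantics, where
/// even result lanes compute a*b-c and odd result lanes compute a*b+c):
///   shuffle (X86Fmsub a, b, c), (fma a, b, c), <0,5,2,7> --> FMADDSUB a, b, c
/// because the even output lanes come from the FMSUB node and the odd output
/// lanes come from the FMA node.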
38135 static SDValue combineShuffleToFMAddSub(SDNode *N,
38136                                         const X86Subtarget &Subtarget,
38137                                         SelectionDAG &DAG) {
38138   // We only handle target-independent shuffles.
38139   // FIXME: It would be easy and harmless to use the target shuffle mask
38140   // extraction tool to support more.
38141   if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
38142     return SDValue();
38143 
38144   MVT VT = N->getSimpleValueType(0);
38145   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
38146   if (!Subtarget.hasAnyFMA() || !TLI.isTypeLegal(VT))
38147     return SDValue();
38148 
38149   // We're trying to match (shuffle fma(a, b, c), X86Fmsub(a, b, c)).
38150   SDValue Op0 = N->getOperand(0);
38151   SDValue Op1 = N->getOperand(1);
38152   SDValue FMAdd = Op0, FMSub = Op1;
38153   if (FMSub.getOpcode() != X86ISD::FMSUB)
38154     std::swap(FMAdd, FMSub);
38155 
38156   if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB ||
38157       FMAdd.getOperand(0) != FMSub.getOperand(0) || !FMAdd.hasOneUse() ||
38158       FMAdd.getOperand(1) != FMSub.getOperand(1) || !FMSub.hasOneUse() ||
38159       FMAdd.getOperand(2) != FMSub.getOperand(2))
38160     return SDValue();
38161 
38162   // Check for correct shuffle mask.
38163   ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
38164   bool Op0Even;
38165   if (!isAddSubOrSubAddMask(Mask, Op0Even))
38166     return SDValue();
38167 
38168   // FMAddSub takes its even-lane elements from the FMSub node.
38169   SDLoc DL(N);
38170   bool IsSubAdd = Op0Even ? Op0 == FMAdd : Op1 == FMAdd;
38171   unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
38172   return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1),
38173                      FMAdd.getOperand(2));
38174 }
38175 
38176 /// Try to combine a shuffle into a target-specific add-sub or
38177 /// mul-add-sub node.
38178 static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
38179                                                 const X86Subtarget &Subtarget,
38180                                                 SelectionDAG &DAG) {
38181   if (SDValue V = combineShuffleToFMAddSub(N, Subtarget, DAG))
38182     return V;
38183 
38184   SDValue Opnd0, Opnd1;
38185   bool IsSubAdd;
38186   if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd))
38187     return SDValue();
38188 
38189   MVT VT = N->getSimpleValueType(0);
38190   SDLoc DL(N);
38191 
38192   // Try to generate X86ISD::FMADDSUB node here.
38193   SDValue Opnd2;
38194   if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) {
38195     unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
38196     return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
38197   }
38198 
38199   if (IsSubAdd)
38200     return SDValue();
38201 
38202   // Do not generate X86ISD::ADDSUB node for 512-bit types even though
38203   // the ADDSUB idiom has been successfully recognized. There are no known
38204   // X86 targets with 512-bit ADDSUB instructions!
38205   if (VT.is512BitVector())
38206     return SDValue();
38207 
38208   return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
38209 }
38210 
38211 // We are looking for a shuffle where both sources are concatenated with undef
38212 // and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
38213 // if we can express this as a single-source shuffle, that's preferable.
38214 static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
38215                                            const X86Subtarget &Subtarget) {
38216   if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
38217     return SDValue();
38218 
38219   EVT VT = N->getValueType(0);
38220 
38221   // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
38222   if (!VT.is128BitVector() && !VT.is256BitVector())
38223     return SDValue();
38224 
38225   if (VT.getVectorElementType() != MVT::i32 &&
38226       VT.getVectorElementType() != MVT::i64 &&
38227       VT.getVectorElementType() != MVT::f32 &&
38228       VT.getVectorElementType() != MVT::f64)
38229     return SDValue();
38230 
38231   SDValue N0 = N->getOperand(0);
38232   SDValue N1 = N->getOperand(1);
38233 
38234   // Check that both sources are concats with undef.
38235   if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
38236       N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
38237       N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
38238       !N1.getOperand(1).isUndef())
38239     return SDValue();
38240 
38241   // Construct the new shuffle mask. Elements from the first source retain their
38242   // index, but elements from the second source no longer need to skip an undef.
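  // For example (illustrative): with a v8i32 output and v4i32 sources t1/t2,
  // an input mask index of 9 (the second element of t2 inside
  // (concat t2, undef)) becomes 9 - 4 == 5, its position in (concat t1, t2).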
38243   SmallVector<int, 8> Mask;
38244   int NumElts = VT.getVectorNumElements();
38245 
38246   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
38247   for (int Elt : SVOp->getMask())
38248     Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
38249 
38250   SDLoc DL(N);
38251   SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
38252                                N1.getOperand(0));
38253   return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
38254 }
38255 
38256 /// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the
38257 /// low half of each source vector and does not set any high half elements in
38258 /// the destination vector, narrow the shuffle to half its original size.
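///
/// For example (illustrative): a v8f32 shuffle whose mask only references
/// elements 0-3 of each source and leaves the upper four result elements
/// undef can instead be performed as a v4f32 shuffle of the low halves, with
/// the result re-inserted into an undef v8f32.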
38259 static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) {
38260   if (!Shuf->getValueType(0).isSimple())
38261     return SDValue();
38262   MVT VT = Shuf->getSimpleValueType(0);
38263   if (!VT.is256BitVector() && !VT.is512BitVector())
38264     return SDValue();
38265 
38266   // See if we can ignore all of the high elements of the shuffle.
38267   ArrayRef<int> Mask = Shuf->getMask();
38268   if (!isUndefUpperHalf(Mask))
38269     return SDValue();
38270 
38271   // Check if the shuffle mask accesses only the low half of each input vector
38272   // (half-index output is 0 or 2).
38273   int HalfIdx1, HalfIdx2;
38274   SmallVector<int, 8> HalfMask(Mask.size() / 2);
38275   if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2) ||
38276       (HalfIdx1 % 2 == 1) || (HalfIdx2 % 2 == 1))
38277     return SDValue();
38278 
38279   // Create a half-width shuffle to replace the unnecessarily wide shuffle.
38280   // The trick is knowing that all of the insert/extract are actually free
38281   // subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle
38282   // of narrow inputs into a narrow output, and that is always cheaper than
38283   // the wide shuffle that we started with.
38284   return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0),
38285                                Shuf->getOperand(1), HalfMask, HalfIdx1,
38286                                HalfIdx2, false, DAG, /*UseConcat*/true);
38287 }
38288 
38289 static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
38290                               TargetLowering::DAGCombinerInfo &DCI,
38291                               const X86Subtarget &Subtarget) {
38292   if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N))
38293     if (SDValue V = narrowShuffle(Shuf, DAG))
38294       return V;
38295 
38296   // If we have legalized the vector types, look for blends of FADD and FSUB
38297   // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
38298   SDLoc dl(N);
38299   EVT VT = N->getValueType(0);
38300   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
38301   if (TLI.isTypeLegal(VT))
38302     if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
38303       return AddSub;
38304 
38305   // Attempt to combine into a vector load/broadcast.
38306   if (SDValue LD = combineToConsecutiveLoads(VT, SDValue(N, 0), dl, DAG,
38307                                              Subtarget, true))
38308     return LD;
38309 
38310   // For AVX2, we sometimes want to combine
38311   // (vector_shuffle <mask> (concat_vectors t1, undef)
38312   //                        (concat_vectors t2, undef))
38313   // Into:
38314   // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
38315   // Since the latter can be efficiently lowered with VPERMD/VPERMQ
38316   if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
38317     return ShufConcat;
38318 
38319   if (isTargetShuffle(N->getOpcode())) {
38320     SDValue Op(N, 0);
38321     if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
38322       return Shuffle;
38323 
38324     // Try recursively combining arbitrary sequences of x86 shuffle
38325     // instructions into higher-order shuffles. We do this after combining
38326     // specific PSHUF instruction sequences into their minimal form so that we
38327     // can evaluate how many specialized shuffle instructions are involved in
38328     // a particular chain.
38329     if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
38330       return Res;
38331 
38332     // Simplify source operands based on shuffle mask.
38333     // TODO - merge this into combineX86ShufflesRecursively.
38334     APInt KnownUndef, KnownZero;
38335     APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
38336     if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, KnownUndef, KnownZero,
38337                                        DCI))
38338       return SDValue(N, 0);
38339   }
38340 
38341   return SDValue();
38342 }
38343 
38344 // Simplify variable target shuffle masks based on the demanded elements.
38345 // TODO: Handle DemandedBits in mask indices as well?
38346 bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetShuffle(
38347     SDValue Op, const APInt &DemandedElts, unsigned MaskIndex,
38348     TargetLowering::TargetLoweringOpt &TLO, unsigned Depth) const {
38349   // If we're demanding all elements don't bother trying to simplify the mask.
38350   unsigned NumElts = DemandedElts.getBitWidth();
38351   if (DemandedElts.isAllOnesValue())
38352     return false;
38353 
38354   SDValue Mask = Op.getOperand(MaskIndex);
38355   if (!Mask.hasOneUse())
38356     return false;
38357 
38358   // Attempt to generically simplify the variable shuffle mask.
38359   APInt MaskUndef, MaskZero;
38360   if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
38361                                  Depth + 1))
38362     return true;
38363 
38364   // Attempt to extract+simplify a (constant pool load) shuffle mask.
38365   // TODO: Support other types from getTargetShuffleMaskIndices?
38366   SDValue BC = peekThroughOneUseBitcasts(Mask);
38367   EVT BCVT = BC.getValueType();
38368   auto *Load = dyn_cast<LoadSDNode>(BC);
38369   if (!Load)
38370     return false;
38371 
38372   const Constant *C = getTargetConstantFromNode(Load);
38373   if (!C)
38374     return false;
38375 
38376   Type *CTy = C->getType();
38377   if (!CTy->isVectorTy() ||
38378       CTy->getPrimitiveSizeInBits() != Mask.getValueSizeInBits())
38379     return false;
38380 
38381   // Handle scaling for i64 elements on 32-bit targets.
38382   unsigned NumCstElts = cast<FixedVectorType>(CTy)->getNumElements();
38383   if (NumCstElts != NumElts && NumCstElts != (NumElts * 2))
38384     return false;
38385   unsigned Scale = NumCstElts / NumElts;
38386 
38387   // Simplify mask if we have an undemanded element that is not undef.
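  // For example (illustrative): if only the low lane of a PSHUFB result is
  // demanded, every constant mask element belonging to an undemanded lane is
  // rewritten to undef below, which may allow further simplification of the
  // rebuilt constant.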
38388   bool Simplified = false;
38389   SmallVector<Constant *, 32> ConstVecOps;
38390   for (unsigned i = 0; i != NumCstElts; ++i) {
38391     Constant *Elt = C->getAggregateElement(i);
38392     if (!DemandedElts[i / Scale] && !isa<UndefValue>(Elt)) {
38393       ConstVecOps.push_back(UndefValue::get(Elt->getType()));
38394       Simplified = true;
38395       continue;
38396     }
38397     ConstVecOps.push_back(Elt);
38398   }
38399   if (!Simplified)
38400     return false;
38401 
38402   // Generate new constant pool entry + legalize immediately for the load.
38403   SDLoc DL(Op);
38404   SDValue CV = TLO.DAG.getConstantPool(ConstantVector::get(ConstVecOps), BCVT);
38405   SDValue LegalCV = LowerConstantPool(CV, TLO.DAG);
38406   SDValue NewMask = TLO.DAG.getLoad(
38407       BCVT, DL, TLO.DAG.getEntryNode(), LegalCV,
38408       MachinePointerInfo::getConstantPool(TLO.DAG.getMachineFunction()),
38409       Load->getAlign());
38410   return TLO.CombineTo(Mask, TLO.DAG.getBitcast(Mask.getValueType(), NewMask));
38411 }
38412 
38413 bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
38414     SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
38415     TargetLoweringOpt &TLO, unsigned Depth) const {
38416   int NumElts = DemandedElts.getBitWidth();
38417   unsigned Opc = Op.getOpcode();
38418   EVT VT = Op.getValueType();
38419 
38420   // Handle special case opcodes.
38421   switch (Opc) {
38422   case X86ISD::PMULDQ:
38423   case X86ISD::PMULUDQ: {
38424     APInt LHSUndef, LHSZero;
38425     APInt RHSUndef, RHSZero;
38426     SDValue LHS = Op.getOperand(0);
38427     SDValue RHS = Op.getOperand(1);
38428     if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
38429                                    Depth + 1))
38430       return true;
38431     if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
38432                                    Depth + 1))
38433       return true;
38434     // Multiply by zero.
38435     KnownZero = LHSZero | RHSZero;
38436     break;
38437   }
38438   case X86ISD::VSHL:
38439   case X86ISD::VSRL:
38440   case X86ISD::VSRA: {
38441     // We only need the bottom 64-bits of the (128-bit) shift amount.
38442     SDValue Amt = Op.getOperand(1);
38443     MVT AmtVT = Amt.getSimpleValueType();
38444     assert(AmtVT.is128BitVector() && "Unexpected value type");
38445 
38446     // If every use of the shift amount is as an SSE shift amount, then we
38447     // know that only the bottom 64-bits are ever used.
38448     bool AssumeSingleUse = llvm::all_of(Amt->uses(), [&Amt](SDNode *Use) {
38449       unsigned UseOpc = Use->getOpcode();
38450       return (UseOpc == X86ISD::VSHL || UseOpc == X86ISD::VSRL ||
38451               UseOpc == X86ISD::VSRA) &&
38452              Use->getOperand(0) != Amt;
38453     });
38454 
38455     APInt AmtUndef, AmtZero;
38456     unsigned NumAmtElts = AmtVT.getVectorNumElements();
38457     APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2);
38458     if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO,
38459                                    Depth + 1, AssumeSingleUse))
38460       return true;
38461     LLVM_FALLTHROUGH;
38462   }
38463   case X86ISD::VSHLI:
38464   case X86ISD::VSRLI:
38465   case X86ISD::VSRAI: {
38466     SDValue Src = Op.getOperand(0);
38467     APInt SrcUndef;
38468     if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO,
38469                                    Depth + 1))
38470       return true;
38471 
38472     // Aggressively peek through ops to get at the demanded elts.
38473     if (!DemandedElts.isAllOnesValue())
38474       if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
38475               Src, DemandedElts, TLO.DAG, Depth + 1))
38476         return TLO.CombineTo(
38477             Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc, Op.getOperand(1)));
38478     break;
38479   }
38480   case X86ISD::KSHIFTL: {
38481     SDValue Src = Op.getOperand(0);
38482     auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
38483     assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
38484     unsigned ShiftAmt = Amt->getZExtValue();
38485 
38486     if (ShiftAmt == 0)
38487       return TLO.CombineTo(Op, Src);
38488 
38489     // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
38490     // single shift.  We can do this if the bottom bits (which are shifted
38491     // out) are never demanded.
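    // For example (illustrative): KSHIFTL(KSHIFTR(X, 2), 3) can become
    // KSHIFTL(X, 1) provided the low 3 result elements are never demanded,
    // since those are the only elements on which the two forms differ.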
38492     if (Src.getOpcode() == X86ISD::KSHIFTR) {
38493       if (!DemandedElts.intersects(APInt::getLowBitsSet(NumElts, ShiftAmt))) {
38494         unsigned C1 = Src.getConstantOperandVal(1);
38495         unsigned NewOpc = X86ISD::KSHIFTL;
38496         int Diff = ShiftAmt - C1;
38497         if (Diff < 0) {
38498           Diff = -Diff;
38499           NewOpc = X86ISD::KSHIFTR;
38500         }
38501 
38502         SDLoc dl(Op);
38503         SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
38504         return TLO.CombineTo(
38505             Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
38506       }
38507     }
38508 
38509     APInt DemandedSrc = DemandedElts.lshr(ShiftAmt);
38510     if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
38511                                    Depth + 1))
38512       return true;
38513 
38514     KnownUndef <<= ShiftAmt;
38515     KnownZero <<= ShiftAmt;
38516     KnownZero.setLowBits(ShiftAmt);
38517     break;
38518   }
38519   case X86ISD::KSHIFTR: {
38520     SDValue Src = Op.getOperand(0);
38521     auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
38522     assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
38523     unsigned ShiftAmt = Amt->getZExtValue();
38524 
38525     if (ShiftAmt == 0)
38526       return TLO.CombineTo(Op, Src);
38527 
38528     // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
38529     // single shift.  We can do this if the top bits (which are shifted
38530     // out) are never demanded.
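    // For example (illustrative): KSHIFTR(KSHIFTL(X, 2), 3) can become
    // KSHIFTR(X, 1) provided the high 3 result elements are never demanded.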
38531     if (Src.getOpcode() == X86ISD::KSHIFTL) {
38532       if (!DemandedElts.intersects(APInt::getHighBitsSet(NumElts, ShiftAmt))) {
38533         unsigned C1 = Src.getConstantOperandVal(1);
38534         unsigned NewOpc = X86ISD::KSHIFTR;
38535         int Diff = ShiftAmt - C1;
38536         if (Diff < 0) {
38537           Diff = -Diff;
38538           NewOpc = X86ISD::KSHIFTL;
38539         }
38540 
38541         SDLoc dl(Op);
38542         SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
38543         return TLO.CombineTo(
38544             Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
38545       }
38546     }
38547 
38548     APInt DemandedSrc = DemandedElts.shl(ShiftAmt);
38549     if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
38550                                    Depth + 1))
38551       return true;
38552 
38553     KnownUndef.lshrInPlace(ShiftAmt);
38554     KnownZero.lshrInPlace(ShiftAmt);
38555     KnownZero.setHighBits(ShiftAmt);
38556     break;
38557   }
38558   case X86ISD::CVTSI2P:
38559   case X86ISD::CVTUI2P: {
38560     SDValue Src = Op.getOperand(0);
38561     MVT SrcVT = Src.getSimpleValueType();
38562     APInt SrcUndef, SrcZero;
38563     APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
38564     if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
38565                                    Depth + 1))
38566       return true;
38567     break;
38568   }
38569   case X86ISD::PACKSS:
38570   case X86ISD::PACKUS: {
38571     SDValue N0 = Op.getOperand(0);
38572     SDValue N1 = Op.getOperand(1);
38573 
38574     APInt DemandedLHS, DemandedRHS;
38575     getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
38576 
38577     APInt LHSUndef, LHSZero;
38578     if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
38579                                    Depth + 1))
38580       return true;
38581     APInt RHSUndef, RHSZero;
38582     if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
38583                                    Depth + 1))
38584       return true;
38585 
38586     // TODO - pass on known zero/undef.
38587 
38588     // Aggressively peek through ops to get at the demanded elts.
38589     // TODO - we should do this for all target/faux shuffles ops.
38590     if (!DemandedElts.isAllOnesValue()) {
38591       SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
38592                                                             TLO.DAG, Depth + 1);
38593       SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
38594                                                             TLO.DAG, Depth + 1);
38595       if (NewN0 || NewN1) {
38596         NewN0 = NewN0 ? NewN0 : N0;
38597         NewN1 = NewN1 ? NewN1 : N1;
38598         return TLO.CombineTo(Op,
38599                              TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
38600       }
38601     }
38602     break;
38603   }
38604   case X86ISD::HADD:
38605   case X86ISD::HSUB:
38606   case X86ISD::FHADD:
38607   case X86ISD::FHSUB: {
38608     SDValue N0 = Op.getOperand(0);
38609     SDValue N1 = Op.getOperand(1);
38610 
38611     APInt DemandedLHS, DemandedRHS;
38612     getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
38613 
38614     APInt LHSUndef, LHSZero;
38615     if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
38616                                    Depth + 1))
38617       return true;
38618     APInt RHSUndef, RHSZero;
38619     if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
38620                                    Depth + 1))
38621       return true;
38622 
38623     // TODO - pass on known zero/undef.
38624 
38625     // Aggressively peek through ops to get at the demanded elts.
38626     // TODO: Handle repeated operands.
38627     if (N0 != N1 && !DemandedElts.isAllOnesValue()) {
38628       SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
38629                                                             TLO.DAG, Depth + 1);
38630       SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
38631                                                             TLO.DAG, Depth + 1);
38632       if (NewN0 || NewN1) {
38633         NewN0 = NewN0 ? NewN0 : N0;
38634         NewN1 = NewN1 ? NewN1 : N1;
38635         return TLO.CombineTo(Op,
38636                              TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
38637       }
38638     }
38639     break;
38640   }
38641   case X86ISD::VTRUNC:
38642   case X86ISD::VTRUNCS:
38643   case X86ISD::VTRUNCUS: {
38644     SDValue Src = Op.getOperand(0);
38645     MVT SrcVT = Src.getSimpleValueType();
38646     APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
38647     APInt SrcUndef, SrcZero;
38648     if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO,
38649                                    Depth + 1))
38650       return true;
38651     KnownZero = SrcZero.zextOrTrunc(NumElts);
38652     KnownUndef = SrcUndef.zextOrTrunc(NumElts);
38653     break;
38654   }
38655   case X86ISD::BLENDV: {
38656     APInt SelUndef, SelZero;
38657     if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef,
38658                                    SelZero, TLO, Depth + 1))
38659       return true;
38660 
38661     // TODO: Use SelZero to adjust LHS/RHS DemandedElts.
38662     APInt LHSUndef, LHSZero;
38663     if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef,
38664                                    LHSZero, TLO, Depth + 1))
38665       return true;
38666 
38667     APInt RHSUndef, RHSZero;
38668     if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef,
38669                                    RHSZero, TLO, Depth + 1))
38670       return true;
38671 
38672     KnownZero = LHSZero & RHSZero;
38673     KnownUndef = LHSUndef & RHSUndef;
38674     break;
38675   }
38676   case X86ISD::VZEXT_MOVL: {
38677     // If upper demanded elements are already zero then we have nothing to do.
38678     SDValue Src = Op.getOperand(0);
38679     APInt DemandedUpperElts = DemandedElts;
38680     DemandedUpperElts.clearLowBits(1);
38681     if (TLO.DAG.computeKnownBits(Src, DemandedUpperElts, Depth + 1).isZero())
38682       return TLO.CombineTo(Op, Src);
38683     break;
38684   }
38685   case X86ISD::VBROADCAST: {
38686     SDValue Src = Op.getOperand(0);
38687     MVT SrcVT = Src.getSimpleValueType();
38688     if (!SrcVT.isVector())
38689       break;
38690     // Don't bother broadcasting if we just need the 0'th element.
38691     if (DemandedElts == 1) {
38692       if (Src.getValueType() != VT)
38693         Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG,
38694                              SDLoc(Op));
38695       return TLO.CombineTo(Op, Src);
38696     }
38697     APInt SrcUndef, SrcZero;
38698     APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);
38699     if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
38700                                    Depth + 1))
38701       return true;
38702     // Aggressively peek through src to get at the demanded elt.
38703     // TODO - we should do this for all target/faux shuffles ops.
38704     if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
38705             Src, SrcElts, TLO.DAG, Depth + 1))
38706       return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
38707     break;
38708   }
38709   case X86ISD::VPERMV:
38710     if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 0, TLO,
38711                                                    Depth))
38712       return true;
38713     break;
38714   case X86ISD::PSHUFB:
38715   case X86ISD::VPERMV3:
38716   case X86ISD::VPERMILPV:
38717     if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 1, TLO,
38718                                                    Depth))
38719       return true;
38720     break;
38721   case X86ISD::VPPERM:
38722   case X86ISD::VPERMIL2:
38723     if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 2, TLO,
38724                                                    Depth))
38725       return true;
38726     break;
38727   }
38728 
38729   // For 256/512-bit ops that are 128/256-bit ops glued together, if we do not
38730   // demand any of the high elements, then narrow the op to 128/256-bits: e.g.
38731   // (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0
38732   if ((VT.is256BitVector() || VT.is512BitVector()) &&
38733       DemandedElts.lshr(NumElts / 2) == 0) {
38734     unsigned SizeInBits = VT.getSizeInBits();
38735     unsigned ExtSizeInBits = SizeInBits / 2;
38736 
38737     // See if 512-bit ops only use the bottom 128-bits.
38738     if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0)
38739       ExtSizeInBits = SizeInBits / 4;
38740 
38741     switch (Opc) {
38742       // Scalar broadcast.
38743     case X86ISD::VBROADCAST: {
38744       SDLoc DL(Op);
38745       SDValue Src = Op.getOperand(0);
38746       if (Src.getValueSizeInBits() > ExtSizeInBits)
38747         Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits);
38748       EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
38749                                     ExtSizeInBits / VT.getScalarSizeInBits());
38750       SDValue Bcst = TLO.DAG.getNode(X86ISD::VBROADCAST, DL, BcstVT, Src);
38751       return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
38752                                                TLO.DAG, DL, ExtSizeInBits));
38753     }
38754     case X86ISD::VBROADCAST_LOAD: {
38755       SDLoc DL(Op);
38756       auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
38757       EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
38758                                     ExtSizeInBits / VT.getScalarSizeInBits());
38759       SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other);
38760       SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)};
38761       SDValue Bcst = TLO.DAG.getMemIntrinsicNode(
38762           X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(),
38763           MemIntr->getMemOperand());
38764       TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
38765                                            Bcst.getValue(1));
38766       return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
38767                                                TLO.DAG, DL, ExtSizeInBits));
38768     }
38769       // Subvector broadcast.
38770     case X86ISD::SUBV_BROADCAST_LOAD: {
38771       auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
38772       EVT MemVT = MemIntr->getMemoryVT();
38773       if (ExtSizeInBits == MemVT.getStoreSizeInBits()) {
38774         SDLoc DL(Op);
38775         SDValue Ld =
38776             TLO.DAG.getLoad(MemVT, DL, MemIntr->getChain(),
38777                             MemIntr->getBasePtr(), MemIntr->getMemOperand());
38778         TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
38779                                              Ld.getValue(1));
38780         return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Ld, 0,
38781                                                  TLO.DAG, DL, ExtSizeInBits));
38782       } else if ((ExtSizeInBits % MemVT.getStoreSizeInBits()) == 0) {
38783         SDLoc DL(Op);
38784         EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
38785                                       ExtSizeInBits / VT.getScalarSizeInBits());
38786         SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other);
38787         SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)};
38788         SDValue Bcst =
38789             TLO.DAG.getMemIntrinsicNode(X86ISD::SUBV_BROADCAST_LOAD, DL, Tys,
38790                                         Ops, MemVT, MemIntr->getMemOperand());
38791         TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
38792                                              Bcst.getValue(1));
38793         return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
38794                                                  TLO.DAG, DL, ExtSizeInBits));
38795       }
38796       break;
38797     }
38798       // Byte shifts by immediate.
38799     case X86ISD::VSHLDQ:
38800     case X86ISD::VSRLDQ:
38801       // Shift by uniform.
38802     case X86ISD::VSHL:
38803     case X86ISD::VSRL:
38804     case X86ISD::VSRA:
38805       // Shift by immediate.
38806     case X86ISD::VSHLI:
38807     case X86ISD::VSRLI:
38808     case X86ISD::VSRAI: {
38809       SDLoc DL(Op);
38810       SDValue Ext0 =
38811           extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
38812       SDValue ExtOp =
38813           TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Op.getOperand(1));
38814       SDValue UndefVec = TLO.DAG.getUNDEF(VT);
38815       SDValue Insert =
38816           insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
38817       return TLO.CombineTo(Op, Insert);
38818     }
38819     case X86ISD::VPERMI: {
38820       // Simplify PERMPD/PERMQ to extract_subvector.
38821       // TODO: This should be done in shuffle combining.
38822       if (VT == MVT::v4f64 || VT == MVT::v4i64) {
38823         SmallVector<int, 4> Mask;
38824         DecodeVPERMMask(NumElts, Op.getConstantOperandVal(1), Mask);
38825         if (isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3)) {
38826           SDLoc DL(Op);
38827           SDValue Ext = extractSubVector(Op.getOperand(0), 2, TLO.DAG, DL, 128);
38828           SDValue UndefVec = TLO.DAG.getUNDEF(VT);
38829           SDValue Insert = insertSubVector(UndefVec, Ext, 0, TLO.DAG, DL, 128);
38830           return TLO.CombineTo(Op, Insert);
38831         }
38832       }
38833       break;
38834     }
38835       // Zero upper elements.
38836     case X86ISD::VZEXT_MOVL:
38837       // Target unary shuffles by immediate:
38838     case X86ISD::PSHUFD:
38839     case X86ISD::PSHUFLW:
38840     case X86ISD::PSHUFHW:
38841     case X86ISD::VPERMILPI:
38842       // (Non-Lane Crossing) Target Shuffles.
38843     case X86ISD::VPERMILPV:
38844     case X86ISD::VPERMIL2:
38845     case X86ISD::PSHUFB:
38846     case X86ISD::UNPCKL:
38847     case X86ISD::UNPCKH:
38848     case X86ISD::BLENDI:
38849       // Integer ops.
38850     case X86ISD::AVG:
38851     case X86ISD::PACKSS:
38852     case X86ISD::PACKUS:
38853       // Horizontal Ops.
38854     case X86ISD::HADD:
38855     case X86ISD::HSUB:
38856     case X86ISD::FHADD:
38857     case X86ISD::FHSUB: {
38858       SDLoc DL(Op);
38859       SmallVector<SDValue, 4> Ops;
38860       for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
38861         SDValue SrcOp = Op.getOperand(i);
38862         EVT SrcVT = SrcOp.getValueType();
38863         assert((!SrcVT.isVector() || SrcVT.getSizeInBits() == SizeInBits) &&
38864                "Unsupported vector size");
38865         Ops.push_back(SrcVT.isVector() ? extractSubVector(SrcOp, 0, TLO.DAG, DL,
38866                                                           ExtSizeInBits)
38867                                        : SrcOp);
38868       }
38869       MVT ExtVT = VT.getSimpleVT();
38870       ExtVT = MVT::getVectorVT(ExtVT.getScalarType(),
38871                                ExtSizeInBits / ExtVT.getScalarSizeInBits());
38872       SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ops);
38873       SDValue UndefVec = TLO.DAG.getUNDEF(VT);
38874       SDValue Insert =
38875           insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
38876       return TLO.CombineTo(Op, Insert);
38877     }
38878     }
38879   }
38880 
38881   // Get target/faux shuffle mask.
38882   APInt OpUndef, OpZero;
38883   SmallVector<int, 64> OpMask;
38884   SmallVector<SDValue, 2> OpInputs;
38885   if (!getTargetShuffleInputs(Op, DemandedElts, OpInputs, OpMask, OpUndef,
38886                               OpZero, TLO.DAG, Depth, false))
38887     return false;
38888 
38889   // Shuffle inputs must be the same size as the result.
38890   if (OpMask.size() != (unsigned)NumElts ||
38891       llvm::any_of(OpInputs, [VT](SDValue V) {
38892         return VT.getSizeInBits() != V.getValueSizeInBits() ||
38893                !V.getValueType().isVector();
38894       }))
38895     return false;
38896 
38897   KnownZero = OpZero;
38898   KnownUndef = OpUndef;
38899 
38900   // Check if shuffle mask can be simplified to undef/zero/identity.
38901   int NumSrcs = OpInputs.size();
38902   for (int i = 0; i != NumElts; ++i)
38903     if (!DemandedElts[i])
38904       OpMask[i] = SM_SentinelUndef;
38905 
38906   if (isUndefInRange(OpMask, 0, NumElts)) {
38907     KnownUndef.setAllBits();
38908     return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
38909   }
38910   if (isUndefOrZeroInRange(OpMask, 0, NumElts)) {
38911     KnownZero.setAllBits();
38912     return TLO.CombineTo(
38913         Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
38914   }
38915   for (int Src = 0; Src != NumSrcs; ++Src)
38916     if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts))
38917       return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, OpInputs[Src]));
38918 
38919   // Attempt to simplify inputs.
38920   for (int Src = 0; Src != NumSrcs; ++Src) {
38921     // TODO: Support inputs of different types.
38922     if (OpInputs[Src].getValueType() != VT)
38923       continue;
38924 
38925     int Lo = Src * NumElts;
38926     APInt SrcElts = APInt::getNullValue(NumElts);
38927     for (int i = 0; i != NumElts; ++i)
38928       if (DemandedElts[i]) {
38929         int M = OpMask[i] - Lo;
38930         if (0 <= M && M < NumElts)
38931           SrcElts.setBit(M);
38932       }
38933 
38934     // TODO - Propagate input undef/zero elts.
38935     APInt SrcUndef, SrcZero;
38936     if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero,
38937                                    TLO, Depth + 1))
38938       return true;
38939   }
38940 
38941   // If we don't demand all elements, then attempt to combine to a simpler
38942   // shuffle.
38943   // We need to convert the depth to something combineX86ShufflesRecursively
38944   // can handle - so pretend its Depth == 0 again, and reduce the max depth
38945   // to match. This prevents combineX86ShuffleChain from returning a
38946   // combined shuffle that's the same as the original root, causing an
38947   // infinite loop.
38948   if (!DemandedElts.isAllOnesValue()) {
38949     assert(Depth < X86::MaxShuffleCombineDepth && "Depth out of range");
38950 
38951     SmallVector<int, 64> DemandedMask(NumElts, SM_SentinelUndef);
38952     for (int i = 0; i != NumElts; ++i)
38953       if (DemandedElts[i])
38954         DemandedMask[i] = i;
38955 
38956     SDValue NewShuffle = combineX86ShufflesRecursively(
38957         {Op}, 0, Op, DemandedMask, {}, 0, X86::MaxShuffleCombineDepth - Depth,
38958         /*HasVarMask*/ false,
38959         /*AllowVarMask*/ true, TLO.DAG, Subtarget);
38960     if (NewShuffle)
38961       return TLO.CombineTo(Op, NewShuffle);
38962   }
38963 
38964   return false;
38965 }
38966 
38967 bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
38968     SDValue Op, const APInt &OriginalDemandedBits,
38969     const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
38970     unsigned Depth) const {
38971   EVT VT = Op.getValueType();
38972   unsigned BitWidth = OriginalDemandedBits.getBitWidth();
38973   unsigned Opc = Op.getOpcode();
38974   switch(Opc) {
38975   case X86ISD::VTRUNC: {
38976     KnownBits KnownOp;
38977     SDValue Src = Op.getOperand(0);
38978     MVT SrcVT = Src.getSimpleValueType();
38979 
38980     // Simplify the input, using demanded bit information.
38981     APInt TruncMask = OriginalDemandedBits.zext(SrcVT.getScalarSizeInBits());
38982     APInt DemandedElts = OriginalDemandedElts.trunc(SrcVT.getVectorNumElements());
38983     if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO, Depth + 1))
38984       return true;
38985     break;
38986   }
38987   case X86ISD::PMULDQ:
38988   case X86ISD::PMULUDQ: {
38989     // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
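    // For example (illustrative): for a v2i64 PMULUDQ, bits 32-63 of each
    // 64-bit source element never influence the products, so only the low 32
    // bits of LHS and RHS are demanded below.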
38990     KnownBits KnownOp;
38991     SDValue LHS = Op.getOperand(0);
38992     SDValue RHS = Op.getOperand(1);
38993     // FIXME: Can we bound this better?
38994     APInt DemandedMask = APInt::getLowBitsSet(64, 32);
38995     if (SimplifyDemandedBits(LHS, DemandedMask, OriginalDemandedElts, KnownOp,
38996                              TLO, Depth + 1))
38997       return true;
38998     if (SimplifyDemandedBits(RHS, DemandedMask, OriginalDemandedElts, KnownOp,
38999                              TLO, Depth + 1))
39000       return true;
39001 
39002     // Aggressively peek through ops to get at the demanded low bits.
39003     SDValue DemandedLHS = SimplifyMultipleUseDemandedBits(
39004         LHS, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
39005     SDValue DemandedRHS = SimplifyMultipleUseDemandedBits(
39006         RHS, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
39007     if (DemandedLHS || DemandedRHS) {
39008       DemandedLHS = DemandedLHS ? DemandedLHS : LHS;
39009       DemandedRHS = DemandedRHS ? DemandedRHS : RHS;
39010       return TLO.CombineTo(
39011           Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, DemandedLHS, DemandedRHS));
39012     }
39013     break;
39014   }
39015   case X86ISD::VSHLI: {
39016     SDValue Op0 = Op.getOperand(0);
39017 
39018     unsigned ShAmt = Op.getConstantOperandVal(1);
39019     if (ShAmt >= BitWidth)
39020       break;
39021 
39022     APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
39023 
39024     // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
39025     // single shift.  We can do this if the bottom bits (which are shifted
39026     // out) are never demanded.
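    // For example (illustrative): ((X >>u 2) << 5) can become (X << 3) when
    // the caller never demands the low 5 result bits, since those are the
    // only bits on which the two forms disagree.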
39027     if (Op0.getOpcode() == X86ISD::VSRLI &&
39028         OriginalDemandedBits.countTrailingZeros() >= ShAmt) {
39029       unsigned Shift2Amt = Op0.getConstantOperandVal(1);
39030       if (Shift2Amt < BitWidth) {
39031         int Diff = ShAmt - Shift2Amt;
39032         if (Diff == 0)
39033           return TLO.CombineTo(Op, Op0.getOperand(0));
39034 
39035         unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI;
39036         SDValue NewShift = TLO.DAG.getNode(
39037             NewOpc, SDLoc(Op), VT, Op0.getOperand(0),
39038             TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
39039         return TLO.CombineTo(Op, NewShift);
39040       }
39041     }
39042 
39043     // If we are only demanding sign bits then we can use the shift source directly.
39044     unsigned NumSignBits =
39045         TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1);
39046     unsigned UpperDemandedBits =
39047         BitWidth - OriginalDemandedBits.countTrailingZeros();
39048     if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
39049       return TLO.CombineTo(Op, Op0);
39050 
39051     if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
39052                              TLO, Depth + 1))
39053       return true;
39054 
39055     assert(!Known.hasConflict() && "Bits known to be one AND zero?");
39056     Known.Zero <<= ShAmt;
39057     Known.One <<= ShAmt;
39058 
39059     // Low bits known zero.
39060     Known.Zero.setLowBits(ShAmt);
39061     return false;
39062   }
39063   case X86ISD::VSRLI: {
39064     unsigned ShAmt = Op.getConstantOperandVal(1);
39065     if (ShAmt >= BitWidth)
39066       break;
39067 
39068     APInt DemandedMask = OriginalDemandedBits << ShAmt;
39069 
39070     if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask,
39071                              OriginalDemandedElts, Known, TLO, Depth + 1))
39072       return true;
39073 
39074     assert(!Known.hasConflict() && "Bits known to be one AND zero?");
39075     Known.Zero.lshrInPlace(ShAmt);
39076     Known.One.lshrInPlace(ShAmt);
39077 
39078     // High bits known zero.
39079     Known.Zero.setHighBits(ShAmt);
39080     return false;
39081   }
39082   case X86ISD::VSRAI: {
39083     SDValue Op0 = Op.getOperand(0);
39084     SDValue Op1 = Op.getOperand(1);
39085 
39086     unsigned ShAmt = cast<ConstantSDNode>(Op1)->getZExtValue();
39087     if (ShAmt >= BitWidth)
39088       break;
39089 
39090     APInt DemandedMask = OriginalDemandedBits << ShAmt;
39091 
39092     // If we just want the sign bit then we don't need to shift it.
39093     if (OriginalDemandedBits.isSignMask())
39094       return TLO.CombineTo(Op, Op0);
39095 
39096     // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
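    // For example (illustrative): with 32-bit elements and C1 == 24 this pair
    // is the usual sign_extend_inreg-from-i8 idiom; if X already has more
    // than 24 sign bits, the shift pair reproduces X exactly.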
39097     if (Op0.getOpcode() == X86ISD::VSHLI &&
39098         Op.getOperand(1) == Op0.getOperand(1)) {
39099       SDValue Op00 = Op0.getOperand(0);
39100       unsigned NumSignBits =
39101           TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts);
39102       if (ShAmt < NumSignBits)
39103         return TLO.CombineTo(Op, Op00);
39104     }
39105 
39106     // If any of the demanded bits are produced by the sign extension, we also
39107     // demand the input sign bit.
39108     if (OriginalDemandedBits.countLeadingZeros() < ShAmt)
39109       DemandedMask.setSignBit();
39110 
39111     if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
39112                              TLO, Depth + 1))
39113       return true;
39114 
39115     assert(!Known.hasConflict() && "Bits known to be one AND zero?");
39116     Known.Zero.lshrInPlace(ShAmt);
39117     Known.One.lshrInPlace(ShAmt);
39118 
39119     // If the input sign bit is known to be zero, or if none of the top bits
39120     // are demanded, turn this into an unsigned shift right.
39121     if (Known.Zero[BitWidth - ShAmt - 1] ||
39122         OriginalDemandedBits.countLeadingZeros() >= ShAmt)
39123       return TLO.CombineTo(
39124           Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1));
39125 
39126     // High bits are known one.
39127     if (Known.One[BitWidth - ShAmt - 1])
39128       Known.One.setHighBits(ShAmt);
39129     return false;
39130   }
39131   case X86ISD::PEXTRB:
39132   case X86ISD::PEXTRW: {
39133     SDValue Vec = Op.getOperand(0);
39134     auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
39135     MVT VecVT = Vec.getSimpleValueType();
39136     unsigned NumVecElts = VecVT.getVectorNumElements();
39137 
39138     if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) {
39139       unsigned Idx = CIdx->getZExtValue();
39140       unsigned VecBitWidth = VecVT.getScalarSizeInBits();
39141 
39142       // If we demand no bits from the vector then we must have demanded
39143       // bits from the implicit zext - simplify to zero.
39144       APInt DemandedVecBits = OriginalDemandedBits.trunc(VecBitWidth);
39145       if (DemandedVecBits == 0)
39146         return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
39147 
39148       APInt KnownUndef, KnownZero;
39149       APInt DemandedVecElts = APInt::getOneBitSet(NumVecElts, Idx);
39150       if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef,
39151                                      KnownZero, TLO, Depth + 1))
39152         return true;
39153 
39154       KnownBits KnownVec;
39155       if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts,
39156                                KnownVec, TLO, Depth + 1))
39157         return true;
39158 
39159       if (SDValue V = SimplifyMultipleUseDemandedBits(
39160               Vec, DemandedVecBits, DemandedVecElts, TLO.DAG, Depth + 1))
39161         return TLO.CombineTo(
39162             Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, V, Op.getOperand(1)));
39163 
39164       Known = KnownVec.zext(BitWidth);
39165       return false;
39166     }
39167     break;
39168   }
39169   case X86ISD::PINSRB:
39170   case X86ISD::PINSRW: {
39171     SDValue Vec = Op.getOperand(0);
39172     SDValue Scl = Op.getOperand(1);
39173     auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
39174     MVT VecVT = Vec.getSimpleValueType();
39175 
39176     if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {
39177       unsigned Idx = CIdx->getZExtValue();
39178       if (!OriginalDemandedElts[Idx])
39179         return TLO.CombineTo(Op, Vec);
39180 
39181       KnownBits KnownVec;
39182       APInt DemandedVecElts(OriginalDemandedElts);
39183       DemandedVecElts.clearBit(Idx);
39184       if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts,
39185                                KnownVec, TLO, Depth + 1))
39186         return true;
39187 
39188       KnownBits KnownScl;
39189       unsigned NumSclBits = Scl.getScalarValueSizeInBits();
39190       APInt DemandedSclBits = OriginalDemandedBits.zext(NumSclBits);
39191       if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))
39192         return true;
39193 
39194       KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits());
39195       Known = KnownBits::commonBits(KnownVec, KnownScl);
39196       return false;
39197     }
39198     break;
39199   }
39200   case X86ISD::PACKSS:
39201     // PACKSS saturates to MIN/MAX integer values. So if we just want the
39202     // sign bit then we can just ask for the source operands' sign bits.
39203     // TODO - add known bits handling.
39204     if (OriginalDemandedBits.isSignMask()) {
39205       APInt DemandedLHS, DemandedRHS;
39206       getPackDemandedElts(VT, OriginalDemandedElts, DemandedLHS, DemandedRHS);
39207 
39208       KnownBits KnownLHS, KnownRHS;
39209       APInt SignMask = APInt::getSignMask(BitWidth * 2);
39210       if (SimplifyDemandedBits(Op.getOperand(0), SignMask, DemandedLHS,
39211                                KnownLHS, TLO, Depth + 1))
39212         return true;
39213       if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS,
39214                                KnownRHS, TLO, Depth + 1))
39215         return true;
39216 
39217       // Attempt to avoid multi-use ops if we don't need anything from them.
39218       SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
39219           Op.getOperand(0), SignMask, DemandedLHS, TLO.DAG, Depth + 1);
39220       SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
39221           Op.getOperand(1), SignMask, DemandedRHS, TLO.DAG, Depth + 1);
39222       if (DemandedOp0 || DemandedOp1) {
39223         SDValue Op0 = DemandedOp0 ? DemandedOp0 : Op.getOperand(0);
39224         SDValue Op1 = DemandedOp1 ? DemandedOp1 : Op.getOperand(1);
39225         return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, Op0, Op1));
39226       }
39227     }
39228     // TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.
39229     break;
39230   case X86ISD::VBROADCAST: {
39231     SDValue Src = Op.getOperand(0);
39232     MVT SrcVT = Src.getSimpleValueType();
39233     APInt DemandedElts = APInt::getOneBitSet(
39234         SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1, 0);
39235     if (SimplifyDemandedBits(Src, OriginalDemandedBits, DemandedElts, Known,
39236                              TLO, Depth + 1))
39237       return true;
39238     // If we don't need the upper bits, attempt to narrow the broadcast source.
39239     // Don't attempt this on AVX512 as it might affect broadcast folding.
39240     // TODO: Should we attempt this for i32/i16 splats? They tend to be slower.
39241     if ((BitWidth == 64) && SrcVT.isScalarInteger() && !Subtarget.hasAVX512() &&
39242         OriginalDemandedBits.countLeadingZeros() >= (BitWidth / 2)) {
39243       MVT NewSrcVT = MVT::getIntegerVT(BitWidth / 2);
39244       SDValue NewSrc =
39245           TLO.DAG.getNode(ISD::TRUNCATE, SDLoc(Src), NewSrcVT, Src);
39246       MVT NewVT = MVT::getVectorVT(NewSrcVT, VT.getVectorNumElements() * 2);
39247       SDValue NewBcst =
39248           TLO.DAG.getNode(X86ISD::VBROADCAST, SDLoc(Op), NewVT, NewSrc);
39249       return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, NewBcst));
39250     }
39251     break;
39252   }
39253   case X86ISD::PCMPGT:
39254     // icmp sgt(0, R) == ashr(R, BitWidth-1).
39255     // iff we only need the sign bit then we can use R directly.
39256     if (OriginalDemandedBits.isSignMask() &&
39257         ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
39258       return TLO.CombineTo(Op, Op.getOperand(1));
39259     break;
39260   case X86ISD::MOVMSK: {
39261     SDValue Src = Op.getOperand(0);
39262     MVT SrcVT = Src.getSimpleValueType();
39263     unsigned SrcBits = SrcVT.getScalarSizeInBits();
39264     unsigned NumElts = SrcVT.getVectorNumElements();
39265 
39266     // If we don't need the sign bits at all just return zero.
39267     if (OriginalDemandedBits.countTrailingZeros() >= NumElts)
39268       return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
39269 
39270     // Only demand the vector elements of the sign bits we need.
39271     APInt KnownUndef, KnownZero;
39272     APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts);
39273     if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
39274                                    TLO, Depth + 1))
39275       return true;
39276 
39277     Known.Zero = KnownZero.zextOrSelf(BitWidth);
39278     Known.Zero.setHighBits(BitWidth - NumElts);
39279 
39280     // MOVMSK only uses the MSB from each vector element.
39281     KnownBits KnownSrc;
39282     APInt DemandedSrcBits = APInt::getSignMask(SrcBits);
39283     if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, KnownSrc, TLO,
39284                              Depth + 1))
39285       return true;
39286 
39287     if (KnownSrc.One[SrcBits - 1])
39288       Known.One.setLowBits(NumElts);
39289     else if (KnownSrc.Zero[SrcBits - 1])
39290       Known.Zero.setLowBits(NumElts);
39291 
39292     // Attempt to avoid multi-use ops if we don't need anything from them.
39293     if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
39294             Src, DemandedSrcBits, DemandedElts, TLO.DAG, Depth + 1))
39295       return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
39296     return false;
39297   }
39298   case X86ISD::BEXTR:
39299   case X86ISD::BEXTRI: {
39300     SDValue Op0 = Op.getOperand(0);
39301     SDValue Op1 = Op.getOperand(1);
39302 
39303     // Only bottom 16-bits of the control bits are required.
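    // The control encodes the start bit in bits [7:0] and the length in bits
    // [15:8]; e.g. (illustrative) a control value of 0x0804 extracts 8 bits
    // starting at bit 4.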
39304     if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
39305       // NOTE: SimplifyDemandedBits won't do this for constants.
39306       uint64_t Val1 = Cst1->getZExtValue();
39307       uint64_t MaskedVal1 = Val1 & 0xFFFF;
39308       if (Opc == X86ISD::BEXTR && MaskedVal1 != Val1) {
39309         SDLoc DL(Op);
39310         return TLO.CombineTo(
39311             Op, TLO.DAG.getNode(X86ISD::BEXTR, DL, VT, Op0,
39312                                 TLO.DAG.getConstant(MaskedVal1, DL, VT)));
39313       }
39314 
39315       unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
39316       unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
39317 
39318       // If the length is 0, the result is 0.
39319       if (Length == 0) {
39320         Known.setAllZero();
39321         return false;
39322       }
39323 
39324       if ((Shift + Length) <= BitWidth) {
39325         APInt DemandedMask = APInt::getBitsSet(BitWidth, Shift, Shift + Length);
39326         if (SimplifyDemandedBits(Op0, DemandedMask, Known, TLO, Depth + 1))
39327           return true;
39328 
39329         Known = Known.extractBits(Length, Shift);
39330         Known = Known.zextOrTrunc(BitWidth);
39331         return false;
39332       }
39333     } else {
39334       assert(Opc == X86ISD::BEXTR && "Unexpected opcode!");
39335       KnownBits Known1;
39336       APInt DemandedMask(APInt::getLowBitsSet(BitWidth, 16));
39337       if (SimplifyDemandedBits(Op1, DemandedMask, Known1, TLO, Depth + 1))
39338         return true;
39339 
39340       // If the length is 0, replace with 0.
39341       KnownBits LengthBits = Known1.extractBits(8, 8);
39342       if (LengthBits.isZero())
39343         return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
39344     }
39345 
39346     break;
39347   }
39348   case X86ISD::PDEP: {
39349     SDValue Op0 = Op.getOperand(0);
39350     SDValue Op1 = Op.getOperand(1);
39351 
39352     unsigned DemandedBitsLZ = OriginalDemandedBits.countLeadingZeros();
39353     APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
39354 
39355     // If the demanded bits have leading zeroes, we don't demand those from the
39356     // mask.
39357     if (SimplifyDemandedBits(Op1, LoMask, Known, TLO, Depth + 1))
39358       return true;
39359 
39360     // The number of possible 1s in the mask determines the number of LSBs of
39361     // operand 0 used. Undemanded bits from the mask don't matter so filter
39362     // them before counting.
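    // For example (illustrative): with a mask of 0b1010, PDEP deposits only
    // the low 2 bits of Op0 (into bit positions 1 and 3), so only those LSBs
    // of Op0 are demanded.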
39363     KnownBits Known2;
39364     uint64_t Count = (~Known.Zero & LoMask).countPopulation();
39365     APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Count));
39366     if (SimplifyDemandedBits(Op0, DemandedMask, Known2, TLO, Depth + 1))
39367       return true;
39368 
39369     // Zeroes are retained from the mask, but not ones.
39370     Known.One.clearAllBits();
39371     // The result will have at least as many trailing zeros as the non-mask
39372     // operand since bits can only map to the same or higher bit position.
39373     Known.Zero.setLowBits(Known2.countMinTrailingZeros());
39374     return false;
39375   }
39376   }
39377 
39378   return TargetLowering::SimplifyDemandedBitsForTargetNode(
39379       Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
39380 }
39381 
39382 SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
39383     SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
39384     SelectionDAG &DAG, unsigned Depth) const {
39385   int NumElts = DemandedElts.getBitWidth();
39386   unsigned Opc = Op.getOpcode();
39387   EVT VT = Op.getValueType();
39388 
39389   switch (Opc) {
39390   case X86ISD::PINSRB:
39391   case X86ISD::PINSRW: {
39392     // If we don't demand the inserted element, return the base vector.
39393     SDValue Vec = Op.getOperand(0);
39394     auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
39395     MVT VecVT = Vec.getSimpleValueType();
39396     if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) &&
39397         !DemandedElts[CIdx->getZExtValue()])
39398       return Vec;
39399     break;
39400   }
39401   case X86ISD::VSHLI: {
39402     // If we are only demanding sign bits then we can use the shift source
39403     // directly.
39404     SDValue Op0 = Op.getOperand(0);
39405     unsigned ShAmt = Op.getConstantOperandVal(1);
39406     unsigned BitWidth = DemandedBits.getBitWidth();
39407     unsigned NumSignBits = DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
39408     unsigned UpperDemandedBits = BitWidth - DemandedBits.countTrailingZeros();
39409     if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
39410       return Op0;
39411     break;
39412   }
39413   case X86ISD::VSRAI:
39414     // iff we only need the sign bit then we can use the source directly.
39415     // TODO: generalize where we only demand extended signbits.
39416     if (DemandedBits.isSignMask())
39417       return Op.getOperand(0);
39418     break;
39419   case X86ISD::PCMPGT:
39420     // icmp sgt(0, R) == ashr(R, BitWidth-1).
39421     // iff we only need the sign bit then we can use R directly.
39422     if (DemandedBits.isSignMask() &&
39423         ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
39424       return Op.getOperand(1);
39425     break;
39426   }
39427 
39428   APInt ShuffleUndef, ShuffleZero;
39429   SmallVector<int, 16> ShuffleMask;
39430   SmallVector<SDValue, 2> ShuffleOps;
39431   if (getTargetShuffleInputs(Op, DemandedElts, ShuffleOps, ShuffleMask,
39432                              ShuffleUndef, ShuffleZero, DAG, Depth, false)) {
39433     // If all the demanded elts are from one operand and are inline,
39434     // then we can use the operand directly.
39435     int NumOps = ShuffleOps.size();
39436     if (ShuffleMask.size() == (unsigned)NumElts &&
39437         llvm::all_of(ShuffleOps, [VT](SDValue V) {
39438           return VT.getSizeInBits() == V.getValueSizeInBits();
39439         })) {
39440 
39441       if (DemandedElts.isSubsetOf(ShuffleUndef))
39442         return DAG.getUNDEF(VT);
39443       if (DemandedElts.isSubsetOf(ShuffleUndef | ShuffleZero))
39444         return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(Op));
39445 
39446       // Bitmask that indicates which ops have only been accessed 'inline'.
39447       APInt IdentityOp = APInt::getAllOnesValue(NumOps);
39448       for (int i = 0; i != NumElts; ++i) {
39449         int M = ShuffleMask[i];
39450         if (!DemandedElts[i] || ShuffleUndef[i])
39451           continue;
39452         int OpIdx = M / NumElts;
39453         int EltIdx = M % NumElts;
39454         if (M < 0 || EltIdx != i) {
39455           IdentityOp.clearAllBits();
39456           break;
39457         }
39458         IdentityOp &= APInt::getOneBitSet(NumOps, OpIdx);
39459         if (IdentityOp == 0)
39460           break;
39461       }
39462       assert((IdentityOp == 0 || IdentityOp.countPopulation() == 1) &&
39463              "Multiple identity shuffles detected");
39464 
39465       if (IdentityOp != 0)
39466         return DAG.getBitcast(VT, ShuffleOps[IdentityOp.countTrailingZeros()]);
39467     }
39468   }
39469 
39470   return TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
39471       Op, DemandedBits, DemandedElts, DAG, Depth);
39472 }
39473 
39474 // Helper to peek through bitops/trunc/setcc to determine size of source vector.
39475 // Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
39476 static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size,
39477                                       bool AllowTruncate) {
39478   switch (Src.getOpcode()) {
39479   case ISD::TRUNCATE:
39480     if (!AllowTruncate)
39481       return false;
39482     LLVM_FALLTHROUGH;
39483   case ISD::SETCC:
39484     return Src.getOperand(0).getValueSizeInBits() == Size;
39485   case ISD::AND:
39486   case ISD::XOR:
39487   case ISD::OR:
39488     return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate) &&
39489            checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate);
39490   }
39491   return false;
39492 }
39493 
39494 // Helper to flip between AND/OR/XOR opcodes and their X86ISD FP equivalents.
39495 static unsigned getAltBitOpcode(unsigned Opcode) {
39496   switch(Opcode) {
39497   case ISD::AND: return X86ISD::FAND;
39498   case ISD::OR: return X86ISD::FOR;
39499   case ISD::XOR: return X86ISD::FXOR;
39500   case X86ISD::ANDNP: return X86ISD::FANDN;
39501   }
39502   llvm_unreachable("Unknown bitwise opcode");
39503 }
39504 
39505 // Helper to adjust v4i32 MOVMSK expansion to work with SSE1-only targets.
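// E.g. (v4i1 setcc (v4i32 load X), zero, setlt) only needs the sign bits, so
// on SSE1 the loaded value can be bitcast to v4f32 and fed to MOVMSKPS.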
39506 static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src,
39507                                           const SDLoc &DL) {
39508   EVT SrcVT = Src.getValueType();
39509   if (SrcVT != MVT::v4i1)
39510     return SDValue();
39511 
39512   switch (Src.getOpcode()) {
39513   case ISD::SETCC:
39514     if (Src.getOperand(0).getValueType() == MVT::v4i32 &&
39515         ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()) &&
39516         cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT) {
39517       SDValue Op0 = Src.getOperand(0);
39518       if (ISD::isNormalLoad(Op0.getNode()))
39519         return DAG.getBitcast(MVT::v4f32, Op0);
39520       if (Op0.getOpcode() == ISD::BITCAST &&
39521           Op0.getOperand(0).getValueType() == MVT::v4f32)
39522         return Op0.getOperand(0);
39523     }
39524     break;
39525   case ISD::AND:
39526   case ISD::XOR:
39527   case ISD::OR: {
39528     SDValue Op0 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(0), DL);
39529     SDValue Op1 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(1), DL);
39530     if (Op0 && Op1)
39531       return DAG.getNode(getAltBitOpcode(Src.getOpcode()), DL, MVT::v4f32, Op0,
39532                          Op1);
39533     break;
39534   }
39535   }
39536   return SDValue();
39537 }
39538 
39539 // Helper to push sign extension of vXi1 SETCC result through bitops.
39540 static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,
39541                                           SDValue Src, const SDLoc &DL) {
39542   switch (Src.getOpcode()) {
39543   case ISD::SETCC:
39544   case ISD::TRUNCATE:
39545     return DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
39546   case ISD::AND:
39547   case ISD::XOR:
39548   case ISD::OR:
39549     return DAG.getNode(
39550         Src.getOpcode(), DL, SExtVT,
39551         signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(0), DL),
39552         signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL));
39553   }
39554   llvm_unreachable("Unexpected node type for vXi1 sign extension");
39555 }
39556 
39557 // Try to match patterns such as
39558 // (i16 bitcast (v16i1 x))
39559 // ->
39560 // (i16 movmsk (16i8 sext (v16i1 x)))
39561 // before the illegal vector is scalarized on subtargets that don't have legal
39562 // vxi1 types.
39563 static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
39564                                   const SDLoc &DL,
39565                                   const X86Subtarget &Subtarget) {
39566   EVT SrcVT = Src.getValueType();
39567   if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1)
39568     return SDValue();
39569 
39570   // Recognize the IR pattern for the movmsk intrinsic under SSE1 before type
39571   // legalization destroys the v4i32 type.
39572   if (Subtarget.hasSSE1() && !Subtarget.hasSSE2()) {
39573     if (SDValue V = adjustBitcastSrcVectorSSE1(DAG, Src, DL)) {
39574       V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32,
39575                       DAG.getBitcast(MVT::v4f32, V));
39576       return DAG.getZExtOrTrunc(V, DL, VT);
39577     }
39578   }
39579 
39580   // If the input is a truncate from v16i8 or v32i8 go ahead and use a
39581   // movmskb even with avx512. This will be better than truncating to vXi1 and
39582   // using a kmov. This can especially help KNL if the input is a v16i8/v32i8
39583   // vpcmpeqb/vpcmpgtb.
39584   bool PreferMovMsk = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
39585                       (Src.getOperand(0).getValueType() == MVT::v16i8 ||
39586                        Src.getOperand(0).getValueType() == MVT::v32i8 ||
39587                        Src.getOperand(0).getValueType() == MVT::v64i8);
39588 
39589   // Prefer movmsk for AVX512 for (bitcast (setlt X, 0)) which can be handled
39590   // directly with vpmovmskb/vmovmskps/vmovmskpd.
39591   if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse() &&
39592       cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT &&
39593       ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) {
39594     EVT CmpVT = Src.getOperand(0).getValueType();
39595     EVT EltVT = CmpVT.getVectorElementType();
39596     if (CmpVT.getSizeInBits() <= 256 &&
39597         (EltVT == MVT::i8 || EltVT == MVT::i32 || EltVT == MVT::i64))
39598       PreferMovMsk = true;
39599   }
39600 
39601   // With AVX512 vxi1 types are legal and we prefer using k-regs.
39602   // MOVMSK is supported in SSE2 or later.
39603   if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !PreferMovMsk))
39604     return SDValue();
39605 
39606   // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
39607   // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
39608   // v8i16 and v16i16.
39609   // For these two cases, we can shuffle the upper element bytes to a
39610   // consecutive sequence at the start of the vector and treat the results as
39611   // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
39612   // for v16i16 this is not the case, because the shuffle is expensive, so we
39613   // avoid sign-extending to this type entirely.
39614   // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
39615   // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
39616   MVT SExtVT;
39617   bool PropagateSExt = false;
39618   switch (SrcVT.getSimpleVT().SimpleTy) {
39619   default:
39620     return SDValue();
39621   case MVT::v2i1:
39622     SExtVT = MVT::v2i64;
39623     break;
39624   case MVT::v4i1:
39625     SExtVT = MVT::v4i32;
39626     // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
39627     // sign-extend to a 256-bit operation to avoid truncation.
39628     if (Subtarget.hasAVX() &&
39629         checkBitcastSrcVectorSize(Src, 256, Subtarget.hasAVX2())) {
39630       SExtVT = MVT::v4i64;
39631       PropagateSExt = true;
39632     }
39633     break;
39634   case MVT::v8i1:
39635     SExtVT = MVT::v8i16;
39636     // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
39637     // sign-extend to a 256-bit operation to match the compare.
39638     // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
39639     // 256-bit because the shuffle is cheaper than sign extending the result of
39640     // the compare.
39641     if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256, true) ||
39642                                checkBitcastSrcVectorSize(Src, 512, true))) {
39643       SExtVT = MVT::v8i32;
39644       PropagateSExt = true;
39645     }
39646     break;
39647   case MVT::v16i1:
39648     SExtVT = MVT::v16i8;
39649     // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
39650     // it is not profitable to sign-extend to 256-bit because this will
39651     // require an extra cross-lane shuffle which is more expensive than
39652     // truncating the result of the compare to 128-bits.
39653     break;
39654   case MVT::v32i1:
39655     SExtVT = MVT::v32i8;
39656     break;
39657   case MVT::v64i1:
39658     // If we have AVX512F but not AVX512BW, and the input was truncated from
39659     // v64i8 (checked earlier), then split the input and make two pmovmskbs.
39660     if (Subtarget.hasAVX512()) {
39661       if (Subtarget.hasBWI())
39662         return SDValue();
39663       SExtVT = MVT::v64i8;
39664       break;
39665     }
39666     // Split if this is a <64 x i8> comparison result.
39667     if (checkBitcastSrcVectorSize(Src, 512, false)) {
39668       SExtVT = MVT::v64i8;
39669       break;
39670     }
39671     return SDValue();
39672   }
39673 
39674   SDValue V = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL)
39675                             : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
39676 
39677   if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8 || SExtVT == MVT::v64i8) {
39678     V = getPMOVMSKB(DL, V, DAG, Subtarget);
39679   } else {
39680     if (SExtVT == MVT::v8i16)
39681       V = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, V,
39682                       DAG.getUNDEF(MVT::v8i16));
39683     V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
39684   }
39685 
39686   EVT IntVT =
39687       EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());
39688   V = DAG.getZExtOrTrunc(V, DL, IntVT);
39689   return DAG.getBitcast(VT, V);
39690 }
39691 
39692 // Convert a vXi1 constant build vector to the same width scalar integer.
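// E.g. <i1 1, i1 0, i1 1, i1 1> becomes the i4 constant 0b1101 (element 0
// maps to bit 0).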
39693 static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
39694   EVT SrcVT = Op.getValueType();
39695   assert(SrcVT.getVectorElementType() == MVT::i1 &&
39696          "Expected a vXi1 vector");
39697   assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
39698          "Expected a constant build vector");
39699 
39700   APInt Imm(SrcVT.getVectorNumElements(), 0);
39701   for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {
39702     SDValue In = Op.getOperand(Idx);
39703     if (!In.isUndef() && (cast<ConstantSDNode>(In)->getZExtValue() & 0x1))
39704       Imm.setBit(Idx);
39705   }
39706   EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth());
39707   return DAG.getConstant(Imm, SDLoc(Op), IntVT);
39708 }
39709 
39710 static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
39711                                            TargetLowering::DAGCombinerInfo &DCI,
39712                                            const X86Subtarget &Subtarget) {
39713   assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");
39714 
39715   if (!DCI.isBeforeLegalizeOps())
39716     return SDValue();
39717 
39718   // Only do this if we have k-registers.
39719   if (!Subtarget.hasAVX512())
39720     return SDValue();
39721 
39722   EVT DstVT = N->getValueType(0);
39723   SDValue Op = N->getOperand(0);
39724   EVT SrcVT = Op.getValueType();
39725 
39726   if (!Op.hasOneUse())
39727     return SDValue();
39728 
39729   // Look for logic ops.
39730   if (Op.getOpcode() != ISD::AND &&
39731       Op.getOpcode() != ISD::OR &&
39732       Op.getOpcode() != ISD::XOR)
39733     return SDValue();
39734 
39735   // Make sure we have a bitcast between mask registers and a scalar type.
39736   if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
39737         DstVT.isScalarInteger()) &&
39738       !(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 &&
39739         SrcVT.isScalarInteger()))
39740     return SDValue();
39741 
39742   SDValue LHS = Op.getOperand(0);
39743   SDValue RHS = Op.getOperand(1);
39744 
39745   if (LHS.hasOneUse() && LHS.getOpcode() == ISD::BITCAST &&
39746       LHS.getOperand(0).getValueType() == DstVT)
39747     return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT, LHS.getOperand(0),
39748                        DAG.getBitcast(DstVT, RHS));
39749 
39750   if (RHS.hasOneUse() && RHS.getOpcode() == ISD::BITCAST &&
39751       RHS.getOperand(0).getValueType() == DstVT)
39752     return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
39753                        DAG.getBitcast(DstVT, LHS), RHS.getOperand(0));
39754 
39755   // If the RHS is a vXi1 build vector, this is a good reason to flip too.
39756   // Most of these have to move a constant from the scalar domain anyway.
39757   if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) {
39758     RHS = combinevXi1ConstantToInteger(RHS, DAG);
39759     return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
39760                        DAG.getBitcast(DstVT, LHS), RHS);
39761   }
39762 
39763   return SDValue();
39764 }
39765 
39766 static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
39767                                     const X86Subtarget &Subtarget) {
39768   SDLoc DL(BV);
39769   unsigned NumElts = BV->getNumOperands();
39770   SDValue Splat = BV->getSplatValue();
39771 
39772   // Build MMX element from integer GPR or SSE float values.
39773   auto CreateMMXElement = [&](SDValue V) {
39774     if (V.isUndef())
39775       return DAG.getUNDEF(MVT::x86mmx);
39776     if (V.getValueType().isFloatingPoint()) {
39777       if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(V)) {
39778         V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V);
39779         V = DAG.getBitcast(MVT::v2i64, V);
39780         return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V);
39781       }
39782       V = DAG.getBitcast(MVT::i32, V);
39783     } else {
39784       V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32);
39785     }
39786     return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V);
39787   };
39788 
39789   // Convert build vector ops to MMX data in the bottom elements.
39790   SmallVector<SDValue, 8> Ops;
39791 
39792   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
39793 
39794   // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
39795   if (Splat) {
39796     if (Splat.isUndef())
39797       return DAG.getUNDEF(MVT::x86mmx);
39798 
39799     Splat = CreateMMXElement(Splat);
39800 
39801     if (Subtarget.hasSSE1()) {
39802       // Unpack v8i8 to splat i8 elements to lowest 16-bits.
39803       if (NumElts == 8)
39804         Splat = DAG.getNode(
39805             ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
39806             DAG.getTargetConstant(Intrinsic::x86_mmx_punpcklbw, DL,
39807                                   TLI.getPointerTy(DAG.getDataLayout())),
39808             Splat, Splat);
39809 
39810       // Use PSHUFW to repeat 16-bit elements.
39811       unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
39812       return DAG.getNode(
39813           ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
39814           DAG.getTargetConstant(Intrinsic::x86_sse_pshuf_w, DL,
39815                                 TLI.getPointerTy(DAG.getDataLayout())),
39816           Splat, DAG.getTargetConstant(ShufMask, DL, MVT::i8));
39817     }
39818     Ops.append(NumElts, Splat);
39819   } else {
39820     for (unsigned i = 0; i != NumElts; ++i)
39821       Ops.push_back(CreateMMXElement(BV->getOperand(i)));
39822   }
39823 
39824   // Use tree of PUNPCKLs to build up general MMX vector.
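  // E.g. with 8 x i8 elements: punpcklbw combines pairs (8 -> 4 values), then
  // punpcklwd (4 -> 2), then punpckldq (2 -> 1).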
39825   while (Ops.size() > 1) {
39826     unsigned NumOps = Ops.size();
39827     unsigned IntrinOp =
39828         (NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
39829                      : (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
39830                                     : Intrinsic::x86_mmx_punpcklbw));
39831     SDValue Intrin = DAG.getTargetConstant(
39832         IntrinOp, DL, TLI.getPointerTy(DAG.getDataLayout()));
39833     for (unsigned i = 0; i != NumOps; i += 2)
39834       Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
39835                                Ops[i], Ops[i + 1]);
39836     Ops.resize(NumOps / 2);
39837   }
39838 
39839   return Ops[0];
39840 }
39841 
39842 // Recursive function that attempts to determine whether a bool vector node
39843 // was originally a vector/float/double that got truncated/extended/bitcast
39844 // to/from a scalar integer. If so, replace the scalar ops with bool vector
39845 // equivalents back down the chain.
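// E.g. (v8i1 bitcast (i8 trunc (i16 bitcast (v16i1 X)))) can be rewritten as
// (v8i1 extract_subvector (v16i1 X), 0), avoiding the scalar round trip.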
39846 static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL,
39847                                           SelectionDAG &DAG,
39848                                           const X86Subtarget &Subtarget) {
39849   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
39850   unsigned Opc = V.getOpcode();
39851   switch (Opc) {
39852   case ISD::BITCAST: {
39853     // Bitcast from a vector/float/double, we can cheaply bitcast to VT.
39854     SDValue Src = V.getOperand(0);
39855     EVT SrcVT = Src.getValueType();
39856     if (SrcVT.isVector() || SrcVT.isFloatingPoint())
39857       return DAG.getBitcast(VT, Src);
39858     break;
39859   }
39860   case ISD::TRUNCATE: {
39861     // If we find a suitable source, a truncated scalar becomes a subvector.
39862     SDValue Src = V.getOperand(0);
39863     EVT NewSrcVT =
39864         EVT::getVectorVT(*DAG.getContext(), MVT::i1, Src.getValueSizeInBits());
39865     if (TLI.isTypeLegal(NewSrcVT))
39866       if (SDValue N0 =
39867               combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
39868         return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N0,
39869                            DAG.getIntPtrConstant(0, DL));
39870     break;
39871   }
39872   case ISD::ANY_EXTEND:
39873   case ISD::ZERO_EXTEND: {
39874     // If we find a suitable source, an extended scalar becomes a subvector.
39875     SDValue Src = V.getOperand(0);
39876     EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
39877                                     Src.getScalarValueSizeInBits());
39878     if (TLI.isTypeLegal(NewSrcVT))
39879       if (SDValue N0 =
39880               combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
39881         return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
39882                            Opc == ISD::ANY_EXTEND ? DAG.getUNDEF(VT)
39883                                                   : DAG.getConstant(0, DL, VT),
39884                            N0, DAG.getIntPtrConstant(0, DL));
39885     break;
39886   }
39887   case ISD::OR: {
39888     // If we find suitable sources, we can just move an OR to the vector domain.
39889     SDValue Src0 = V.getOperand(0);
39890     SDValue Src1 = V.getOperand(1);
39891     if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
39892       if (SDValue N1 = combineBitcastToBoolVector(VT, Src1, DL, DAG, Subtarget))
39893         return DAG.getNode(Opc, DL, VT, N0, N1);
39894     break;
39895   }
39896   case ISD::SHL: {
39897     // If we find a suitable source, a SHL becomes a KSHIFTL.
39898     SDValue Src0 = V.getOperand(0);
39899     if ((VT == MVT::v8i1 && !Subtarget.hasDQI()) ||
39900         ((VT == MVT::v32i1 || VT == MVT::v64i1) && !Subtarget.hasBWI()))
39901       break;
39902 
39903     if (auto *Amt = dyn_cast<ConstantSDNode>(V.getOperand(1)))
39904       if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
39905         return DAG.getNode(
39906             X86ISD::KSHIFTL, DL, VT, N0,
39907             DAG.getTargetConstant(Amt->getZExtValue(), DL, MVT::i8));
39908     break;
39909   }
39910   }
39911   return SDValue();
39912 }
39913 
39914 static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
39915                               TargetLowering::DAGCombinerInfo &DCI,
39916                               const X86Subtarget &Subtarget) {
39917   SDValue N0 = N->getOperand(0);
39918   EVT VT = N->getValueType(0);
39919   EVT SrcVT = N0.getValueType();
39920   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
39921 
39922   // Try to match patterns such as
39923   // (i16 bitcast (v16i1 x))
39924   // ->
39925   // (i16 movmsk (16i8 sext (v16i1 x)))
39926   // before the setcc result is scalarized on subtargets that don't have legal
39927   // vxi1 types.
39928   if (DCI.isBeforeLegalize()) {
39929     SDLoc dl(N);
39930     if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
39931       return V;
39932 
39933     // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
39934     // type, widen both sides to avoid a trip through memory.
39935     if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
39936         Subtarget.hasAVX512()) {
39937       N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
39938       N0 = DAG.getBitcast(MVT::v8i1, N0);
39939       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
39940                          DAG.getIntPtrConstant(0, dl));
39941     }
39942 
39943     // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
39944     // type, widen both sides to avoid a trip through memory.
39945     if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
39946         Subtarget.hasAVX512()) {
39947       // Use zeros for the widening if we already have some zeroes. This can
39948       // allow SimplifyDemandedBits to remove scalar ANDs that may be down
39949       // stream of this.
39950       // FIXME: It might make sense to detect a concat_vectors with a mix of
39951       // zeroes and undef and turn it into insert_subvector for i1 vectors as
39952       // a separate combine. What we can't do is canonicalize the operands of
39953       // such a concat or we'll get into a loop with SimplifyDemandedBits.
39954       if (N0.getOpcode() == ISD::CONCAT_VECTORS) {
39955         SDValue LastOp = N0.getOperand(N0.getNumOperands() - 1);
39956         if (ISD::isBuildVectorAllZeros(LastOp.getNode())) {
39957           SrcVT = LastOp.getValueType();
39958           unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
39959           SmallVector<SDValue, 4> Ops(N0->op_begin(), N0->op_end());
39960           Ops.resize(NumConcats, DAG.getConstant(0, dl, SrcVT));
39961           N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
39962           N0 = DAG.getBitcast(MVT::i8, N0);
39963           return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
39964         }
39965       }
39966 
39967       unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
39968       SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
39969       Ops[0] = N0;
39970       N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
39971       N0 = DAG.getBitcast(MVT::i8, N0);
39972       return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
39973     }
39974   } else {
39975     // If we're bitcasting from iX to vXi1, see if the integer originally
39976     // began as a vXi1 and whether we can remove the bitcast entirely.
39977     if (VT.isVector() && VT.getScalarType() == MVT::i1 &&
39978         SrcVT.isScalarInteger() && TLI.isTypeLegal(VT)) {
39979       if (SDValue V =
39980               combineBitcastToBoolVector(VT, N0, SDLoc(N), DAG, Subtarget))
39981         return V;
39982     }
39983   }
39984 
39985   // Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and
39986   // replace with (i8 (trunc (i16 (bitcast (v16i1 X))))). This can occur
39987   // due to insert_subvector legalization on KNL. By promoting the copy to i16
39988   // we can help with known bits propagation from the vXi1 domain to the
39989   // scalar domain.
39990   if (VT == MVT::i8 && SrcVT == MVT::v8i1 && Subtarget.hasAVX512() &&
39991       !Subtarget.hasDQI() && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
39992       N0.getOperand(0).getValueType() == MVT::v16i1 &&
39993       isNullConstant(N0.getOperand(1)))
39994     return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
39995                        DAG.getBitcast(MVT::i16, N0.getOperand(0)));
39996 
39997   // Canonicalize (bitcast (vbroadcast_load)) so that the output of the bitcast
39998   // and the vbroadcast_load are both integer or both fp. In some cases this
39999   // will remove the bitcast entirely.
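  // E.g. (v2f64 bitcast (v2i64 vbroadcast_load [i64 mem])) becomes
  // (v2f64 vbroadcast_load [f64 mem]) and the bitcast folds away.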
40000   if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() &&
40001        VT.isFloatingPoint() != SrcVT.isFloatingPoint() && VT.isVector()) {
40002     auto *BCast = cast<MemIntrinsicSDNode>(N0);
40003     unsigned SrcVTSize = SrcVT.getScalarSizeInBits();
40004     unsigned MemSize = BCast->getMemoryVT().getScalarSizeInBits();
40005     // Don't swap i8/i16 since we don't have fp types of that size.
40006     if (MemSize >= 32) {
40007       MVT MemVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(MemSize)
40008                                        : MVT::getIntegerVT(MemSize);
40009       MVT LoadVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(SrcVTSize)
40010                                         : MVT::getIntegerVT(SrcVTSize);
40011       LoadVT = MVT::getVectorVT(LoadVT, SrcVT.getVectorNumElements());
40012 
40013       SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
40014       SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
40015       SDValue ResNode =
40016           DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
40017                                   MemVT, BCast->getMemOperand());
40018       DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1));
40019       return DAG.getBitcast(VT, ResNode);
40020     }
40021   }
40022 
40023   // Since MMX types are special and don't usually play with other vector types,
40024   // it's better to handle them early to be sure we emit efficient code by
40025   // avoiding store-load conversions.
40026   if (VT == MVT::x86mmx) {
40027     // Detect MMX constant vectors.
40028     APInt UndefElts;
40029     SmallVector<APInt, 1> EltBits;
40030     if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits)) {
40031       SDLoc DL(N0);
40032       // Handle zero-extension of i32 with MOVD.
40033       if (EltBits[0].countLeadingZeros() >= 32)
40034         return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
40035                            DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32));
40036       // Else, bitcast to a double.
40037       // TODO - investigate supporting sext 32-bit immediates on x86_64.
40038       APFloat F64(APFloat::IEEEdouble(), EltBits[0]);
40039       return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64));
40040     }
40041 
40042     // Detect bitcasts to x86mmx low word.
40043     if (N0.getOpcode() == ISD::BUILD_VECTOR &&
40044         (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) &&
40045         N0.getOperand(0).getValueType() == SrcVT.getScalarType()) {
40046       bool LowUndef = true, AllUndefOrZero = true;
40047       for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
40048         SDValue Op = N0.getOperand(i);
40049         LowUndef &= Op.isUndef() || (i >= e/2);
40050         AllUndefOrZero &= (Op.isUndef() || isNullConstant(Op));
40051       }
40052       if (AllUndefOrZero) {
40053         SDValue N00 = N0.getOperand(0);
40054         SDLoc dl(N00);
40055         N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32)
40056                        : DAG.getZExtOrTrunc(N00, dl, MVT::i32);
40057         return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00);
40058       }
40059     }
40060 
40061     // Detect bitcasts of 64-bit build vectors and convert to a
40062     // MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the
40063     // lowest element.
40064     if (N0.getOpcode() == ISD::BUILD_VECTOR &&
40065         (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
40066          SrcVT == MVT::v8i8))
40067       return createMMXBuildVector(cast<BuildVectorSDNode>(N0), DAG, Subtarget);
40068 
40069     // Detect bitcasts between element or subvector extraction to x86mmx.
40070     if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
40071          N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
40072         isNullConstant(N0.getOperand(1))) {
40073       SDValue N00 = N0.getOperand(0);
40074       if (N00.getValueType().is128BitVector())
40075         return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
40076                            DAG.getBitcast(MVT::v2i64, N00));
40077     }
40078 
40079     // Detect bitcasts from FP_TO_SINT to x86mmx.
40080     if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
40081       SDLoc DL(N0);
40082       SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
40083                                 DAG.getUNDEF(MVT::v2i32));
40084       return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
40085                          DAG.getBitcast(MVT::v2i64, Res));
40086     }
40087   }
40088 
40089   // Try to remove a bitcast of constant vXi1 vector. We have to legalize
40090   // most of these to scalar anyway.
40091   if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
40092       SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
40093       ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
40094     return combinevXi1ConstantToInteger(N0, DAG);
40095   }
40096 
40097   if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
40098       VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
40099       isa<ConstantSDNode>(N0)) {
40100     auto *C = cast<ConstantSDNode>(N0);
40101     if (C->isAllOnesValue())
40102       return DAG.getConstant(1, SDLoc(N0), VT);
40103     if (C->isNullValue())
40104       return DAG.getConstant(0, SDLoc(N0), VT);
40105   }
40106 
40107   // Look for MOVMSK that is maybe truncated and then bitcasted to vXi1.
40108   // Turn it into a sign bit compare that produces a k-register. This avoids
40109   // a trip through a GPR.
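  // E.g. (v4i1 bitcast (i4 trunc (i32 movmsk (v4f32 X)))) becomes
  // (v4i1 setcc (v4i32 bitcast X), zero, setlt).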
40110   if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
40111       VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
40112       isPowerOf2_32(VT.getVectorNumElements())) {
40113     unsigned NumElts = VT.getVectorNumElements();
40114     SDValue Src = N0;
40115 
40116     // Peek through truncate.
40117     if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
40118       Src = N0.getOperand(0);
40119 
40120     if (Src.getOpcode() == X86ISD::MOVMSK && Src.hasOneUse()) {
40121       SDValue MovmskIn = Src.getOperand(0);
40122       MVT MovmskVT = MovmskIn.getSimpleValueType();
40123       unsigned MovMskElts = MovmskVT.getVectorNumElements();
40124 
40125       // We allow extra bits of the movmsk to be used since they are known zero.
40126       // We can't convert a VPMOVMSKB without avx512bw.
40127       if (MovMskElts <= NumElts &&
40128           (Subtarget.hasBWI() || MovmskVT.getVectorElementType() != MVT::i8)) {
40129         EVT IntVT = EVT(MovmskVT).changeVectorElementTypeToInteger();
40130         MovmskIn = DAG.getBitcast(IntVT, MovmskIn);
40131         SDLoc dl(N);
40132         MVT CmpVT = MVT::getVectorVT(MVT::i1, MovMskElts);
40133         SDValue Cmp = DAG.getSetCC(dl, CmpVT, MovmskIn,
40134                                    DAG.getConstant(0, dl, IntVT), ISD::SETLT);
40135         if (EVT(CmpVT) == VT)
40136           return Cmp;
40137 
40138         // Pad with zeroes up to original VT to replace the zeroes that were
40139         // being used from the MOVMSK.
40140         unsigned NumConcats = NumElts / MovMskElts;
40141         SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, CmpVT));
40142         Ops[0] = Cmp;
40143         return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Ops);
40144       }
40145     }
40146   }
40147 
40148   // Try to remove bitcasts from input and output of mask arithmetic to
40149   // remove GPR<->K-register crossings.
40150   if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
40151     return V;
40152 
40153   // Convert a bitcasted integer logic operation that has one bitcasted
40154   // floating-point operand into a floating-point logic operation. This may
40155   // create a load of a constant, but that is cheaper than materializing the
40156   // constant in an integer register and transferring it to an SSE register or
40157   // transferring the SSE operand to integer register and back.
40158   unsigned FPOpcode;
40159   switch (N0.getOpcode()) {
40160     case ISD::AND: FPOpcode = X86ISD::FAND; break;
40161     case ISD::OR:  FPOpcode = X86ISD::FOR;  break;
40162     case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
40163     default: return SDValue();
40164   }
40165 
40166   // Check if we have a bitcast from another integer type as well.
40167   if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
40168         (Subtarget.hasSSE2() && VT == MVT::f64) ||
40169         (Subtarget.hasSSE2() && VT.isInteger() && VT.isVector() &&
40170          TLI.isTypeLegal(VT))))
40171     return SDValue();
40172 
40173   SDValue LogicOp0 = N0.getOperand(0);
40174   SDValue LogicOp1 = N0.getOperand(1);
40175   SDLoc DL0(N0);
40176 
40177   // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
40178   if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
40179       LogicOp0.hasOneUse() && LogicOp0.getOperand(0).hasOneUse() &&
40180       LogicOp0.getOperand(0).getValueType() == VT &&
40181       !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
40182     SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
40183     unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
40184     return DAG.getNode(Opcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
40185   }
40186   // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
40187   if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
40188       LogicOp1.hasOneUse() && LogicOp1.getOperand(0).hasOneUse() &&
40189       LogicOp1.getOperand(0).getValueType() == VT &&
40190       !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
40191     SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
40192     unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
40193     return DAG.getNode(Opcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
40194   }
40195 
40196   return SDValue();
40197 }
40198 
40199 // Given an ABS node, detect the following pattern:
40200 // (ABS (SUB (ZERO_EXTEND a), (ZERO_EXTEND b))).
40201 // This is useful as it is the input into a SAD pattern.
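// E.g. a loop body of the form 'sum += abs((int)a[i] - (int)b[i])' over
// unsigned i8 data typically vectorizes into this shape.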
40202 static bool detectZextAbsDiff(const SDValue &Abs, SDValue &Op0, SDValue &Op1) {
40203   SDValue AbsOp1 = Abs->getOperand(0);
40204   if (AbsOp1.getOpcode() != ISD::SUB)
40205     return false;
40206 
40207   Op0 = AbsOp1.getOperand(0);
40208   Op1 = AbsOp1.getOperand(1);
40209 
40210   // Check if the operands of the sub are zero-extended from vectors of i8.
40211   if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
40212       Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
40213       Op1.getOpcode() != ISD::ZERO_EXTEND ||
40214       Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
40215     return false;
40216 
40217   return true;
40218 }
40219 
40220 // Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
40221 // to these zexts.
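// PSADBW sums the absolute differences of each group of eight byte pairs into
// a single 64-bit element, e.g. two v16i8 inputs produce a v2i64 result.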
40222 static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
40223                             const SDValue &Zext1, const SDLoc &DL,
40224                             const X86Subtarget &Subtarget) {
40225   // Find the appropriate width for the PSADBW.
40226   EVT InVT = Zext0.getOperand(0).getValueType();
40227   unsigned RegSize = std::max(128u, (unsigned)InVT.getSizeInBits());
40228 
40229   // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
40230   // fill in the missing vector elements with 0.
40231   unsigned NumConcat = RegSize / InVT.getSizeInBits();
40232   SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
40233   Ops[0] = Zext0.getOperand(0);
40234   MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
40235   SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
40236   Ops[0] = Zext1.getOperand(0);
40237   SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
40238 
40239   // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
40240   auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
40241                           ArrayRef<SDValue> Ops) {
40242     MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
40243     return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops);
40244   };
40245   MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
40246   return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, { SadOp0, SadOp1 },
40247                           PSADBWBuilder);
40248 }
40249 
40250 // Attempt to replace a min/max v8i16/v16i8 horizontal reduction with
40251 // PHMINPOSUW.
40252 static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG,
40253                                       const X86Subtarget &Subtarget) {
40254   // Bail without SSE41.
40255   if (!Subtarget.hasSSE41())
40256     return SDValue();
40257 
40258   EVT ExtractVT = Extract->getValueType(0);
40259   if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
40260     return SDValue();
40261 
40262   // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
40263   ISD::NodeType BinOp;
40264   SDValue Src = DAG.matchBinOpReduction(
40265       Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}, true);
40266   if (!Src)
40267     return SDValue();
40268 
40269   EVT SrcVT = Src.getValueType();
40270   EVT SrcSVT = SrcVT.getScalarType();
40271   if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
40272     return SDValue();
40273 
40274   SDLoc DL(Extract);
40275   SDValue MinPos = Src;
40276 
40277   // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
40278   while (SrcVT.getSizeInBits() > 128) {
40279     SDValue Lo, Hi;
40280     std::tie(Lo, Hi) = splitVector(MinPos, DAG, DL);
40281     SrcVT = Lo.getValueType();
40282     MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
40283   }
40284   assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
40285           (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
40286          "Unexpected value type");
40287 
40288   // PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask
40289   // to flip the value accordingly.
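  // E.g. for SMAX we XOR with 0x7FFF: the largest signed value becomes the
  // smallest unsigned value, so the UMIN performed by PHMINPOSUW finds it, and
  // XORing again afterwards restores the original value.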
40290   SDValue Mask;
40291   unsigned MaskEltsBits = ExtractVT.getSizeInBits();
40292   if (BinOp == ISD::SMAX)
40293     Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
40294   else if (BinOp == ISD::SMIN)
40295     Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
40296   else if (BinOp == ISD::UMAX)
40297     Mask = DAG.getConstant(APInt::getAllOnesValue(MaskEltsBits), DL, SrcVT);
40298 
40299   if (Mask)
40300     MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
40301 
40302   // For v16i8 cases we need to perform UMIN on pairs of byte elements,
40303   // shuffling each upper element down and inserting zeros. This means that the
40304   // v16i8 UMIN will leave the upper element as zero, performing zero-extension
40305   // ready for the PHMINPOS.
40306   if (ExtractVT == MVT::i8) {
40307     SDValue Upper = DAG.getVectorShuffle(
40308         SrcVT, DL, MinPos, DAG.getConstant(0, DL, MVT::v16i8),
40309         {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
40310     MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
40311   }
40312 
40313   // Perform the PHMINPOS on a v8i16 vector.
40314   MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
40315   MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
40316   MinPos = DAG.getBitcast(SrcVT, MinPos);
40317 
40318   if (Mask)
40319     MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
40320 
40321   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
40322                      DAG.getIntPtrConstant(0, DL));
40323 }
40324 
40325 // Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK.
40326 static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG,
40327                                          const X86Subtarget &Subtarget) {
40328   // Bail without SSE2.
40329   if (!Subtarget.hasSSE2())
40330     return SDValue();
40331 
40332   EVT ExtractVT = Extract->getValueType(0);
40333   unsigned BitWidth = ExtractVT.getSizeInBits();
40334   if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
40335       ExtractVT != MVT::i8 && ExtractVT != MVT::i1)
40336     return SDValue();
40337 
40338   // Check for OR(any_of)/AND(all_of)/XOR(parity) horizontal reduction patterns.
40339   ISD::NodeType BinOp;
40340   SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
40341   if (!Match && ExtractVT == MVT::i1)
40342     Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::XOR});
40343   if (!Match)
40344     return SDValue();
40345 
40346   // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
40347   // which we can't support here for now.
40348   if (Match.getScalarValueSizeInBits() != BitWidth)
40349     return SDValue();
40350 
40351   SDValue Movmsk;
40352   SDLoc DL(Extract);
40353   EVT MatchVT = Match.getValueType();
40354   unsigned NumElts = MatchVT.getVectorNumElements();
40355   unsigned MaxElts = Subtarget.hasInt256() ? 32 : 16;
40356   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40357 
40358   if (ExtractVT == MVT::i1) {
40359     // Special case for (pre-legalization) vXi1 reductions.
40360     if (NumElts > 64 || !isPowerOf2_32(NumElts))
40361       return SDValue();
40362     if (TLI.isTypeLegal(MatchVT)) {
40363       // If this is a legal AVX512 predicate type then we can just bitcast.
40364       EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
40365       Movmsk = DAG.getBitcast(MovmskVT, Match);
40366     } else {
40367       // For all_of(setcc(vec,0,eq)) - avoid vXi64 comparisons if we don't have
40368       // PCMPEQQ (SSE41+), use PCMPEQD instead.
40369       if (BinOp == ISD::AND && !Subtarget.hasSSE41() &&
40370           Match.getOpcode() == ISD::SETCC &&
40371           ISD::isBuildVectorAllZeros(Match.getOperand(1).getNode()) &&
40372           cast<CondCodeSDNode>(Match.getOperand(2))->get() ==
40373               ISD::CondCode::SETEQ) {
40374         SDValue Vec = Match.getOperand(0);
40375         if (Vec.getValueType().getScalarType() == MVT::i64 &&
40376             (2 * NumElts) <= MaxElts) {
40377           NumElts *= 2;
40378           EVT CmpVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
40379           MatchVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
40380           Match = DAG.getSetCC(
40381               DL, MatchVT, DAG.getBitcast(CmpVT, Match.getOperand(0)),
40382               DAG.getBitcast(CmpVT, Match.getOperand(1)), ISD::CondCode::SETEQ);
40383         }
40384       }
40385 
40386       // Use combineBitcastvxi1 to create the MOVMSK.
40387       while (NumElts > MaxElts) {
40388         SDValue Lo, Hi;
40389         std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
40390         Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
40391         NumElts /= 2;
40392       }
40393       EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
40394       Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget);
40395     }
40396     if (!Movmsk)
40397       return SDValue();
40398     Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32);
40399   } else {
40400     // FIXME: Better handling of k-registers or 512-bit vectors?
40401     unsigned MatchSizeInBits = Match.getValueSizeInBits();
40402     if (!(MatchSizeInBits == 128 ||
40403           (MatchSizeInBits == 256 && Subtarget.hasAVX())))
40404       return SDValue();
40405 
40406     // Make sure this isn't a vector of 1 element. The perf win from using
40407     // MOVMSK diminishes with fewer elements in the reduction, but it is
40408     // generally better to get the comparison over to the GPRs as soon as
40409     // possible to reduce the number of vector ops.
40410     if (Match.getValueType().getVectorNumElements() < 2)
40411       return SDValue();
40412 
40413     // Check that we are extracting a reduction of all sign bits.
40414     if (DAG.ComputeNumSignBits(Match) != BitWidth)
40415       return SDValue();
40416 
40417     if (MatchSizeInBits == 256 && BitWidth < 32 && !Subtarget.hasInt256()) {
40418       SDValue Lo, Hi;
40419       std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
40420       Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
40421       MatchSizeInBits = Match.getValueSizeInBits();
40422     }
40423 
40424     // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
40425     MVT MaskSrcVT;
40426     if (64 == BitWidth || 32 == BitWidth)
40427       MaskSrcVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
40428                                    MatchSizeInBits / BitWidth);
40429     else
40430       MaskSrcVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
40431 
40432     SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match);
40433     Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget);
40434     NumElts = MaskSrcVT.getVectorNumElements();
40435   }
40436   assert((NumElts <= 32 || NumElts == 64) &&
40437          "Not expecting more than 64 elements");
40438 
40439   MVT CmpVT = NumElts == 64 ? MVT::i64 : MVT::i32;
40440   if (BinOp == ISD::XOR) {
40441     // parity -> (PARITY(MOVMSK X))
40442     SDValue Result = DAG.getNode(ISD::PARITY, DL, CmpVT, Movmsk);
40443     return DAG.getZExtOrTrunc(Result, DL, ExtractVT);
40444   }
40445 
40446   SDValue CmpC;
40447   ISD::CondCode CondCode;
40448   if (BinOp == ISD::OR) {
40449     // any_of -> MOVMSK != 0
40450     CmpC = DAG.getConstant(0, DL, CmpVT);
40451     CondCode = ISD::CondCode::SETNE;
40452   } else {
40453     // all_of -> MOVMSK == ((1 << NumElts) - 1)
40454     CmpC = DAG.getConstant(APInt::getLowBitsSet(CmpVT.getSizeInBits(), NumElts),
40455                            DL, CmpVT);
40456     CondCode = ISD::CondCode::SETEQ;
40457   }
40458 
40459   // The setcc produces an i8 of 0/1, so extend that to the result width and
40460   // negate to get the final 0/-1 mask value.
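  // E.g. for an i32 all_of result: the setcc yields 1, the zext widens it to
  // 0x00000001, and 0 - 1 produces the 0xFFFFFFFF mask.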
40461   EVT SetccVT =
40462       TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), CmpVT);
40463   SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode);
40464   SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT);
40465   SDValue Zero = DAG.getConstant(0, DL, ExtractVT);
40466   return DAG.getNode(ISD::SUB, DL, ExtractVT, Zero, Zext);
40467 }
40468 
40469 static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
40470                                       const X86Subtarget &Subtarget) {
40471   // PSADBW is only supported on SSE2 and up.
40472   if (!Subtarget.hasSSE2())
40473     return SDValue();
40474 
40475   EVT ExtractVT = Extract->getValueType(0);
40476   // Verify the type we're extracting is either i32 or i64.
40477   // FIXME: Could support other types, but this is what we have coverage for.
40478   if (ExtractVT != MVT::i32 && ExtractVT != MVT::i64)
40479     return SDValue();
40480 
40481   EVT VT = Extract->getOperand(0).getValueType();
40482   if (!isPowerOf2_32(VT.getVectorNumElements()))
40483     return SDValue();
40484 
40485   // Match shuffle + add pyramid.
40486   ISD::NodeType BinOp;
40487   SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
40488 
40489   // The operand is expected to be zero extended from i8
40490   // (verified in detectZextAbsDiff).
40491   // In order to convert to i64 and above, additional any/zero/sign
40492   // extend is expected.
40493   // The zero extend from 32 bit has no mathematical effect on the result.
40494   // Also, the sign extend is effectively a zero extend
40495   // (it extends the sign bit, which is zero).
40496   // So it is correct to skip the sign/zero extend instruction.
40497   if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
40498                Root.getOpcode() == ISD::ZERO_EXTEND ||
40499                Root.getOpcode() == ISD::ANY_EXTEND))
40500     Root = Root.getOperand(0);
40501 
40502   // If there was a match, we want Root to be a select that is the root of an
40503   // abs-diff pattern.
40504   if (!Root || Root.getOpcode() != ISD::ABS)
40505     return SDValue();
40506 
40507   // Check whether we have an abs-diff pattern feeding into the select.
40508   SDValue Zext0, Zext1;
40509   if (!detectZextAbsDiff(Root, Zext0, Zext1))
40510     return SDValue();
40511 
40512   // Create the SAD instruction.
40513   SDLoc DL(Extract);
40514   SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL, Subtarget);
40515 
40516   // If the original vector was wider than 8 elements, sum over the results
40517   // in the SAD vector.
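  // E.g. reducing 32 elements gives a v4i64 SAD result and Stages == 5, so two
  // shuffle+add rounds fold elements <2,3> into <0,1> and then <1> into <0>.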
40518   unsigned Stages = Log2_32(VT.getVectorNumElements());
40519   EVT SadVT = SAD.getValueType();
40520   if (Stages > 3) {
40521     unsigned SadElems = SadVT.getVectorNumElements();
40522 
40523     for(unsigned i = Stages - 3; i > 0; --i) {
40524       SmallVector<int, 16> Mask(SadElems, -1);
40525       for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
40526         Mask[j] = MaskEnd + j;
40527 
40528       SDValue Shuffle =
40529           DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
40530       SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
40531     }
40532   }
40533 
40534   unsigned ExtractSizeInBits = ExtractVT.getSizeInBits();
40535   // Return the lowest ExtractSizeInBits bits.
40536   EVT ResVT = EVT::getVectorVT(*DAG.getContext(), ExtractVT,
40537                                SadVT.getSizeInBits() / ExtractSizeInBits);
40538   SAD = DAG.getBitcast(ResVT, SAD);
40539   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, SAD,
40540                      Extract->getOperand(1));
40541 }
40542 
40543 // Attempt to peek through a target shuffle and extract the scalar from the
40544 // source.
40545 static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
40546                                          TargetLowering::DAGCombinerInfo &DCI,
40547                                          const X86Subtarget &Subtarget) {
40548   if (DCI.isBeforeLegalizeOps())
40549     return SDValue();
40550 
40551   SDLoc dl(N);
40552   SDValue Src = N->getOperand(0);
40553   SDValue Idx = N->getOperand(1);
40554 
40555   EVT VT = N->getValueType(0);
40556   EVT SrcVT = Src.getValueType();
40557   EVT SrcSVT = SrcVT.getVectorElementType();
40558   unsigned SrcEltBits = SrcSVT.getSizeInBits();
40559   unsigned NumSrcElts = SrcVT.getVectorNumElements();
40560 
40561   // Don't attempt this for boolean mask vectors or unknown extraction indices.
40562   if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
40563     return SDValue();
40564 
40565   const APInt &IdxC = N->getConstantOperandAPInt(1);
40566   if (IdxC.uge(NumSrcElts))
40567     return SDValue();
40568 
40569   SDValue SrcBC = peekThroughBitcasts(Src);
40570 
40571   // Handle extract(bitcast(broadcast(scalar_value))).
40572   if (X86ISD::VBROADCAST == SrcBC.getOpcode()) {
40573     SDValue SrcOp = SrcBC.getOperand(0);
40574     EVT SrcOpVT = SrcOp.getValueType();
40575     if (SrcOpVT.isScalarInteger() && VT.isInteger() &&
40576         (SrcOpVT.getSizeInBits() % SrcEltBits) == 0) {
40577       unsigned Scale = SrcOpVT.getSizeInBits() / SrcEltBits;
40578       unsigned Offset = IdxC.urem(Scale) * SrcEltBits;
40579       // TODO support non-zero offsets.
40580       if (Offset == 0) {
40581         SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, SrcVT.getScalarType());
40582         SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, VT);
40583         return SrcOp;
40584       }
40585     }
40586   }
40587 
40588   // If we're extracting a single element from a broadcast load and there are
40589   // no other users, just create a single load.
40590   if (SrcBC.getOpcode() == X86ISD::VBROADCAST_LOAD && SrcBC.hasOneUse()) {
40591     auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC);
40592     unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits();
40593     if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth &&
40594         VT.getSizeInBits() == SrcBCWidth && SrcEltBits == SrcBCWidth) {
40595       SDValue Load = DAG.getLoad(VT, dl, MemIntr->getChain(),
40596                                  MemIntr->getBasePtr(),
40597                                  MemIntr->getPointerInfo(),
40598                                  MemIntr->getOriginalAlign(),
40599                                  MemIntr->getMemOperand()->getFlags());
40600       DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
40601       return Load;
40602     }
40603   }
40604 
40605   // Handle extract(bitcast(scalar_to_vector(scalar_value))) for integers.
40606   // TODO: Move to DAGCombine?
40607   if (SrcBC.getOpcode() == ISD::SCALAR_TO_VECTOR && VT.isInteger() &&
40608       SrcBC.getValueType().isInteger() &&
40609       (SrcBC.getScalarValueSizeInBits() % SrcEltBits) == 0 &&
40610       SrcBC.getScalarValueSizeInBits() ==
40611           SrcBC.getOperand(0).getValueSizeInBits()) {
40612     unsigned Scale = SrcBC.getScalarValueSizeInBits() / SrcEltBits;
40613     if (IdxC.ult(Scale)) {
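      // The requested element occupies bits [Offset, Offset + SrcEltBits) of
      // the original scalar, so shift it down and truncate to the result type.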
40614       unsigned Offset = IdxC.getZExtValue() * SrcVT.getScalarSizeInBits();
40615       SDValue Scl = SrcBC.getOperand(0);
40616       EVT SclVT = Scl.getValueType();
40617       if (Offset) {
40618         Scl = DAG.getNode(ISD::SRL, dl, SclVT, Scl,
40619                           DAG.getShiftAmountConstant(Offset, SclVT, dl));
40620       }
40621       Scl = DAG.getZExtOrTrunc(Scl, dl, SrcVT.getScalarType());
40622       Scl = DAG.getZExtOrTrunc(Scl, dl, VT);
40623       return Scl;
40624     }
40625   }
40626 
40627   // Handle extract(truncate(x)) for 0'th index.
40628   // TODO: Treat this as a faux shuffle?
40629   // TODO: When can we use this for general indices?
40630   if (ISD::TRUNCATE == Src.getOpcode() && IdxC == 0 &&
40631       (SrcVT.getSizeInBits() % 128) == 0) {
40632     Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl);
40633     MVT ExtractVT = MVT::getVectorVT(SrcSVT.getSimpleVT(), 128 / SrcEltBits);
40634     return DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(ExtractVT, Src),
40635                        Idx);
40636   }
40637 
40638   // We can only legally extract other elements from 128-bit vectors and in
40639   // certain circumstances, depending on SSE-level.
40640   // TODO: Investigate float/double extraction if it will be just stored.
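  // GetLegalExtract first narrows 256/512-bit integer sources to the 128-bit
  // lane holding the requested element, then only forms extractions that are
  // cheap for the SSE level: element 0 of v4i32/v2i64 with SSE2 (any element
  // with SSE4.1), PEXTRW for v8i16 with SSE2 and PEXTRB for v16i8 with SSE4.1.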
40641   auto GetLegalExtract = [&Subtarget, &DAG, &dl](SDValue Vec, EVT VecVT,
40642                                                  unsigned Idx) {
40643     EVT VecSVT = VecVT.getScalarType();
40644     if ((VecVT.is256BitVector() || VecVT.is512BitVector()) &&
40645         (VecSVT == MVT::i8 || VecSVT == MVT::i16 || VecSVT == MVT::i32 ||
40646          VecSVT == MVT::i64)) {
40647       unsigned EltSizeInBits = VecSVT.getSizeInBits();
40648       unsigned NumEltsPerLane = 128 / EltSizeInBits;
40649       unsigned LaneOffset = (Idx & ~(NumEltsPerLane - 1)) * EltSizeInBits;
40650       unsigned LaneIdx = LaneOffset / Vec.getScalarValueSizeInBits();
40651       VecVT = EVT::getVectorVT(*DAG.getContext(), VecSVT, NumEltsPerLane);
40652       Vec = extract128BitVector(Vec, LaneIdx, DAG, dl);
40653       Idx &= (NumEltsPerLane - 1);
40654     }
40655     if ((VecVT == MVT::v4i32 || VecVT == MVT::v2i64) &&
40656         ((Idx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
40657       return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VecVT.getScalarType(),
40658                          DAG.getBitcast(VecVT, Vec),
40659                          DAG.getIntPtrConstant(Idx, dl));
40660     }
40661     if ((VecVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
40662         (VecVT == MVT::v16i8 && Subtarget.hasSSE41())) {
40663       unsigned OpCode = (VecVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
40664       return DAG.getNode(OpCode, dl, MVT::i32, DAG.getBitcast(VecVT, Vec),
40665                          DAG.getTargetConstant(Idx, dl, MVT::i8));
40666     }
40667     return SDValue();
40668   };
40669 
40670   // Resolve the target shuffle inputs and mask.
40671   SmallVector<int, 16> Mask;
40672   SmallVector<SDValue, 2> Ops;
40673   if (!getTargetShuffleInputs(SrcBC, Ops, Mask, DAG))
40674     return SDValue();
40675 
40676   // Shuffle inputs must be the same size as the result.
40677   if (llvm::any_of(Ops, [SrcVT](SDValue Op) {
40678         return SrcVT.getSizeInBits() != Op.getValueSizeInBits();
40679       }))
40680     return SDValue();
40681 
40682   // Attempt to narrow/widen the shuffle mask to the correct size.
40683   if (Mask.size() != NumSrcElts) {
40684     if ((NumSrcElts % Mask.size()) == 0) {
40685       SmallVector<int, 16> ScaledMask;
40686       int Scale = NumSrcElts / Mask.size();
40687       narrowShuffleMaskElts(Scale, Mask, ScaledMask);
40688       Mask = std::move(ScaledMask);
40689     } else if ((Mask.size() % NumSrcElts) == 0) {
40690       // Simplify Mask based on demanded element.
40691       int ExtractIdx = (int)IdxC.getZExtValue();
40692       int Scale = Mask.size() / NumSrcElts;
40693       int Lo = Scale * ExtractIdx;
40694       int Hi = Scale * (ExtractIdx + 1);
40695       for (int i = 0, e = (int)Mask.size(); i != e; ++i)
40696         if (i < Lo || Hi <= i)
40697           Mask[i] = SM_SentinelUndef;
40698 
40699       SmallVector<int, 16> WidenedMask;
40700       while (Mask.size() > NumSrcElts &&
40701              canWidenShuffleElements(Mask, WidenedMask))
40702         Mask = std::move(WidenedMask);
40703     }
40704   }
40705 
40706   // If narrowing/widening failed, see if we can extract+zero-extend.
40707   int ExtractIdx;
40708   EVT ExtractVT;
40709   if (Mask.size() == NumSrcElts) {
40710     ExtractIdx = Mask[IdxC.getZExtValue()];
40711     ExtractVT = SrcVT;
40712   } else {
40713     unsigned Scale = Mask.size() / NumSrcElts;
40714     if ((Mask.size() % NumSrcElts) != 0 || SrcVT.isFloatingPoint())
40715       return SDValue();
40716     unsigned ScaledIdx = Scale * IdxC.getZExtValue();
40717     if (!isUndefOrZeroInRange(Mask, ScaledIdx + 1, Scale - 1))
40718       return SDValue();
40719     ExtractIdx = Mask[ScaledIdx];
40720     EVT ExtractSVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltBits / Scale);
40721     ExtractVT = EVT::getVectorVT(*DAG.getContext(), ExtractSVT, Mask.size());
40722     assert(SrcVT.getSizeInBits() == ExtractVT.getSizeInBits() &&
40723            "Failed to widen vector type");
40724   }
40725 
40726   // If the shuffle source element is undef/zero then we can just accept it.
40727   if (ExtractIdx == SM_SentinelUndef)
40728     return DAG.getUNDEF(VT);
40729 
40730   if (ExtractIdx == SM_SentinelZero)
40731     return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
40732                                 : DAG.getConstant(0, dl, VT);
40733 
40734   SDValue SrcOp = Ops[ExtractIdx / Mask.size()];
40735   ExtractIdx = ExtractIdx % Mask.size();
40736   if (SDValue V = GetLegalExtract(SrcOp, ExtractVT, ExtractIdx))
40737     return DAG.getZExtOrTrunc(V, dl, VT);
40738 
40739   return SDValue();
40740 }
40741 
40742 /// Extracting a scalar FP value from vector element 0 is free, so extract each
40743 /// operand first, then perform the math as a scalar op.
scalarizeExtEltFP(SDNode * ExtElt,SelectionDAG & DAG)40744 static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG) {
40745   assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
40746   SDValue Vec = ExtElt->getOperand(0);
40747   SDValue Index = ExtElt->getOperand(1);
40748   EVT VT = ExtElt->getValueType(0);
40749   EVT VecVT = Vec.getValueType();
40750 
40751   // TODO: If this is a unary/expensive/expand op, allow extraction from a
40752   // non-zero element because the shuffle+scalar op will be cheaper?
40753   if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT)
40754     return SDValue();
40755 
40756   // Vector FP compares don't fit the pattern of FP math ops (propagate, not
40757   // extract, the condition code), so deal with those as a special-case.
40758   if (Vec.getOpcode() == ISD::SETCC && VT == MVT::i1) {
40759     EVT OpVT = Vec.getOperand(0).getValueType().getScalarType();
40760     if (OpVT != MVT::f32 && OpVT != MVT::f64)
40761       return SDValue();
40762 
40763     // extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC
40764     SDLoc DL(ExtElt);
40765     SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
40766                                Vec.getOperand(0), Index);
40767     SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
40768                                Vec.getOperand(1), Index);
40769     return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2));
40770   }
40771 
40772   if (VT != MVT::f32 && VT != MVT::f64)
40773     return SDValue();
40774 
40775   // Vector FP selects don't fit the pattern of FP math ops (because the
40776   // condition has a different type and we have to change the opcode), so deal
40777   // with those here.
40778   // FIXME: This is restricted to pre type legalization by ensuring the setcc
40779   // has i1 elements. If we loosen this we need to convert vector bool to a
40780   // scalar bool.
40781   if (Vec.getOpcode() == ISD::VSELECT &&
40782       Vec.getOperand(0).getOpcode() == ISD::SETCC &&
40783       Vec.getOperand(0).getValueType().getScalarType() == MVT::i1 &&
40784       Vec.getOperand(0).getOperand(0).getValueType() == VecVT) {
40785     // ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0)
40786     SDLoc DL(ExtElt);
40787     SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
40788                                Vec.getOperand(0).getValueType().getScalarType(),
40789                                Vec.getOperand(0), Index);
40790     SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
40791                                Vec.getOperand(1), Index);
40792     SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
40793                                Vec.getOperand(2), Index);
40794     return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2);
40795   }
40796 
40797   // TODO: This switch could include FNEG and the x86-specific FP logic ops
40798   // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid
40799   // missed load folding and fma+fneg combining.
40800   switch (Vec.getOpcode()) {
40801   case ISD::FMA: // Begin 3 operands
40802   case ISD::FMAD:
40803   case ISD::FADD: // Begin 2 operands
40804   case ISD::FSUB:
40805   case ISD::FMUL:
40806   case ISD::FDIV:
40807   case ISD::FREM:
40808   case ISD::FCOPYSIGN:
40809   case ISD::FMINNUM:
40810   case ISD::FMAXNUM:
40811   case ISD::FMINNUM_IEEE:
40812   case ISD::FMAXNUM_IEEE:
40813   case ISD::FMAXIMUM:
40814   case ISD::FMINIMUM:
40815   case X86ISD::FMAX:
40816   case X86ISD::FMIN:
40817   case ISD::FABS: // Begin 1 operand
40818   case ISD::FSQRT:
40819   case ISD::FRINT:
40820   case ISD::FCEIL:
40821   case ISD::FTRUNC:
40822   case ISD::FNEARBYINT:
40823   case ISD::FROUND:
40824   case ISD::FFLOOR:
40825   case X86ISD::FRCP:
40826   case X86ISD::FRSQRT: {
40827     // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ...
40828     SDLoc DL(ExtElt);
40829     SmallVector<SDValue, 4> ExtOps;
40830     for (SDValue Op : Vec->ops())
40831       ExtOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, Index));
40832     return DAG.getNode(Vec.getOpcode(), DL, VT, ExtOps);
40833   }
40834   default:
40835     return SDValue();
40836   }
40837   llvm_unreachable("All opcodes should return within switch");
40838 }
40839 
40840 /// Try to convert a vector reduction sequence composed of binops and shuffles
40841 /// into horizontal ops.
combineArithReduction(SDNode * ExtElt,SelectionDAG & DAG,const X86Subtarget & Subtarget)40842 static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
40843                                      const X86Subtarget &Subtarget) {
40844   assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
40845 
40846   // We need at least SSE2 to do anything here.
40847   if (!Subtarget.hasSSE2())
40848     return SDValue();
40849 
40850   ISD::NodeType Opc;
40851   SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc,
40852                                         {ISD::ADD, ISD::MUL, ISD::FADD}, true);
40853   if (!Rdx)
40854     return SDValue();
40855 
40856   SDValue Index = ExtElt->getOperand(1);
40857   assert(isNullConstant(Index) &&
40858          "Reduction doesn't end in an extract from index 0");
40859 
40860   EVT VT = ExtElt->getValueType(0);
40861   EVT VecVT = Rdx.getValueType();
40862   if (VecVT.getScalarType() != VT)
40863     return SDValue();
40864 
40865   SDLoc DL(ExtElt);
40866 
40867   // vXi8 mul reduction - promote to vXi16 mul reduction.
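  // Each byte is treated as the low byte of an i16 lane (the high byte is
  // undef and harmless since only the low 8 bits of the product are used);
  // the v8i16 products are then halved with shuffles until lane 0 holds the
  // final result, which is read back out as an i8.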
40868   if (Opc == ISD::MUL) {
40869     unsigned NumElts = VecVT.getVectorNumElements();
40870     if (VT != MVT::i8 || NumElts < 4 || !isPowerOf2_32(NumElts))
40871       return SDValue();
40872     if (VecVT.getSizeInBits() >= 128) {
40873       EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts / 2);
40874       SDValue Lo = getUnpackl(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
40875       SDValue Hi = getUnpackh(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
40876       Lo = DAG.getBitcast(WideVT, Lo);
40877       Hi = DAG.getBitcast(WideVT, Hi);
40878       Rdx = DAG.getNode(Opc, DL, WideVT, Lo, Hi);
40879       while (Rdx.getValueSizeInBits() > 128) {
40880         std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
40881         Rdx = DAG.getNode(Opc, DL, Lo.getValueType(), Lo, Hi);
40882       }
40883     } else {
40884       if (VecVT == MVT::v4i8)
40885         Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, Rdx,
40886                           DAG.getUNDEF(MVT::v4i8));
40887       Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Rdx,
40888                         DAG.getUNDEF(MVT::v8i8));
40889       Rdx = getUnpackl(DAG, DL, MVT::v16i8, Rdx, DAG.getUNDEF(MVT::v16i8));
40890       Rdx = DAG.getBitcast(MVT::v8i16, Rdx);
40891     }
40892     if (NumElts >= 8)
40893       Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
40894                         DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
40895                                              {4, 5, 6, 7, -1, -1, -1, -1}));
40896     Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
40897                       DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
40898                                            {2, 3, -1, -1, -1, -1, -1, -1}));
40899     Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
40900                       DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
40901                                            {1, -1, -1, -1, -1, -1, -1, -1}));
40902     Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
40903     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
40904   }
40905 
40906   // vXi8 add reduction - sub-128-bit vector.
40907   if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) {
40908     if (VecVT == MVT::v4i8) {
40909       // Pad with zero.
40910       if (Subtarget.hasSSE41()) {
40911         Rdx = DAG.getBitcast(MVT::i32, Rdx);
40912         Rdx = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
40913                           DAG.getConstant(0, DL, MVT::v4i32), Rdx,
40914                           DAG.getIntPtrConstant(0, DL));
40915         Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
40916       } else {
40917         Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, Rdx,
40918                           DAG.getConstant(0, DL, VecVT));
40919       }
40920     }
40921     if (Rdx.getValueType() == MVT::v8i8) {
40922       // Pad with undef.
40923       Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Rdx,
40924                         DAG.getUNDEF(MVT::v8i8));
40925     }
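    // PSADBW against zero computes |b - 0| = b for every byte and sums each
    // group of 8 bytes into a 64-bit lane, giving a horizontal byte add.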
40926     Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
40927                       DAG.getConstant(0, DL, MVT::v16i8));
40928     Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
40929     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
40930   }
40931 
40932   // Must be a >=128-bit vector with pow2 elements.
40933   if ((VecVT.getSizeInBits() % 128) != 0 ||
40934       !isPowerOf2_32(VecVT.getVectorNumElements()))
40935     return SDValue();
40936 
40937   // vXi8 add reduction - sum lo/hi halves then use PSADBW.
40938   if (VT == MVT::i8) {
40939     while (Rdx.getValueSizeInBits() > 128) {
40940       SDValue Lo, Hi;
40941       std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
40942       VecVT = Lo.getValueType();
40943       Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
40944     }
40945     assert(VecVT == MVT::v16i8 && "v16i8 reduction expected");
40946 
40947     SDValue Hi = DAG.getVectorShuffle(
40948         MVT::v16i8, DL, Rdx, Rdx,
40949         {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
40950     Rdx = DAG.getNode(ISD::ADD, DL, MVT::v16i8, Rdx, Hi);
40951     Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
40952                       getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
40953     Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
40954     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
40955   }
40956 
40957   // Only use (F)HADD opcodes if they aren't microcoded or we are minimizing codesize.
40958   if (!shouldUseHorizontalOp(true, DAG, Subtarget))
40959     return SDValue();
40960 
40961   unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
40962 
40963   // 256-bit horizontal instructions operate on 128-bit chunks rather than
40964   // across the whole vector, so we need an extract + hop preliminary stage.
40965   // This is the only step where the operands of the hop are not the same value.
40966   // TODO: We could extend this to handle 512-bit or even longer vectors.
40967   if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||
40968       ((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {
40969     unsigned NumElts = VecVT.getVectorNumElements();
40970     SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
40971     SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
40972     Rdx = DAG.getNode(HorizOpcode, DL, Lo.getValueType(), Hi, Lo);
40973     VecVT = Rdx.getValueType();
40974   }
40975   if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
40976       !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
40977     return SDValue();
40978 
40979   // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
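  // Each HADD of the vector with itself halves the number of distinct partial
  // sums, so Log2(NumElts) steps leave the full sum in element 0.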
40980   unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
40981   for (unsigned i = 0; i != ReductionSteps; ++i)
40982     Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);
40983 
40984   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
40985 }
40986 
40987 /// Detect vector gather/scatter index generation and convert it from being a
40988 /// bunch of shuffles and extracts into a somewhat faster sequence.
40989 /// For i686, the best sequence is apparently storing the value and loading
40990 /// scalars back, while for x64 we should use 64-bit extracts and shifts.
40991 static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
40992                                        TargetLowering::DAGCombinerInfo &DCI,
40993                                        const X86Subtarget &Subtarget) {
40994   if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
40995     return NewOp;
40996 
40997   SDValue InputVector = N->getOperand(0);
40998   SDValue EltIdx = N->getOperand(1);
40999   auto *CIdx = dyn_cast<ConstantSDNode>(EltIdx);
41000 
41001   EVT SrcVT = InputVector.getValueType();
41002   EVT VT = N->getValueType(0);
41003   SDLoc dl(InputVector);
41004   bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;
41005   unsigned NumSrcElts = SrcVT.getVectorNumElements();
41006 
41007   if (CIdx && CIdx->getAPIntValue().uge(NumSrcElts))
41008     return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
41009 
41010   // Integer Constant Folding.
41011   if (CIdx && VT.isInteger()) {
41012     APInt UndefVecElts;
41013     SmallVector<APInt, 16> EltBits;
41014     unsigned VecEltBitWidth = SrcVT.getScalarSizeInBits();
41015     if (getTargetConstantBitsFromNode(InputVector, VecEltBitWidth, UndefVecElts,
41016                                       EltBits, true, false)) {
41017       uint64_t Idx = CIdx->getZExtValue();
41018       if (UndefVecElts[Idx])
41019         return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
41020       return DAG.getConstant(EltBits[Idx].zextOrSelf(VT.getScalarSizeInBits()),
41021                              dl, VT);
41022     }
41023   }
41024 
41025   if (IsPextr) {
41026     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41027     if (TLI.SimplifyDemandedBits(
41028             SDValue(N, 0), APInt::getAllOnesValue(VT.getSizeInBits()), DCI))
41029       return SDValue(N, 0);
41030 
41031     // PEXTR*(PINSR*(v, s, c), c) -> s (with implicit zext handling).
41032     if ((InputVector.getOpcode() == X86ISD::PINSRB ||
41033          InputVector.getOpcode() == X86ISD::PINSRW) &&
41034         InputVector.getOperand(2) == EltIdx) {
41035       assert(SrcVT == InputVector.getOperand(0).getValueType() &&
41036              "Vector type mismatch");
41037       SDValue Scl = InputVector.getOperand(1);
41038       Scl = DAG.getNode(ISD::TRUNCATE, dl, SrcVT.getScalarType(), Scl);
41039       return DAG.getZExtOrTrunc(Scl, dl, VT);
41040     }
41041 
41042     // TODO - Remove this once we can handle the implicit zero-extension of
41043     // X86ISD::PEXTRW/X86ISD::PEXTRB in combinePredicateReduction and
41044     // combineBasicSADPattern.
41045     return SDValue();
41046   }
41047 
41048   // Detect mmx extraction of all bits as an i64. It works better as a bitcast.
41049   if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
41050       VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
41051     SDValue MMXSrc = InputVector.getOperand(0);
41052 
41053     // The bitcast source is a direct mmx result.
41054     if (MMXSrc.getValueType() == MVT::x86mmx)
41055       return DAG.getBitcast(VT, InputVector);
41056   }
41057 
41058   // Detect mmx to i32 conversion through a v2i32 elt extract.
41059   if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
41060       VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) {
41061     SDValue MMXSrc = InputVector.getOperand(0);
41062 
41063     // The bitcast source is a direct mmx result.
41064     if (MMXSrc.getValueType() == MVT::x86mmx)
41065       return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
41066   }
41067 
41068   // Check whether this extract is the root of a sum of absolute differences
41069   // pattern. This has to be done here because we really want it to happen
41070   // pre-legalization.
41071   if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
41072     return SAD;
41073 
41074   // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
41075   if (SDValue Cmp = combinePredicateReduction(N, DAG, Subtarget))
41076     return Cmp;
41077 
41078   // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
41079   if (SDValue MinMax = combineMinMaxReduction(N, DAG, Subtarget))
41080     return MinMax;
41081 
41082   // Attempt to optimize ADD/FADD/MUL reductions with HADD, promotion etc..
41083   if (SDValue V = combineArithReduction(N, DAG, Subtarget))
41084     return V;
41085 
41086   if (SDValue V = scalarizeExtEltFP(N, DAG))
41087     return V;
41088 
41089   // Attempt to extract a i1 element by using MOVMSK to extract the signbits
41090   // and then testing the relevant element.
41091   //
41092   // Note that we only combine extracts on the *same* result number, i.e.
41093   //   t0 = merge_values a0, a1, a2, a3
41094   //   i1 = extract_vector_elt t0, Constant:i64<2>
41095   //   i1 = extract_vector_elt t0, Constant:i64<3>
41096   // but not
41097   //   i1 = extract_vector_elt t0:1, Constant:i64<2>
41098   // since the latter would need its own MOVMSK.
41099   if (CIdx && SrcVT.getScalarType() == MVT::i1) {
41100     SmallVector<SDNode *, 16> BoolExtracts;
41101     unsigned ResNo = InputVector.getResNo();
41102     auto IsBoolExtract = [&BoolExtracts, &ResNo](SDNode *Use) {
41103       if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
41104           isa<ConstantSDNode>(Use->getOperand(1)) &&
41105           Use->getOperand(0).getResNo() == ResNo &&
41106           Use->getValueType(0) == MVT::i1) {
41107         BoolExtracts.push_back(Use);
41108         return true;
41109       }
41110       return false;
41111     };
41112     if (all_of(InputVector->uses(), IsBoolExtract) &&
41113         BoolExtracts.size() > 1) {
41114       EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
41115       if (SDValue BC =
41116               combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
41117         for (SDNode *Use : BoolExtracts) {
41118           // extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
41119           unsigned MaskIdx = Use->getConstantOperandVal(1);
41120           APInt MaskBit = APInt::getOneBitSet(NumSrcElts, MaskIdx);
41121           SDValue Mask = DAG.getConstant(MaskBit, dl, BCVT);
41122           SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask);
41123           Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ);
41124           DCI.CombineTo(Use, Res);
41125         }
41126         return SDValue(N, 0);
41127       }
41128     }
41129   }
41130 
41131   return SDValue();
41132 }
41133 
41134 /// If a vector select has an operand that is -1 or 0, try to simplify the
41135 /// select to a bitwise logic operation.
41136 /// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
41137 static SDValue
41138 combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
41139                                  TargetLowering::DAGCombinerInfo &DCI,
41140                                  const X86Subtarget &Subtarget) {
41141   SDValue Cond = N->getOperand(0);
41142   SDValue LHS = N->getOperand(1);
41143   SDValue RHS = N->getOperand(2);
41144   EVT VT = LHS.getValueType();
41145   EVT CondVT = Cond.getValueType();
41146   SDLoc DL(N);
41147   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41148 
41149   if (N->getOpcode() != ISD::VSELECT)
41150     return SDValue();
41151 
41152   assert(CondVT.isVector() && "Vector select expects a vector selector!");
41153 
41154   // TODO: Use isNullOrNullSplat() to distinguish constants with undefs?
41155   // TODO: Can we assert that both operands are not zeros (because that should
41156   //       get simplified at node creation time)?
41157   bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
41158   bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
41159 
41160   // If both inputs are 0/undef, create a complete zero vector.
41161   // FIXME: As noted above this should be handled by DAGCombiner/getNode.
41162   if (TValIsAllZeros && FValIsAllZeros) {
41163     if (VT.isFloatingPoint())
41164       return DAG.getConstantFP(0.0, DL, VT);
41165     return DAG.getConstant(0, DL, VT);
41166   }
41167 
41168   // To use the condition operand as a bitwise mask, it must have elements that
41169   // are the same size as the select elements. I.e., the condition operand must
41170   // have already been promoted from the IR select condition type <N x i1>.
41171   // Don't check if the types themselves are equal because that excludes
41172   // vector floating-point selects.
41173   if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
41174     return SDValue();
41175 
41176   // Try to invert the condition if true value is not all 1s and false value is
41177   // not all 0s. Only do this if the condition has one use.
41178   bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
41179   if (!TValIsAllOnes && !FValIsAllZeros && Cond.hasOneUse() &&
41180       // Check if the selector will be produced by CMPP*/PCMP*.
41181       Cond.getOpcode() == ISD::SETCC &&
41182       // Check if SETCC has already been promoted.
41183       TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
41184           CondVT) {
41185     bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
41186 
41187     if (TValIsAllZeros || FValIsAllOnes) {
41188       SDValue CC = Cond.getOperand(2);
41189       ISD::CondCode NewCC = ISD::getSetCCInverse(
41190           cast<CondCodeSDNode>(CC)->get(), Cond.getOperand(0).getValueType());
41191       Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
41192                           NewCC);
41193       std::swap(LHS, RHS);
41194       TValIsAllOnes = FValIsAllOnes;
41195       FValIsAllZeros = TValIsAllZeros;
41196     }
41197   }
41198 
41199   // Cond value must be 'sign splat' to be converted to a logical op.
41200   if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits())
41201     return SDValue();
41202 
41203   // vselect Cond, 111..., 000... -> Cond
41204   if (TValIsAllOnes && FValIsAllZeros)
41205     return DAG.getBitcast(VT, Cond);
41206 
41207   if (!TLI.isTypeLegal(CondVT))
41208     return SDValue();
41209 
41210   // vselect Cond, 111..., X -> or Cond, X
41211   if (TValIsAllOnes) {
41212     SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
41213     SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
41214     return DAG.getBitcast(VT, Or);
41215   }
41216 
41217   // vselect Cond, X, 000... -> and Cond, X
41218   if (FValIsAllZeros) {
41219     SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
41220     SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
41221     return DAG.getBitcast(VT, And);
41222   }
41223 
41224   // vselect Cond, 000..., X -> andn Cond, X
41225   if (TValIsAllZeros) {
41226     SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
41227     SDValue AndN;
41228     // The canonical form differs for i1 vectors: X86ISD::ANDNP is not used.
41229     if (CondVT.getScalarType() == MVT::i1)
41230       AndN = DAG.getNode(ISD::AND, DL, CondVT, DAG.getNOT(DL, Cond, CondVT),
41231                          CastRHS);
41232     else
41233       AndN = DAG.getNode(X86ISD::ANDNP, DL, CondVT, Cond, CastRHS);
41234     return DAG.getBitcast(VT, AndN);
41235   }
41236 
41237   return SDValue();
41238 }
41239 
41240 /// If both arms of a vector select are concatenated vectors, split the select,
41241 /// and concatenate the result to eliminate a wide (256-bit) vector instruction:
41242 ///   vselect Cond, (concat T0, T1), (concat F0, F1) -->
41243 ///   concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1)
41244 static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG,
41245                                   const X86Subtarget &Subtarget) {
41246   unsigned Opcode = N->getOpcode();
41247   if (Opcode != X86ISD::BLENDV && Opcode != ISD::VSELECT)
41248     return SDValue();
41249 
41250   // TODO: Split 512-bit vectors too?
41251   EVT VT = N->getValueType(0);
41252   if (!VT.is256BitVector())
41253     return SDValue();
41254 
41255   // TODO: Split as long as any 2 of the 3 operands are concatenated?
41256   SDValue Cond = N->getOperand(0);
41257   SDValue TVal = N->getOperand(1);
41258   SDValue FVal = N->getOperand(2);
41259   SmallVector<SDValue, 4> CatOpsT, CatOpsF;
41260   if (!TVal.hasOneUse() || !FVal.hasOneUse() ||
41261       !collectConcatOps(TVal.getNode(), CatOpsT) ||
41262       !collectConcatOps(FVal.getNode(), CatOpsF))
41263     return SDValue();
41264 
41265   auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
41266                             ArrayRef<SDValue> Ops) {
41267     return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops);
41268   };
41269   return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { Cond, TVal, FVal },
41270                           makeBlend, /*CheckBWI*/ false);
41271 }
41272 
41273 static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
41274   SDValue Cond = N->getOperand(0);
41275   SDValue LHS = N->getOperand(1);
41276   SDValue RHS = N->getOperand(2);
41277   SDLoc DL(N);
41278 
41279   auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
41280   auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
41281   if (!TrueC || !FalseC)
41282     return SDValue();
41283 
41284   // Don't do this for crazy integer types.
41285   EVT VT = N->getValueType(0);
41286   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
41287     return SDValue();
41288 
41289   // We're going to use the condition bit in math or logic ops. We could allow
41290   // this with a wider condition value (post-legalization it becomes an i8),
41291   // but if nothing is creating selects that late, it doesn't matter.
41292   if (Cond.getValueType() != MVT::i1)
41293     return SDValue();
41294 
41295   // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
41296   // 3, 5, or 9 with i32/i64, so those get transformed too.
41297   // TODO: For constants that overflow or do not differ by power-of-2 or small
41298   // multiplier, convert to 'and' + 'add'.
41299   const APInt &TrueVal = TrueC->getAPIntValue();
41300   const APInt &FalseVal = FalseC->getAPIntValue();
41301   bool OV;
41302   APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
41303   if (OV)
41304     return SDValue();
41305 
41306   APInt AbsDiff = Diff.abs();
41307   if (AbsDiff.isPowerOf2() ||
41308       ((VT == MVT::i32 || VT == MVT::i64) &&
41309        (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {
41310 
41311     // We need a positive multiplier constant for shift/LEA codegen. The 'not'
41312     // of the condition can usually be folded into a compare predicate, but even
41313     // without that, the sequence should be cheaper than a CMOV alternative.
41314     if (TrueVal.slt(FalseVal)) {
41315       Cond = DAG.getNOT(DL, Cond, MVT::i1);
41316       std::swap(TrueC, FalseC);
41317     }
41318 
41319     // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
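    // e.g. select Cond, 10, 2 --> (zext(Cond) * 8) + 2, where the multiply by
    // the power-of-2 difference becomes a shift; when TC < FC the condition
    // was inverted above so the multiplier stays positive.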
41320     SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
41321 
41322     // Multiply condition by the difference if non-one.
41323     if (!AbsDiff.isOneValue())
41324       R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
41325 
41326     // Add the base if non-zero.
41327     if (!FalseC->isNullValue())
41328       R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
41329 
41330     return R;
41331   }
41332 
41333   return SDValue();
41334 }
41335 
41336 /// If this is a *dynamic* select (non-constant condition) and we can match
41337 /// this node with one of the variable blend instructions, restructure the
41338 /// condition so that blends can use the high (sign) bit of each element.
41339 /// This function will also call SimplifyDemandedBits on already created
41340 /// BLENDV to perform additional simplifications.
41341 static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
41342                                            TargetLowering::DAGCombinerInfo &DCI,
41343                                            const X86Subtarget &Subtarget) {
41344   SDValue Cond = N->getOperand(0);
41345   if ((N->getOpcode() != ISD::VSELECT &&
41346        N->getOpcode() != X86ISD::BLENDV) ||
41347       ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
41348     return SDValue();
41349 
41350   // Don't optimize before the condition has been transformed to a legal type
41351   // and don't ever optimize vector selects that map to AVX512 mask-registers.
41352   unsigned BitWidth = Cond.getScalarValueSizeInBits();
41353   if (BitWidth < 8 || BitWidth > 64)
41354     return SDValue();
41355 
41356   // We can only handle the cases where VSELECT is directly legal on the
41357   // subtarget. We custom lower VSELECT nodes with constant conditions and
41358   // this makes it hard to see whether a dynamic VSELECT will correctly
41359   // lower, so we both check the operation's status and explicitly handle the
41360   // cases where a *dynamic* blend will fail even though a constant-condition
41361   // blend could be custom lowered.
41362   // FIXME: We should find a better way to handle this class of problems.
41363   // Potentially, we should combine constant-condition vselect nodes
41364   // pre-legalization into shuffles and not mark as many types as custom
41365   // lowered.
41366   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41367   EVT VT = N->getValueType(0);
41368   if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
41369     return SDValue();
41370   // FIXME: We don't support i16-element blends currently. We could and
41371   // should support them by making *all* the bits in the condition be set
41372   // rather than just the high bit and using an i8-element blend.
41373   if (VT.getVectorElementType() == MVT::i16)
41374     return SDValue();
41375   // Dynamic blending was only available from SSE4.1 onward.
41376   if (VT.is128BitVector() && !Subtarget.hasSSE41())
41377     return SDValue();
41378   // Byte blends are only available with AVX2.
41379   if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
41380     return SDValue();
41381   // There are no 512-bit blend instructions that use sign bits.
41382   if (VT.is512BitVector())
41383     return SDValue();
41384 
41385   auto OnlyUsedAsSelectCond = [](SDValue Cond) {
41386     for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
41387          UI != UE; ++UI)
41388       if ((UI->getOpcode() != ISD::VSELECT &&
41389            UI->getOpcode() != X86ISD::BLENDV) ||
41390           UI.getOperandNo() != 0)
41391         return false;
41392 
41393     return true;
41394   };
41395 
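  // Variable blend instructions select each lane based only on the sign bit
  // of the corresponding condition element, so that is the only bit we demand.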
41396   APInt DemandedBits(APInt::getSignMask(BitWidth));
41397 
41398   if (OnlyUsedAsSelectCond(Cond)) {
41399     KnownBits Known;
41400     TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
41401                                           !DCI.isBeforeLegalizeOps());
41402     if (!TLI.SimplifyDemandedBits(Cond, DemandedBits, Known, TLO, 0, true))
41403       return SDValue();
41404 
41405     // If we changed the computation somewhere in the DAG, this change will
41406     // affect all users of Cond. Update all the nodes so that we do not use
41407     // the generic VSELECT anymore. Otherwise, we may perform incorrect
41408     // optimizations, since we have changed what the vector boolean values
41409     // are expected to contain.
41410     for (SDNode *U : Cond->uses()) {
41411       if (U->getOpcode() == X86ISD::BLENDV)
41412         continue;
41413 
41414       SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),
41415                                Cond, U->getOperand(1), U->getOperand(2));
41416       DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
41417       DCI.AddToWorklist(U);
41418     }
41419     DCI.CommitTargetLoweringOpt(TLO);
41420     return SDValue(N, 0);
41421   }
41422 
41423   // Otherwise we can still at least try to simplify multiple use bits.
41424   if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedBits, DAG))
41425       return DAG.getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0), V,
41426                          N->getOperand(1), N->getOperand(2));
41427 
41428   return SDValue();
41429 }
41430 
41431 // Try to match:
41432 //   (or (and M, (sub 0, X)), (pandn M, X))
41433 // which is a special case of:
41434 //   (select M, (sub 0, X), X)
41435 // Per:
41436 // http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
41437 // We know that, if fNegate is 0 or 1:
41438 //   (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
41439 //
41440 // Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
41441 //   ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
41442 //   ( M      ? -X : X) == ((X ^   M     ) + (M & 1))
41443 // This lets us transform our vselect to:
41444 //   (add (xor X, M), (and M, 1))
41445 // And further to:
41446 //   (sub (xor X, M), M)
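// For example, with M = all-ones: (X ^ M) - M = (~X) - (-1) = -X, and with
// M = 0: (X ^ 0) - 0 = X, matching (select M, (sub 0, X), X) lane by lane.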
41447 static SDValue combineLogicBlendIntoConditionalNegate(
41448     EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
41449     SelectionDAG &DAG, const X86Subtarget &Subtarget) {
41450   EVT MaskVT = Mask.getValueType();
41451   assert(MaskVT.isInteger() &&
41452          DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&
41453          "Mask must be zero/all-bits");
41454 
41455   if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT)
41456     return SDValue();
41457   if (!DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT))
41458     return SDValue();
41459 
41460   auto IsNegV = [](SDNode *N, SDValue V) {
41461     return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
41462            ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
41463   };
41464 
41465   SDValue V;
41466   if (IsNegV(Y.getNode(), X))
41467     V = X;
41468   else if (IsNegV(X.getNode(), Y))
41469     V = Y;
41470   else
41471     return SDValue();
41472 
41473   SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
41474   SDValue SubOp2 = Mask;
41475 
41476   // If the negate was on the false side of the select, then
41477   // the operands of the SUB need to be swapped. PR 27251.
41478   // This is because the pattern being matched above is
41479   // (vselect M, (sub 0, X), X) -> (sub (xor X, M), M)
41480   // but if the pattern matched was
41481   // (vselect M, X, (sub 0, X)), that is really a negation of the pattern
41482   // above, -(vselect M, (sub 0, X), X), and therefore the replacement
41483   // pattern also needs to be a negation of the replacement pattern above.
41484   // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
41485   // sub accomplishes the negation of the replacement pattern.
41486   if (V == Y)
41487     std::swap(SubOp1, SubOp2);
41488 
41489   SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
41490   return DAG.getBitcast(VT, Res);
41491 }
41492 
41493 /// Do target-specific dag combines on SELECT and VSELECT nodes.
41494 static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
41495                              TargetLowering::DAGCombinerInfo &DCI,
41496                              const X86Subtarget &Subtarget) {
41497   SDLoc DL(N);
41498   SDValue Cond = N->getOperand(0);
41499   SDValue LHS = N->getOperand(1);
41500   SDValue RHS = N->getOperand(2);
41501 
41502   // Try simplification again because we use this function to optimize
41503   // BLENDV nodes that are not handled by the generic combiner.
41504   if (SDValue V = DAG.simplifySelect(Cond, LHS, RHS))
41505     return V;
41506 
41507   EVT VT = LHS.getValueType();
41508   EVT CondVT = Cond.getValueType();
41509   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41510   bool CondConstantVector = ISD::isBuildVectorOfConstantSDNodes(Cond.getNode());
41511 
41512   // Attempt to combine (select M, (sub 0, X), X) -> (sub (xor X, M), M).
41513   // Limit this to cases of non-constant masks that createShuffleMaskFromVSELECT
41514   // can't catch, plus vXi8 cases where we'd likely end up with BLENDV.
41515   if (CondVT.isVector() && CondVT.isInteger() &&
41516       CondVT.getScalarSizeInBits() == VT.getScalarSizeInBits() &&
41517       (!CondConstantVector || CondVT.getScalarType() == MVT::i8) &&
41518       DAG.ComputeNumSignBits(Cond) == CondVT.getScalarSizeInBits())
41519     if (SDValue V = combineLogicBlendIntoConditionalNegate(VT, Cond, RHS, LHS,
41520                                                            DL, DAG, Subtarget))
41521       return V;
41522 
41523   // Convert vselects with constant condition into shuffles.
41524   if (CondConstantVector && DCI.isBeforeLegalizeOps()) {
41525     SmallVector<int, 64> Mask;
41526     if (createShuffleMaskFromVSELECT(Mask, Cond))
41527       return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask);
41528   }
41529 
41530   // fold vselect(cond, pshufb(x), pshufb(y)) -> or (pshufb(x), pshufb(y))
41531   // by forcing the unselected elements to zero.
41532   // TODO: Can we handle more shuffles with this?
41533   if (N->getOpcode() == ISD::VSELECT && CondVT.isVector() &&
41534       LHS.getOpcode() == X86ISD::PSHUFB && RHS.getOpcode() == X86ISD::PSHUFB &&
41535       LHS.hasOneUse() && RHS.hasOneUse()) {
41536     MVT SimpleVT = VT.getSimpleVT();
41537     SmallVector<SDValue, 1> LHSOps, RHSOps;
41538     SmallVector<int, 64> LHSMask, RHSMask, CondMask;
41539     if (createShuffleMaskFromVSELECT(CondMask, Cond) &&
41540         getTargetShuffleMask(LHS.getNode(), SimpleVT, true, LHSOps, LHSMask) &&
41541         getTargetShuffleMask(RHS.getNode(), SimpleVT, true, RHSOps, RHSMask)) {
41542       int NumElts = VT.getVectorNumElements();
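      // A PSHUFB control byte with its top bit set (0x80) zeroes that output
      // byte, so zero out the unselected lane of each source and OR the two
      // shuffle results together.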
41543       for (int i = 0; i != NumElts; ++i) {
41544         if (CondMask[i] < NumElts)
41545           RHSMask[i] = 0x80;
41546         else
41547           LHSMask[i] = 0x80;
41548       }
41549       LHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, LHS.getOperand(0),
41550                         getConstVector(LHSMask, SimpleVT, DAG, DL, true));
41551       RHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, RHS.getOperand(0),
41552                         getConstVector(RHSMask, SimpleVT, DAG, DL, true));
41553       return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
41554     }
41555   }
41556 
41557   // If we have SSE[12] support, try to form min/max nodes. SSE min/max
41558   // instructions match the semantics of the common C idiom x<y?x:y but not
41559   // x<=y?x:y, because of how they handle negative zero (which can be
41560   // ignored in unsafe-math mode).
41561   // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
41562   if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
41563       VT != MVT::f80 && VT != MVT::f128 &&
41564       (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
41565       (Subtarget.hasSSE2() ||
41566        (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
41567     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
41568 
41569     unsigned Opcode = 0;
41570     // Check for x CC y ? x : y.
41571     if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
41572         DAG.isEqualTo(RHS, Cond.getOperand(1))) {
41573       switch (CC) {
41574       default: break;
41575       case ISD::SETULT:
41576         // Converting this to a min would handle NaNs incorrectly, and swapping
41577         // the operands would cause it to handle comparisons between positive
41578         // and negative zero incorrectly.
41579         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
41580           if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
41581               !(DAG.isKnownNeverZeroFloat(LHS) ||
41582                 DAG.isKnownNeverZeroFloat(RHS)))
41583             break;
41584           std::swap(LHS, RHS);
41585         }
41586         Opcode = X86ISD::FMIN;
41587         break;
41588       case ISD::SETOLE:
41589         // Converting this to a min would handle comparisons between positive
41590         // and negative zero incorrectly.
41591         if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
41592             !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
41593           break;
41594         Opcode = X86ISD::FMIN;
41595         break;
41596       case ISD::SETULE:
41597         // Converting this to a min would handle both negative zeros and NaNs
41598         // incorrectly, but we can swap the operands to fix both.
41599         std::swap(LHS, RHS);
41600         LLVM_FALLTHROUGH;
41601       case ISD::SETOLT:
41602       case ISD::SETLT:
41603       case ISD::SETLE:
41604         Opcode = X86ISD::FMIN;
41605         break;
41606 
41607       case ISD::SETOGE:
41608         // Converting this to a max would handle comparisons between positive
41609         // and negative zero incorrectly.
41610         if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
41611             !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
41612           break;
41613         Opcode = X86ISD::FMAX;
41614         break;
41615       case ISD::SETUGT:
41616         // Converting this to a max would handle NaNs incorrectly, and swapping
41617         // the operands would cause it to handle comparisons between positive
41618         // and negative zero incorrectly.
41619         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
41620           if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
41621               !(DAG.isKnownNeverZeroFloat(LHS) ||
41622                 DAG.isKnownNeverZeroFloat(RHS)))
41623             break;
41624           std::swap(LHS, RHS);
41625         }
41626         Opcode = X86ISD::FMAX;
41627         break;
41628       case ISD::SETUGE:
41629         // Converting this to a max would handle both negative zeros and NaNs
41630         // incorrectly, but we can swap the operands to fix both.
41631         std::swap(LHS, RHS);
41632         LLVM_FALLTHROUGH;
41633       case ISD::SETOGT:
41634       case ISD::SETGT:
41635       case ISD::SETGE:
41636         Opcode = X86ISD::FMAX;
41637         break;
41638       }
41639     // Check for x CC y ? y : x -- a min/max with reversed arms.
41640     } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
41641                DAG.isEqualTo(RHS, Cond.getOperand(0))) {
41642       switch (CC) {
41643       default: break;
41644       case ISD::SETOGE:
41645         // Converting this to a min would handle comparisons between positive
41646         // and negative zero incorrectly, and swapping the operands would
41647         // cause it to handle NaNs incorrectly.
41648         if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
41649             !(DAG.isKnownNeverZeroFloat(LHS) ||
41650               DAG.isKnownNeverZeroFloat(RHS))) {
41651           if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
41652             break;
41653           std::swap(LHS, RHS);
41654         }
41655         Opcode = X86ISD::FMIN;
41656         break;
41657       case ISD::SETUGT:
41658         // Converting this to a min would handle NaNs incorrectly.
41659         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
41660           break;
41661         Opcode = X86ISD::FMIN;
41662         break;
41663       case ISD::SETUGE:
41664         // Converting this to a min would handle both negative zeros and NaNs
41665         // incorrectly, but we can swap the operands to fix both.
41666         std::swap(LHS, RHS);
41667         LLVM_FALLTHROUGH;
41668       case ISD::SETOGT:
41669       case ISD::SETGT:
41670       case ISD::SETGE:
41671         Opcode = X86ISD::FMIN;
41672         break;
41673 
41674       case ISD::SETULT:
41675         // Converting this to a max would handle NaNs incorrectly.
41676         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
41677           break;
41678         Opcode = X86ISD::FMAX;
41679         break;
41680       case ISD::SETOLE:
41681         // Converting this to a max would handle comparisons between positive
41682         // and negative zero incorrectly, and swapping the operands would
41683         // cause it to handle NaNs incorrectly.
41684         if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
41685             !DAG.isKnownNeverZeroFloat(LHS) &&
41686             !DAG.isKnownNeverZeroFloat(RHS)) {
41687           if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
41688             break;
41689           std::swap(LHS, RHS);
41690         }
41691         Opcode = X86ISD::FMAX;
41692         break;
41693       case ISD::SETULE:
41694         // Converting this to a max would handle both negative zeros and NaNs
41695         // incorrectly, but we can swap the operands to fix both.
41696         std::swap(LHS, RHS);
41697         LLVM_FALLTHROUGH;
41698       case ISD::SETOLT:
41699       case ISD::SETLT:
41700       case ISD::SETLE:
41701         Opcode = X86ISD::FMAX;
41702         break;
41703       }
41704     }
41705 
41706     if (Opcode)
41707       return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
41708   }
41709 
41710   // Some mask scalar intrinsics rely on checking if only one bit is set
41711   // and implement it in C code like this:
41712   // A[0] = (U & 1) ? A[0] : W[0];
41713   // This creates some redundant instructions that break pattern matching.
41714   // fold (select (setcc (and (X, 1), 0, seteq), Y, Z)) -> select(and(X, 1),Z,Y)
41715   if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
41716       Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
41717     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
41718     SDValue AndNode = Cond.getOperand(0);
41719     if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
41720         isNullConstant(Cond.getOperand(1)) &&
41721         isOneConstant(AndNode.getOperand(1))) {
41722       // LHS and RHS are swapped because the setcc outputs 1 when the AND
41723       // result is 0, and vice versa.
41724       AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8);
41725       return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
41726     }
41727   }
41728 
41729   // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
41730   // lowering on KNL. In this case we convert it to
41731   // v16i8 (select v16i8, v16i8, v16i8) and use an AVX instruction.
41732   // The same situation applies to all vectors of i8 and i16 without BWI.
41733   // Make sure we extend these even before type legalization gets a chance to
41734   // split wide vectors.
41735   // Since SKX, these selects have a proper lowering.
41736   if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
41737       CondVT.getVectorElementType() == MVT::i1 &&
41738       (VT.getVectorElementType() == MVT::i8 ||
41739        VT.getVectorElementType() == MVT::i16)) {
41740     Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
41741     return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
41742   }
41743 
41744   // AVX512 - Extend select with zero to merge with target shuffle.
41745   // select(mask, extract_subvector(shuffle(x)), zero) -->
41746   // extract_subvector(select(insert_subvector(mask), shuffle(x), zero))
41747   // TODO - support non target shuffles as well.
41748   if (Subtarget.hasAVX512() && CondVT.isVector() &&
41749       CondVT.getVectorElementType() == MVT::i1) {
41750     auto SelectableOp = [&TLI](SDValue Op) {
41751       return Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
41752              isTargetShuffle(Op.getOperand(0).getOpcode()) &&
41753              isNullConstant(Op.getOperand(1)) &&
41754              TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
41755              Op.hasOneUse() && Op.getOperand(0).hasOneUse();
41756     };
41757 
41758     bool SelectableLHS = SelectableOp(LHS);
41759     bool SelectableRHS = SelectableOp(RHS);
41760     bool ZeroLHS = ISD::isBuildVectorAllZeros(LHS.getNode());
41761     bool ZeroRHS = ISD::isBuildVectorAllZeros(RHS.getNode());
41762 
41763     if ((SelectableLHS && ZeroRHS) || (SelectableRHS && ZeroLHS)) {
41764       EVT SrcVT = SelectableLHS ? LHS.getOperand(0).getValueType()
41765                                 : RHS.getOperand(0).getValueType();
41766       EVT SrcCondVT = SrcVT.changeVectorElementType(MVT::i1);
41767       LHS = insertSubVector(DAG.getUNDEF(SrcVT), LHS, 0, DAG, DL,
41768                             VT.getSizeInBits());
41769       RHS = insertSubVector(DAG.getUNDEF(SrcVT), RHS, 0, DAG, DL,
41770                             VT.getSizeInBits());
41771       Cond = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcCondVT,
41772                          DAG.getUNDEF(SrcCondVT), Cond,
41773                          DAG.getIntPtrConstant(0, DL));
41774       SDValue Res = DAG.getSelect(DL, SrcVT, Cond, LHS, RHS);
41775       return extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
41776     }
41777   }
41778 
41779   if (SDValue V = combineSelectOfTwoConstants(N, DAG))
41780     return V;
41781 
41782   // Canonicalize min/max:
41783   // (x > 0) ? x : 0 -> (x >= 0) ? x : 0
41784   // (x < -1) ? x : -1 -> (x <= -1) ? x : -1
41785   // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
41786   // the need for an extra compare against zero. e.g.
41787   // (a - b) > 0 ? (a - b) : 0 -> (a - b) >= 0 ? (a - b) : 0
41788   // subl   %esi, %edi
41789   // testl  %edi, %edi
41790   // movl   $0, %eax
41791   // cmovgl %edi, %eax
41792   // =>
41793   // xorl   %eax, %eax
41794   // subl   %esi, %edi
41795   // cmovsl %eax, %edi
41796   //
41797   // We can also canonicalize
41798   //  (x s> 1) ? x : 1 -> (x s>= 1) ? x : 1 -> (x s> 0) ? x : 1
41799   //  (x u> 1) ? x : 1 -> (x u>= 1) ? x : 1 -> (x != 0) ? x : 1
41800   // This allows the use of a test instruction for the compare.
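  // e.g. for (x u> 1) ? x : 1 on i32, roughly:
  // cmpl    $1, %edi
  // movl    $1, %eax
  // cmoval  %edi, %eax
  // =>
  // testl   %edi, %edi
  // movl    $1, %eax
  // cmovnel %edi, %eax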
41801   if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
41802       Cond.hasOneUse() &&
41803       LHS == Cond.getOperand(0) && RHS == Cond.getOperand(1)) {
41804     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
41805     if ((CC == ISD::SETGT && (isNullConstant(RHS) || isOneConstant(RHS))) ||
41806         (CC == ISD::SETLT && isAllOnesConstant(RHS))) {
41807       ISD::CondCode NewCC = CC == ISD::SETGT ? ISD::SETGE : ISD::SETLE;
41808       Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
41809                           Cond.getOperand(0), Cond.getOperand(1), NewCC);
41810       return DAG.getSelect(DL, VT, Cond, LHS, RHS);
41811     }
41812     if (CC == ISD::SETUGT && isOneConstant(RHS)) {
41813       ISD::CondCode NewCC = ISD::SETUGE;
41814       Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
41815                           Cond.getOperand(0), Cond.getOperand(1), NewCC);
41816       return DAG.getSelect(DL, VT, Cond, LHS, RHS);
41817     }
41818   }
41819 
41820   // Check if the first operand is all zeros and Cond type is vXi1.
41821   // If this is an avx512 target we can improve the use of zero masking by
41822   // swapping the operands and inverting the condition.
41823   if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&
41824        Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 &&
41825       ISD::isBuildVectorAllZeros(LHS.getNode()) &&
41826       !ISD::isBuildVectorAllZeros(RHS.getNode())) {
41827     // Invert the cond to not(cond) : xor(op,allones)=not(op)
41828     SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
41829     // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
41830     return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
41831   }
41832 
41833   // Early exit check
41834   if (!TLI.isTypeLegal(VT))
41835     return SDValue();
41836 
41837   if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
41838     return V;
41839 
41840   if (SDValue V = combineVSelectToBLENDV(N, DAG, DCI, Subtarget))
41841     return V;
41842 
41843   if (SDValue V = narrowVectorSelect(N, DAG, Subtarget))
41844     return V;
41845 
41846   // select(~Cond, X, Y) -> select(Cond, Y, X)
41847   if (CondVT.getScalarType() != MVT::i1) {
41848     if (SDValue CondNot = IsNOT(Cond, DAG))
41849       return DAG.getNode(N->getOpcode(), DL, VT,
41850                          DAG.getBitcast(CondVT, CondNot), RHS, LHS);
41851     // pcmpgt(X, -1) -> pcmpgt(0, X) to help select/blendv just use the signbit.
41852     if (Cond.getOpcode() == X86ISD::PCMPGT && Cond.hasOneUse() &&
41853         ISD::isBuildVectorAllOnes(Cond.getOperand(1).getNode())) {
41854       Cond = DAG.getNode(X86ISD::PCMPGT, DL, CondVT,
41855                          DAG.getConstant(0, DL, CondVT), Cond.getOperand(0));
41856       return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
41857     }
41858   }
41859 
41860   // Try to optimize vXi1 selects if both operands are either all constants or
41861   // bitcasts from scalar integer type. In that case we can convert the operands
41862   // to integer and use an integer select which will be converted to a CMOV.
41863   // We need to take a little bit of care to avoid creating an i64 type after
41864   // type legalization.
41865   if (N->getOpcode() == ISD::SELECT && VT.isVector() &&
41866       VT.getVectorElementType() == MVT::i1 &&
41867       (DCI.isBeforeLegalize() || (VT != MVT::v64i1 || Subtarget.is64Bit()))) {
41868     EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
41869     bool LHSIsConst = ISD::isBuildVectorOfConstantSDNodes(LHS.getNode());
41870     bool RHSIsConst = ISD::isBuildVectorOfConstantSDNodes(RHS.getNode());
41871 
41872     if ((LHSIsConst ||
41873          (LHS.getOpcode() == ISD::BITCAST &&
41874           LHS.getOperand(0).getValueType() == IntVT)) &&
41875         (RHSIsConst ||
41876          (RHS.getOpcode() == ISD::BITCAST &&
41877           RHS.getOperand(0).getValueType() == IntVT))) {
41878       if (LHSIsConst)
41879         LHS = combinevXi1ConstantToInteger(LHS, DAG);
41880       else
41881         LHS = LHS.getOperand(0);
41882 
41883       if (RHSIsConst)
41884         RHS = combinevXi1ConstantToInteger(RHS, DAG);
41885       else
41886         RHS = RHS.getOperand(0);
41887 
41888       SDValue Select = DAG.getSelect(DL, IntVT, Cond, LHS, RHS);
41889       return DAG.getBitcast(VT, Select);
41890     }
41891   }
41892 
41893   // If this is "((X & C) == 0) ? Y : Z" and C is a constant mask vector of
41894   // single bits, then invert the predicate and swap the select operands.
41895   // This can lower using a vector shift bit-hack rather than mask and compare.
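  // e.g. if C is a splat of 4:
  //   vselect ((X & 4) == 0), Y, Z  -->  vselect ((X & 4) != 0), Z, Y
  // and if C is a non-splat mask such as <1, 2, 4, 8>, the code below shifts
  // each mask bit up to the sign bit so a (shl X, C') s< 0 compare can feed a
  // BLENDV-style select.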
41896   if (DCI.isBeforeLegalize() && !Subtarget.hasAVX512() &&
41897       N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
41898       Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1 &&
41899       Cond.getOperand(0).getOpcode() == ISD::AND &&
41900       isNullOrNullSplat(Cond.getOperand(1)) &&
41901       cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
41902       Cond.getOperand(0).getValueType() == VT) {
41903     // The 'and' mask must be composed of power-of-2 constants.
41904     SDValue And = Cond.getOperand(0);
41905     auto *C = isConstOrConstSplat(And.getOperand(1));
41906     if (C && C->getAPIntValue().isPowerOf2()) {
41907       // vselect (X & C == 0), LHS, RHS --> vselect (X & C != 0), RHS, LHS
41908       SDValue NotCond =
41909           DAG.getSetCC(DL, CondVT, And, Cond.getOperand(1), ISD::SETNE);
41910       return DAG.getSelect(DL, VT, NotCond, RHS, LHS);
41911     }
41912 
41913     // If we have a non-splat but still powers-of-2 mask, AVX1 can use pmulld
41914     // and AVX2 can use vpsllv{dq}. 8-bit lacks a proper shift or multiply.
41915     // 16-bit lacks a proper blendv.
41916     unsigned EltBitWidth = VT.getScalarSizeInBits();
41917     bool CanShiftBlend =
41918         TLI.isTypeLegal(VT) && ((Subtarget.hasAVX() && EltBitWidth == 32) ||
41919                                 (Subtarget.hasAVX2() && EltBitWidth == 64) ||
41920                                 (Subtarget.hasXOP()));
41921     if (CanShiftBlend &&
41922         ISD::matchUnaryPredicate(And.getOperand(1), [](ConstantSDNode *C) {
41923           return C->getAPIntValue().isPowerOf2();
41924         })) {
41925       // Create a left-shift constant to get the mask bits over to the sign-bit.
41926       SDValue Mask = And.getOperand(1);
41927       SmallVector<int, 32> ShlVals;
41928       for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
41929         auto *MaskVal = cast<ConstantSDNode>(Mask.getOperand(i));
41930         ShlVals.push_back(EltBitWidth - 1 -
41931                           MaskVal->getAPIntValue().exactLogBase2());
41932       }
41933       // vsel ((X & C) == 0), LHS, RHS --> vsel ((shl X, C') < 0), RHS, LHS
41934       SDValue ShlAmt = getConstVector(ShlVals, VT.getSimpleVT(), DAG, DL);
41935       SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And.getOperand(0), ShlAmt);
41936       SDValue NewCond =
41937           DAG.getSetCC(DL, CondVT, Shl, Cond.getOperand(1), ISD::SETLT);
41938       return DAG.getSelect(DL, VT, NewCond, RHS, LHS);
41939     }
41940   }
41941 
41942   return SDValue();
41943 }
41944 
41945 /// Combine:
41946 ///   (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
41947 /// to:
41948 ///   (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
41949 /// i.e., reusing the EFLAGS produced by the LOCKed instruction.
41950 /// Note that this is only legal for some op/cc combinations.
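/// For example, roughly:
///   movl       $1, %eax
///   lock xaddl %eax, (%rdi)
///   testl      %eax, %eax
///   js         .LBB0_2
/// =>
///   lock addl  $1, (%rdi)
///   jle        .LBB0_2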
41951 static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
41952                                        SelectionDAG &DAG,
41953                                        const X86Subtarget &Subtarget) {
41954   // This combine only operates on CMP-like nodes.
41955   if (!(Cmp.getOpcode() == X86ISD::CMP ||
41956         (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
41957     return SDValue();
41958 
41959   // Can't replace the cmp if it has more uses than the one we're looking at.
41960   // FIXME: We would like to be able to handle this, but would need to make sure
41961   // all uses were updated.
41962   if (!Cmp.hasOneUse())
41963     return SDValue();
41964 
41965   // This only applies to variations of the common case:
41966   //   (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
41967   //   (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
41968   //   (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
41969   //   (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
41970   // Using the proper condcodes (see below), overflow is checked for.
41971 
41972   // FIXME: We can generalize both constraints:
41973   // - XOR/OR/AND (if they were made to survive AtomicExpand)
41974   // - LHS != 1
41975   // if the result is compared.
41976 
41977   SDValue CmpLHS = Cmp.getOperand(0);
41978   SDValue CmpRHS = Cmp.getOperand(1);
41979   EVT CmpVT = CmpLHS.getValueType();
41980 
41981   if (!CmpLHS.hasOneUse())
41982     return SDValue();
41983 
41984   unsigned Opc = CmpLHS.getOpcode();
41985   if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
41986     return SDValue();
41987 
41988   SDValue OpRHS = CmpLHS.getOperand(2);
41989   auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
41990   if (!OpRHSC)
41991     return SDValue();
41992 
41993   APInt Addend = OpRHSC->getAPIntValue();
41994   if (Opc == ISD::ATOMIC_LOAD_SUB)
41995     Addend = -Addend;
41996 
41997   auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
41998   if (!CmpRHSC)
41999     return SDValue();
42000 
42001   APInt Comparison = CmpRHSC->getAPIntValue();
42002   APInt NegAddend = -Addend;
42003 
42004   // If the addend is the negation of the comparison value, then we can do
42005   // a full comparison by emitting the atomic arithmetic as a locked sub.
42006   if (Comparison == NegAddend) {
42007     // The CC is fine, but we need to rewrite the LHS of the comparison as an
42008     // atomic sub.
42009     auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
42010     auto AtomicSub = DAG.getAtomic(
42011         ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpVT,
42012         /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
42013         /*RHS*/ DAG.getConstant(NegAddend, SDLoc(CmpRHS), CmpVT),
42014         AN->getMemOperand());
42015     auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget);
42016     DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
42017     DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
42018     return LockOp;
42019   }
42020 
42021   // We can handle comparisons with zero in a number of cases by manipulating
42022   // the CC used.
42023   if (!Comparison.isNullValue())
42024     return SDValue();
42025 
42026   if (CC == X86::COND_S && Addend == 1)
42027     CC = X86::COND_LE;
42028   else if (CC == X86::COND_NS && Addend == 1)
42029     CC = X86::COND_G;
42030   else if (CC == X86::COND_G && Addend == -1)
42031     CC = X86::COND_GE;
42032   else if (CC == X86::COND_LE && Addend == -1)
42033     CC = X86::COND_L;
42034   else
42035     return SDValue();
42036 
42037   SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
42038   DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
42039   DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
42040   return LockOp;
42041 }
42042 
42043 // Check whether a boolean test is testing a boolean value generated by
42044 // X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
42045 // code.
42046 //
42047 // Simplify the following patterns:
42048 // (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
42049 // (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
42050 // to (Op EFLAGS Cond)
42051 //
42052 // (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
42053 // (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
42054 // to (Op EFLAGS !Cond)
42055 //
42056 // where Op could be BRCOND or CMOV.
42057 //
42058 static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
42059   // This combine only operates on CMP-like nodes.
42060   if (!(Cmp.getOpcode() == X86ISD::CMP ||
42061         (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
42062     return SDValue();
42063 
42064   // Quit if not used as a boolean value.
42065   if (CC != X86::COND_E && CC != X86::COND_NE)
42066     return SDValue();
42067 
42068   // Check the CMP operands. One of them should be 0 or 1 and the other should
42069   // be a SetCC or a value extended from one.
42070   SDValue Op1 = Cmp.getOperand(0);
42071   SDValue Op2 = Cmp.getOperand(1);
42072 
42073   SDValue SetCC;
42074   const ConstantSDNode* C = nullptr;
42075   bool needOppositeCond = (CC == X86::COND_E);
42076   bool checkAgainstTrue = false; // Is it a comparison against 1?
42077 
42078   if ((C = dyn_cast<ConstantSDNode>(Op1)))
42079     SetCC = Op2;
42080   else if ((C = dyn_cast<ConstantSDNode>(Op2)))
42081     SetCC = Op1;
42082   else // Quit if neither operand is a constant.
42083     return SDValue();
42084 
42085   if (C->getZExtValue() == 1) {
42086     needOppositeCond = !needOppositeCond;
42087     checkAgainstTrue = true;
42088   } else if (C->getZExtValue() != 0)
42089     // Quit if the constant is neither 0 nor 1.
42090     return SDValue();
42091 
42092   bool truncatedToBoolWithAnd = false;
42093   // Skip (zext $x), (trunc $x), or (and $x, 1) node.
42094   while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
42095          SetCC.getOpcode() == ISD::TRUNCATE ||
42096          SetCC.getOpcode() == ISD::AND) {
42097     if (SetCC.getOpcode() == ISD::AND) {
42098       int OpIdx = -1;
42099       if (isOneConstant(SetCC.getOperand(0)))
42100         OpIdx = 1;
42101       if (isOneConstant(SetCC.getOperand(1)))
42102         OpIdx = 0;
42103       if (OpIdx < 0)
42104         break;
42105       SetCC = SetCC.getOperand(OpIdx);
42106       truncatedToBoolWithAnd = true;
42107     } else
42108       SetCC = SetCC.getOperand(0);
42109   }
42110 
42111   switch (SetCC.getOpcode()) {
42112   case X86ISD::SETCC_CARRY:
42113     // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
42114     // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
42115     // i.e. it's a comparison against true but the result of SETCC_CARRY is not
42116     // truncated to i1 using 'and'.
42117     if (checkAgainstTrue && !truncatedToBoolWithAnd)
42118       break;
42119     assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
42120            "Invalid use of SETCC_CARRY!");
42121     LLVM_FALLTHROUGH;
42122   case X86ISD::SETCC:
42123     // Set the condition code or opposite one if necessary.
42124     CC = X86::CondCode(SetCC.getConstantOperandVal(0));
42125     if (needOppositeCond)
42126       CC = X86::GetOppositeBranchCondition(CC);
42127     return SetCC.getOperand(1);
42128   case X86ISD::CMOV: {
42129     // Check whether the false/true values are canonical, i.e. 0 or 1.
42130     ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
42131     ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
42132     // Quit if true value is not a constant.
42133     if (!TVal)
42134       return SDValue();
42135     // Quit if false value is not a constant.
42136     if (!FVal) {
42137       SDValue Op = SetCC.getOperand(0);
42138       // Skip 'zext' or 'trunc' node.
42139       if (Op.getOpcode() == ISD::ZERO_EXTEND ||
42140           Op.getOpcode() == ISD::TRUNCATE)
42141         Op = Op.getOperand(0);
42142       // A special case for rdrand/rdseed, where 0 is set if false cond is
42143       // found.
42144       if ((Op.getOpcode() != X86ISD::RDRAND &&
42145            Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
42146         return SDValue();
42147     }
42148     // Quit if false value is not the constant 0 or 1.
42149     bool FValIsFalse = true;
42150     if (FVal && FVal->getZExtValue() != 0) {
42151       if (FVal->getZExtValue() != 1)
42152         return SDValue();
42153       // If FVal is 1, opposite cond is needed.
42154       needOppositeCond = !needOppositeCond;
42155       FValIsFalse = false;
42156     }
42157     // Quit if TVal is not the constant opposite of FVal.
42158     if (FValIsFalse && TVal->getZExtValue() != 1)
42159       return SDValue();
42160     if (!FValIsFalse && TVal->getZExtValue() != 0)
42161       return SDValue();
42162     CC = X86::CondCode(SetCC.getConstantOperandVal(2));
42163     if (needOppositeCond)
42164       CC = X86::GetOppositeBranchCondition(CC);
42165     return SetCC.getOperand(3);
42166   }
42167   }
42168 
42169   return SDValue();
42170 }
42171 
42172 /// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
42173 /// Match:
42174 ///   (X86or (X86setcc) (X86setcc))
42175 ///   (X86cmp (and (X86setcc) (X86setcc)), 0)
42176 static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
42177                                            X86::CondCode &CC1, SDValue &Flags,
42178                                            bool &isAnd) {
42179   if (Cond->getOpcode() == X86ISD::CMP) {
42180     if (!isNullConstant(Cond->getOperand(1)))
42181       return false;
42182 
42183     Cond = Cond->getOperand(0);
42184   }
42185 
42186   isAnd = false;
42187 
42188   SDValue SetCC0, SetCC1;
42189   switch (Cond->getOpcode()) {
42190   default: return false;
42191   case ISD::AND:
42192   case X86ISD::AND:
42193     isAnd = true;
42194     LLVM_FALLTHROUGH;
42195   case ISD::OR:
42196   case X86ISD::OR:
42197     SetCC0 = Cond->getOperand(0);
42198     SetCC1 = Cond->getOperand(1);
42199     break;
42200   }
42201 
42202   // Make sure we have SETCC nodes, using the same flags value.
42203   if (SetCC0.getOpcode() != X86ISD::SETCC ||
42204       SetCC1.getOpcode() != X86ISD::SETCC ||
42205       SetCC0->getOperand(1) != SetCC1->getOperand(1))
42206     return false;
42207 
42208   CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
42209   CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
42210   Flags = SetCC0->getOperand(1);
42211   return true;
42212 }
42213 
42214 // When legalizing carry, we create carries via add X, -1
42215 // If that comes from an actual carry, via setcc, we use the
42216 // carry directly.
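// e.g. (X86add (zext (X86setcc COND_B, Flags)), -1) rebuilds CF by adding -1
// to a 0/1 value; when that 0/1 value already came from reading CF, we can
// use the original Flags directly instead of the recreated carry.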
42217 static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
42218   if (EFLAGS.getOpcode() == X86ISD::ADD) {
42219     if (isAllOnesConstant(EFLAGS.getOperand(1))) {
42220       SDValue Carry = EFLAGS.getOperand(0);
42221       while (Carry.getOpcode() == ISD::TRUNCATE ||
42222              Carry.getOpcode() == ISD::ZERO_EXTEND ||
42223              Carry.getOpcode() == ISD::SIGN_EXTEND ||
42224              Carry.getOpcode() == ISD::ANY_EXTEND ||
42225              (Carry.getOpcode() == ISD::AND &&
42226               isOneConstant(Carry.getOperand(1))))
42227         Carry = Carry.getOperand(0);
42228       if (Carry.getOpcode() == X86ISD::SETCC ||
42229           Carry.getOpcode() == X86ISD::SETCC_CARRY) {
42230         // TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB?
42231         uint64_t CarryCC = Carry.getConstantOperandVal(0);
42232         SDValue CarryOp1 = Carry.getOperand(1);
42233         if (CarryCC == X86::COND_B)
42234           return CarryOp1;
42235         if (CarryCC == X86::COND_A) {
42236           // Try to convert COND_A into COND_B in an attempt to facilitate
42237           // materializing "setb reg".
42238           //
42239           // Do not flip "e > c", where "c" is a constant, because the Cmp
42240           // instruction cannot take an immediate as its first operand.
42241           //
42242           if (CarryOp1.getOpcode() == X86ISD::SUB &&
42243               CarryOp1.getNode()->hasOneUse() &&
42244               CarryOp1.getValueType().isInteger() &&
42245               !isa<ConstantSDNode>(CarryOp1.getOperand(1))) {
42246             SDValue SubCommute =
42247                 DAG.getNode(X86ISD::SUB, SDLoc(CarryOp1), CarryOp1->getVTList(),
42248                             CarryOp1.getOperand(1), CarryOp1.getOperand(0));
42249             return SDValue(SubCommute.getNode(), CarryOp1.getResNo());
42250           }
42251         }
42252         // If this is a check of the z flag of an add with 1, switch to the
42253         // C flag.
42254         if (CarryCC == X86::COND_E &&
42255             CarryOp1.getOpcode() == X86ISD::ADD &&
42256             isOneConstant(CarryOp1.getOperand(1)))
42257           return CarryOp1;
42258       }
42259     }
42260   }
42261 
42262   return SDValue();
42263 }
42264 
42265 /// If we are inverting an PTEST/TESTP operand, attempt to adjust the CC
42266 /// to avoid the inversion.
42267 static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
42268                               SelectionDAG &DAG,
42269                               const X86Subtarget &Subtarget) {
42270   // TODO: Handle X86ISD::KTEST/X86ISD::KORTEST.
42271   if (EFLAGS.getOpcode() != X86ISD::PTEST &&
42272       EFLAGS.getOpcode() != X86ISD::TESTP)
42273     return SDValue();
42274 
42275   // PTEST/TESTP sets EFLAGS as:
42276   // TESTZ: ZF = (Op0 & Op1) == 0
42277   // TESTC: CF = (~Op0 & Op1) == 0
42278   // TESTNZC: ZF == 0 && CF == 0
42279   EVT VT = EFLAGS.getValueType();
42280   SDValue Op0 = EFLAGS.getOperand(0);
42281   SDValue Op1 = EFLAGS.getOperand(1);
42282   EVT OpVT = Op0.getValueType();
42283 
42284   // TEST*(~X,Y) == TEST*(X,Y)
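  // (The flags swap: ZF(~X,Y) = ((~X & Y) == 0) is CF(X,Y), and
  //  CF(~X,Y) = ((X & Y) == 0) is ZF(X,Y), hence the CC remapping below.)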
42285   if (SDValue NotOp0 = IsNOT(Op0, DAG)) {
42286     X86::CondCode InvCC;
42287     switch (CC) {
42288     case X86::COND_B:
42289       // testc -> testz.
42290       InvCC = X86::COND_E;
42291       break;
42292     case X86::COND_AE:
42293       // !testc -> !testz.
42294       InvCC = X86::COND_NE;
42295       break;
42296     case X86::COND_E:
42297       // testz -> testc.
42298       InvCC = X86::COND_B;
42299       break;
42300     case X86::COND_NE:
42301       // !testz -> !testc.
42302       InvCC = X86::COND_AE;
42303       break;
42304     case X86::COND_A:
42305     case X86::COND_BE:
42306       // testnzc -> testnzc (no change).
42307       InvCC = CC;
42308       break;
42309     default:
42310       InvCC = X86::COND_INVALID;
42311       break;
42312     }
42313 
42314     if (InvCC != X86::COND_INVALID) {
42315       CC = InvCC;
42316       return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
42317                          DAG.getBitcast(OpVT, NotOp0), Op1);
42318     }
42319   }
42320 
42321   if (CC == X86::COND_E || CC == X86::COND_NE) {
42322     // TESTZ(X,~Y) == TESTC(Y,X)
42323     if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
42324       CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
42325       return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
42326                          DAG.getBitcast(OpVT, NotOp1), Op0);
42327     }
42328 
42329     if (Op0 == Op1) {
42330       SDValue BC = peekThroughBitcasts(Op0);
42331       EVT BCVT = BC.getValueType();
42332       assert(BCVT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(BCVT) &&
42333              "Unexpected vector type");
42334 
42335       // TESTZ(AND(X,Y),AND(X,Y)) == TESTZ(X,Y)
42336       if (BC.getOpcode() == ISD::AND || BC.getOpcode() == X86ISD::FAND) {
42337         return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
42338                            DAG.getBitcast(OpVT, BC.getOperand(0)),
42339                            DAG.getBitcast(OpVT, BC.getOperand(1)));
42340       }
42341 
42342       // TESTZ(AND(~X,Y),AND(~X,Y)) == TESTC(X,Y)
42343       if (BC.getOpcode() == X86ISD::ANDNP || BC.getOpcode() == X86ISD::FANDN) {
42344         CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
42345         return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
42346                            DAG.getBitcast(OpVT, BC.getOperand(0)),
42347                            DAG.getBitcast(OpVT, BC.getOperand(1)));
42348       }
42349 
42350       // If every element is an all-sign value, see if we can use MOVMSK to
42351       // more efficiently extract the sign bits and compare that.
42352       // TODO: Handle TESTC with comparison inversion.
42353       // TODO: Can we remove SimplifyMultipleUseDemandedBits and rely on
42354     // MOVMSK combines to make sure it's never worse than PTEST?
42355       unsigned EltBits = BCVT.getScalarSizeInBits();
42356       if (DAG.ComputeNumSignBits(BC) == EltBits) {
42357         assert(VT == MVT::i32 && "Expected i32 EFLAGS comparison result");
42358         APInt SignMask = APInt::getSignMask(EltBits);
42359         const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42360         if (SDValue Res =
42361                 TLI.SimplifyMultipleUseDemandedBits(BC, SignMask, DAG)) {
42362           // For vXi16 cases we need to use pmovmskb and extract every other
42363           // sign bit.
42364           SDLoc DL(EFLAGS);
42365           if (EltBits == 16) {
42366             MVT MovmskVT = BCVT.is128BitVector() ? MVT::v16i8 : MVT::v32i8;
42367             Res = DAG.getBitcast(MovmskVT, Res);
42368             Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
42369             Res = DAG.getNode(ISD::AND, DL, MVT::i32, Res,
42370                               DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
42371           } else {
42372             Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
42373           }
42374           return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Res,
42375                              DAG.getConstant(0, DL, MVT::i32));
42376         }
42377       }
42378     }
42379 
42380     // TESTZ(-1,X) == TESTZ(X,X)
42381     if (ISD::isBuildVectorAllOnes(Op0.getNode()))
42382       return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op1);
42383 
42384     // TESTZ(X,-1) == TESTZ(X,X)
42385     if (ISD::isBuildVectorAllOnes(Op1.getNode()))
42386       return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0);
42387   }
42388 
42389   return SDValue();
42390 }
42391 
42392 // Attempt to simplify the MOVMSK input based on the comparison type.
42393 static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
42394                                   SelectionDAG &DAG,
42395                                   const X86Subtarget &Subtarget) {
42396   // Handle eq/ne against zero (any_of).
42397   // Handle eq/ne against -1 (all_of).
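  // i.e. patterns roughly of the form:
  //   any_of: (X86cmp (movmsk V), 0)               - "is any sign bit set?"
  //   all_of: (X86sub (movmsk V), all-lanes-mask)  - "are all sign bits set?"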
42398   if (!(CC == X86::COND_E || CC == X86::COND_NE))
42399     return SDValue();
42400   if (EFLAGS.getValueType() != MVT::i32)
42401     return SDValue();
42402   unsigned CmpOpcode = EFLAGS.getOpcode();
42403   if (CmpOpcode != X86ISD::CMP && CmpOpcode != X86ISD::SUB)
42404     return SDValue();
42405   auto *CmpConstant = dyn_cast<ConstantSDNode>(EFLAGS.getOperand(1));
42406   if (!CmpConstant)
42407     return SDValue();
42408   const APInt &CmpVal = CmpConstant->getAPIntValue();
42409 
42410   SDValue CmpOp = EFLAGS.getOperand(0);
42411   unsigned CmpBits = CmpOp.getValueSizeInBits();
42412   assert(CmpBits == CmpVal.getBitWidth() && "Value size mismatch");
42413 
42414   // Peek through any truncate.
42415   if (CmpOp.getOpcode() == ISD::TRUNCATE)
42416     CmpOp = CmpOp.getOperand(0);
42417 
42418   // Bail if we don't find a MOVMSK.
42419   if (CmpOp.getOpcode() != X86ISD::MOVMSK)
42420     return SDValue();
42421 
42422   SDValue Vec = CmpOp.getOperand(0);
42423   MVT VecVT = Vec.getSimpleValueType();
42424   assert((VecVT.is128BitVector() || VecVT.is256BitVector()) &&
42425          "Unexpected MOVMSK operand");
42426   unsigned NumElts = VecVT.getVectorNumElements();
42427   unsigned NumEltBits = VecVT.getScalarSizeInBits();
42428 
42429   bool IsAnyOf = CmpOpcode == X86ISD::CMP && CmpVal.isNullValue();
42430   bool IsAllOf = CmpOpcode == X86ISD::SUB && NumElts <= CmpBits &&
42431                  CmpVal.isMask(NumElts);
42432   if (!IsAnyOf && !IsAllOf)
42433     return SDValue();
42434 
42435   // See if we can peek through to a vector with a wider element type, if the
42436   // signbits extend down to all the sub-elements as well.
42437   // Calling MOVMSK with the wider type, avoiding the bitcast, helps expose
42438   // potential SimplifyDemandedBits/Elts cases.
42439   if (Vec.getOpcode() == ISD::BITCAST) {
42440     SDValue BC = peekThroughBitcasts(Vec);
42441     MVT BCVT = BC.getSimpleValueType();
42442     unsigned BCNumElts = BCVT.getVectorNumElements();
42443     unsigned BCNumEltBits = BCVT.getScalarSizeInBits();
42444     if ((BCNumEltBits == 32 || BCNumEltBits == 64) &&
42445         BCNumEltBits > NumEltBits &&
42446         DAG.ComputeNumSignBits(BC) > (BCNumEltBits - NumEltBits)) {
42447       SDLoc DL(EFLAGS);
42448       unsigned CmpMask = IsAnyOf ? 0 : ((1 << BCNumElts) - 1);
42449       return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
42450                          DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, BC),
42451                          DAG.getConstant(CmpMask, DL, MVT::i32));
42452     }
42453   }
42454 
42455   // MOVMSK(PCMPEQ(X,0)) == -1 -> PTESTZ(X,X).
42456   // MOVMSK(PCMPEQ(X,0)) != -1 -> !PTESTZ(X,X).
42457   if (IsAllOf && Subtarget.hasSSE41()) {
42458     SDValue BC = peekThroughBitcasts(Vec);
42459     if (BC.getOpcode() == X86ISD::PCMPEQ &&
42460         ISD::isBuildVectorAllZeros(BC.getOperand(1).getNode())) {
42461       MVT TestVT = VecVT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
42462       SDValue V = DAG.getBitcast(TestVT, BC.getOperand(0));
42463       return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
42464     }
42465   }
42466 
42467   // See if we can avoid a PACKSS by calling MOVMSK on the sources.
42468   // For vXi16 cases we can use a v2Xi8 PMOVMSKB. We must mask out
42469   // sign bits prior to the comparison with zero unless we know that
42470   // the vXi16 splats the sign bit down to the lower i8 half.
42471   // TODO: Handle all_of patterns.
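  // (After bitcasting the vXi16 source to vXi8, each i16 lane supplies two
  //  PMOVMSKB bits and its sign bit lands in the odd (high byte) position,
  //  which is why the 0xAAAA / 0xAAAAAAAA masks below keep every other bit.)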
42472   if (Vec.getOpcode() == X86ISD::PACKSS && VecVT == MVT::v16i8) {
42473     SDValue VecOp0 = Vec.getOperand(0);
42474     SDValue VecOp1 = Vec.getOperand(1);
42475     bool SignExt0 = DAG.ComputeNumSignBits(VecOp0) > 8;
42476     bool SignExt1 = DAG.ComputeNumSignBits(VecOp1) > 8;
42477     // PMOVMSKB(PACKSSBW(X, undef)) -> PMOVMSKB(BITCAST_v16i8(X)) & 0xAAAA.
42478     if (IsAnyOf && CmpBits == 8 && VecOp1.isUndef()) {
42479       SDLoc DL(EFLAGS);
42480       SDValue Result = DAG.getBitcast(MVT::v16i8, VecOp0);
42481       Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
42482       Result = DAG.getZExtOrTrunc(Result, DL, MVT::i16);
42483       if (!SignExt0) {
42484         Result = DAG.getNode(ISD::AND, DL, MVT::i16, Result,
42485                              DAG.getConstant(0xAAAA, DL, MVT::i16));
42486       }
42487       return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
42488                          DAG.getConstant(0, DL, MVT::i16));
42489     }
42490     // PMOVMSKB(PACKSSBW(LO(X), HI(X)))
42491     // -> PMOVMSKB(BITCAST_v32i8(X)) & 0xAAAAAAAA.
42492     if (CmpBits >= 16 && Subtarget.hasInt256() &&
42493         VecOp0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
42494         VecOp1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
42495         VecOp0.getOperand(0) == VecOp1.getOperand(0) &&
42496         VecOp0.getConstantOperandAPInt(1) == 0 &&
42497         VecOp1.getConstantOperandAPInt(1) == 8 &&
42498         (IsAnyOf || (SignExt0 && SignExt1))) {
42499       SDLoc DL(EFLAGS);
42500       SDValue Result = DAG.getBitcast(MVT::v32i8, VecOp0.getOperand(0));
42501       Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
42502       unsigned CmpMask = IsAnyOf ? 0 : 0xFFFFFFFF;
42503       if (!SignExt0 || !SignExt1) {
42504         assert(IsAnyOf && "Only perform v16i16 signmasks for any_of patterns");
42505         Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result,
42506                              DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
42507       }
42508       return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
42509                          DAG.getConstant(CmpMask, DL, MVT::i32));
42510     }
42511   }
42512 
42513   // MOVMSK(SHUFFLE(X,u)) -> MOVMSK(X) iff every element is referenced.
42514   SmallVector<int, 32> ShuffleMask;
42515   SmallVector<SDValue, 2> ShuffleInputs;
42516   if (NumElts <= CmpBits &&
42517       getTargetShuffleInputs(peekThroughBitcasts(Vec), ShuffleInputs,
42518                              ShuffleMask, DAG) &&
42519       ShuffleInputs.size() == 1 && !isAnyZeroOrUndef(ShuffleMask) &&
42520       ShuffleInputs[0].getValueSizeInBits() == VecVT.getSizeInBits()) {
42521     unsigned NumShuffleElts = ShuffleMask.size();
42522     APInt DemandedElts = APInt::getNullValue(NumShuffleElts);
42523     for (int M : ShuffleMask) {
42524       assert(0 <= M && M < (int)NumShuffleElts && "Bad unary shuffle index");
42525       DemandedElts.setBit(M);
42526     }
42527     if (DemandedElts.isAllOnesValue()) {
42528       SDLoc DL(EFLAGS);
42529       SDValue Result = DAG.getBitcast(VecVT, ShuffleInputs[0]);
42530       Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
42531       Result =
42532           DAG.getZExtOrTrunc(Result, DL, EFLAGS.getOperand(0).getValueType());
42533       return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
42534                          EFLAGS.getOperand(1));
42535     }
42536   }
42537 
42538   return SDValue();
42539 }
42540 
42541 /// Optimize an EFLAGS definition used according to the condition code \p CC
42542 /// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
42543 /// uses of chain values.
42544 static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
42545                                   SelectionDAG &DAG,
42546                                   const X86Subtarget &Subtarget) {
42547   if (CC == X86::COND_B)
42548     if (SDValue Flags = combineCarryThroughADD(EFLAGS, DAG))
42549       return Flags;
42550 
42551   if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
42552     return R;
42553 
42554   if (SDValue R = combinePTESTCC(EFLAGS, CC, DAG, Subtarget))
42555     return R;
42556 
42557   if (SDValue R = combineSetCCMOVMSK(EFLAGS, CC, DAG, Subtarget))
42558     return R;
42559 
42560   return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
42561 }
42562 
42563 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
42564 static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
42565                            TargetLowering::DAGCombinerInfo &DCI,
42566                            const X86Subtarget &Subtarget) {
42567   SDLoc DL(N);
42568 
42569   SDValue FalseOp = N->getOperand(0);
42570   SDValue TrueOp = N->getOperand(1);
42571   X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
42572   SDValue Cond = N->getOperand(3);
42573 
42574   // cmov X, X, ?, ? --> X
42575   if (TrueOp == FalseOp)
42576     return TrueOp;
42577 
42578   // Try to simplify the EFLAGS and condition code operands.
42579   // We can't always do this as FCMOV only supports a subset of X86 cond.
42580   if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
42581     if (!(FalseOp.getValueType() == MVT::f80 ||
42582           (FalseOp.getValueType() == MVT::f64 && !Subtarget.hasSSE2()) ||
42583           (FalseOp.getValueType() == MVT::f32 && !Subtarget.hasSSE1())) ||
42584         !Subtarget.hasCMov() || hasFPCMov(CC)) {
42585       SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8),
42586                        Flags};
42587       return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
42588     }
42589   }
42590 
42591   // If this is a select between two integer constants, try to do some
42592   // optimizations.  Note that the operands are ordered the opposite of SELECT
42593   // operands.
42594   if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
42595     if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
42596       // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
42597       // larger than FalseC (the false value).
42598       if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
42599         CC = X86::GetOppositeBranchCondition(CC);
42600         std::swap(TrueC, FalseC);
42601         std::swap(TrueOp, FalseOp);
42602       }
42603 
42604       // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3.  Likewise for any pow2/0.
42605       // This is efficient for any integer data type (including i8/i16) and
42606       // shift amount.
42607       if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
42608         Cond = getSETCC(CC, Cond, DL, DAG);
42609 
42610         // Zero extend the condition if needed.
42611         Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
42612 
42613         unsigned ShAmt = TrueC->getAPIntValue().logBase2();
42614         Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
42615                            DAG.getConstant(ShAmt, DL, MVT::i8));
42616         return Cond;
42617       }
42618 
42619       // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst.  This is efficient
42620       // for any integer data type, including i8/i16.
42621       if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
42622         Cond = getSETCC(CC, Cond, DL, DAG);
42623 
42624         // Zero extend the condition if needed.
42625         Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
42626                            FalseC->getValueType(0), Cond);
42627         Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
42628                            SDValue(FalseC, 0));
42629         return Cond;
42630       }
42631 
42632       // Optimize cases that will turn into an LEA instruction.  This requires
42633       // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
42634       if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
42635         APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
42636         assert(Diff.getBitWidth() == N->getValueType(0).getSizeInBits() &&
42637                "Implicit constant truncation");
42638 
42639         bool isFastMultiplier = false;
42640         if (Diff.ult(10)) {
42641           switch (Diff.getZExtValue()) {
42642           default: break;
42643           case 1:  // result = add base, cond
42644           case 2:  // result = lea base(    , cond*2)
42645           case 3:  // result = lea base(cond, cond*2)
42646           case 4:  // result = lea base(    , cond*4)
42647           case 5:  // result = lea base(cond, cond*4)
42648           case 8:  // result = lea base(    , cond*8)
42649           case 9:  // result = lea base(cond, cond*8)
42650             isFastMultiplier = true;
42651             break;
42652           }
42653         }
42654 
42655         if (isFastMultiplier) {
42656           Cond = getSETCC(CC, Cond, DL ,DAG);
42657           // Zero extend the condition if needed.
42658           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
42659                              Cond);
42660           // Scale the condition by the difference.
42661           if (Diff != 1)
42662             Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
42663                                DAG.getConstant(Diff, DL, Cond.getValueType()));
42664 
42665           // Add the base if non-zero.
42666           if (FalseC->getAPIntValue() != 0)
42667             Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
42668                                SDValue(FalseC, 0));
42669           return Cond;
42670         }
42671       }
42672     }
42673   }
42674 
42675   // Handle these cases:
42676   //   (select (x != c), e, c) -> (select (x != c), e, x)
42677   //   (select (x == c), c, e) -> (select (x == c), x, e)
42678   // where the c is an integer constant, and the "select" is the combination
42679   // of CMOV and CMP.
42680   //
42681   // The rationale for this change is that the conditional-move from a constant
42682   // needs two instructions, however, conditional-move from a register needs
42683   // only one instruction.
42684   //
42685   // CAVEAT: By replacing a constant with a symbolic value, it may obscure
42686   //  some instruction-combining opportunities. This opt needs to be
42687   //  postponed as late as possible.
42688   //
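  // e.g. for (select (x == 42), 42, e) the true arm can reuse x itself,
  // avoiding a separate "movl $42, %reg" just to feed the CMOV.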
42689   if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
42690     // the DCI.xxxx conditions are provided to postpone the optimization as
42691     // late as possible.
42692 
42693     ConstantSDNode *CmpAgainst = nullptr;
42694     if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
42695         (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
42696         !isa<ConstantSDNode>(Cond.getOperand(0))) {
42697 
42698       if (CC == X86::COND_NE &&
42699           CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
42700         CC = X86::GetOppositeBranchCondition(CC);
42701         std::swap(TrueOp, FalseOp);
42702       }
42703 
42704       if (CC == X86::COND_E &&
42705           CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
42706         SDValue Ops[] = {FalseOp, Cond.getOperand(0),
42707                          DAG.getTargetConstant(CC, DL, MVT::i8), Cond};
42708         return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
42709       }
42710     }
42711   }
42712 
42713   // Fold and/or of setcc's to double CMOV:
42714   //   (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
42715   //   (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
42716   //
42717   // This combine lets us generate:
42718   //   cmovcc1 (jcc1 if we don't have CMOV)
42719   //   cmovcc2 (same)
42720   // instead of:
42721   //   setcc1
42722   //   setcc2
42723   //   and/or
42724   //   cmovne (jne if we don't have CMOV)
42725   // When we can't use the CMOV instruction, it might increase branch
42726   // mispredicts.
42727   // When we can use CMOV, or when there is no mispredict, this improves
42728   // throughput and reduces register pressure.
42729   //
42730   if (CC == X86::COND_NE) {
42731     SDValue Flags;
42732     X86::CondCode CC0, CC1;
42733     bool isAndSetCC;
42734     if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
42735       if (isAndSetCC) {
42736         std::swap(FalseOp, TrueOp);
42737         CC0 = X86::GetOppositeBranchCondition(CC0);
42738         CC1 = X86::GetOppositeBranchCondition(CC1);
42739       }
42740 
42741       SDValue LOps[] = {FalseOp, TrueOp,
42742                         DAG.getTargetConstant(CC0, DL, MVT::i8), Flags};
42743       SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps);
42744       SDValue Ops[] = {LCMOV, TrueOp, DAG.getTargetConstant(CC1, DL, MVT::i8),
42745                        Flags};
42746       SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
42747       return CMOV;
42748     }
42749   }
42750 
42751   // Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) ->
42752   //      (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2)
42753   // Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) ->
42754   //    (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2)
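  // e.g. with C1 = 8 and C2 = 3:
  //   (CMOV 8, (ADD (CTTZ X), 3), (X != 0))
  // becomes
  //   (ADD (CMOV 5, (CTTZ X), (X != 0)), 3)
  // pulling the constant add out so only the raw CTTZ result is selected.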
42755   if ((CC == X86::COND_NE || CC == X86::COND_E) &&
42756       Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) {
42757     SDValue Add = TrueOp;
42758     SDValue Const = FalseOp;
42759     // Canonicalize the condition code for easier matching and output.
42760     if (CC == X86::COND_E)
42761       std::swap(Add, Const);
42762 
42763     // We might have replaced the constant in the cmov with the LHS of the
42764     // compare. If so change it to the RHS of the compare.
42765     if (Const == Cond.getOperand(0))
42766       Const = Cond.getOperand(1);
42767 
42768     // Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant.
42769     if (isa<ConstantSDNode>(Const) && Add.getOpcode() == ISD::ADD &&
42770         Add.hasOneUse() && isa<ConstantSDNode>(Add.getOperand(1)) &&
42771         (Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
42772          Add.getOperand(0).getOpcode() == ISD::CTTZ) &&
42773         Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) {
42774       EVT VT = N->getValueType(0);
42775       // This should constant fold.
42776       SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1));
42777       SDValue CMov =
42778           DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0),
42779                       DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8), Cond);
42780       return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1));
42781     }
42782   }
42783 
42784   return SDValue();
42785 }
42786 
42787 /// Different mul shrinking modes.
42788 enum class ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
42789 
42790 static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
42791   EVT VT = N->getOperand(0).getValueType();
42792   if (VT.getScalarSizeInBits() != 32)
42793     return false;
42794 
42795   assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
42796   unsigned SignBits[2] = {1, 1};
42797   bool IsPositive[2] = {false, false};
42798   for (unsigned i = 0; i < 2; i++) {
42799     SDValue Opd = N->getOperand(i);
42800 
42801     SignBits[i] = DAG.ComputeNumSignBits(Opd);
42802     IsPositive[i] = DAG.SignBitIsZero(Opd);
42803   }
42804 
42805   bool AllPositive = IsPositive[0] && IsPositive[1];
42806   unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
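  // For i32 elements, N sign bits means the value fits in (32 - N + 1) signed
  // bits, so 25 sign bits -> fits in i8 and 17 sign bits -> fits in i16.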
42807   // When ranges are from -128 ~ 127, use MULS8 mode.
42808   if (MinSignBits >= 25)
42809     Mode = ShrinkMode::MULS8;
42810   // When ranges are from 0 ~ 255, use MULU8 mode.
42811   else if (AllPositive && MinSignBits >= 24)
42812     Mode = ShrinkMode::MULU8;
42813   // When ranges are from -32768 ~ 32767, use MULS16 mode.
42814   else if (MinSignBits >= 17)
42815     Mode = ShrinkMode::MULS16;
42816   // When ranges are from 0 ~ 65535, use MULU16 mode.
42817   else if (AllPositive && MinSignBits >= 16)
42818     Mode = ShrinkMode::MULU16;
42819   else
42820     return false;
42821   return true;
42822 }
42823 
42824 /// When the operands of vector mul are extended from smaller size values,
42825 /// like i8 and i16, the type of the mul may be shrunk to generate more
42826 /// efficient code. Two typical patterns are handled:
42827 /// Pattern1:
42828 ///     %2 = sext/zext <N x i8> %1 to <N x i32>
42829 ///     %4 = sext/zext <N x i8> %3 to <N x i32>
42830 ///  or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
42831 ///     %5 = mul <N x i32> %2, %4
42832 ///
42833 /// Pattern2:
42834 ///     %2 = zext/sext <N x i16> %1 to <N x i32>
42835 ///     %4 = zext/sext <N x i16> %3 to <N x i32>
42836 ///  or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
42837 ///     %5 = mul <N x i32> %2, %4
42838 ///
42839 /// There are four mul shrinking modes:
42840 /// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
42841 /// -128 to 127, and the scalar value range of %4 is also -128 to 127,
42842 /// generate pmullw+sext32 for it (MULS8 mode).
42843 /// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
42844 /// 0 to 255, and the scalar value range of %4 is also 0 to 255,
42845 /// generate pmullw+zext32 for it (MULU8 mode).
42846 /// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
42847 /// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
42848 /// generate pmullw+pmulhw for it (MULS16 mode).
42849 /// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
42850 /// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
42851 /// generate pmullw+pmulhuw for it (MULU16 mode).
42852 static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
42853                                const X86Subtarget &Subtarget) {
42854   // Check for legality
42855   // pmullw/pmulhw require SSE2.
42856   if (!Subtarget.hasSSE2())
42857     return SDValue();
42858 
42859   // Check for profitability
42860   // pmulld is supported since SSE41. It is better to use pmulld
42861   // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
42862   // the expansion.
42863   bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize();
42864   if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
42865     return SDValue();
42866 
42867   ShrinkMode Mode;
42868   if (!canReduceVMulWidth(N, DAG, Mode))
42869     return SDValue();
42870 
42871   SDLoc DL(N);
42872   SDValue N0 = N->getOperand(0);
42873   SDValue N1 = N->getOperand(1);
42874   EVT VT = N->getOperand(0).getValueType();
42875   unsigned NumElts = VT.getVectorNumElements();
42876   if ((NumElts % 2) != 0)
42877     return SDValue();
42878 
42879   EVT ReducedVT = VT.changeVectorElementType(MVT::i16);
42880 
42881   // Shrink the operands of mul.
42882   SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
42883   SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
42884 
42885   // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
42886   // lower part is needed.
42887   SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
42888   if (Mode == ShrinkMode::MULU8 || Mode == ShrinkMode::MULS8)
42889     return DAG.getNode((Mode == ShrinkMode::MULU8) ? ISD::ZERO_EXTEND
42890                                                    : ISD::SIGN_EXTEND,
42891                        DL, VT, MulLo);
42892 
42893   EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts / 2);
42894   // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
42895   // the higher part is also needed.
42896   SDValue MulHi =
42897       DAG.getNode(Mode == ShrinkMode::MULS16 ? ISD::MULHS : ISD::MULHU, DL,
42898                   ReducedVT, NewN0, NewN1);
42899 
42900   // Repack the lower part and higher part result of mul into a wider
42901   // result.
42902   // Generate shuffle functioning as punpcklwd.
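  // e.g. for NumElts == 8 this builds the mask <0,8, 1,9, 2,10, 3,11>, and the
  // second loop below builds <4,12, 5,13, 6,14, 7,15> for the high halves.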
42903   SmallVector<int, 16> ShuffleMask(NumElts);
42904   for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
42905     ShuffleMask[2 * i] = i;
42906     ShuffleMask[2 * i + 1] = i + NumElts;
42907   }
42908   SDValue ResLo =
42909       DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
42910   ResLo = DAG.getBitcast(ResVT, ResLo);
42911   // Generate shuffle functioning as punpckhwd.
42912   for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
42913     ShuffleMask[2 * i] = i + NumElts / 2;
42914     ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
42915   }
42916   SDValue ResHi =
42917       DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
42918   ResHi = DAG.getBitcast(ResVT, ResHi);
42919   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
42920 }
42921 
42922 static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
42923                                  EVT VT, const SDLoc &DL) {
42924 
42925   auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
42926     SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
42927                                  DAG.getConstant(Mult, DL, VT));
42928     Result = DAG.getNode(ISD::SHL, DL, VT, Result,
42929                          DAG.getConstant(Shift, DL, MVT::i8));
42930     Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
42931                          N->getOperand(0));
42932     return Result;
42933   };
42934 
42935   auto combineMulMulAddOrSub = [&](int Mul1, int Mul2, bool isAdd) {
42936     SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
42937                                  DAG.getConstant(Mul1, DL, VT));
42938     Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, Result,
42939                          DAG.getConstant(Mul2, DL, VT));
42940     Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
42941                          N->getOperand(0));
42942     return Result;
42943   };
42944 
42945   switch (MulAmt) {
42946   default:
42947     break;
42948   case 11:
42949     // mul x, 11 => add ((shl (mul x, 5), 1), x)
42950     return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
42951   case 21:
42952     // mul x, 21 => add ((shl (mul x, 5), 2), x)
42953     return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
42954   case 41:
42955     // mul x, 41 => add ((shl (mul x, 5), 3), x)
42956     return combineMulShlAddOrSub(5, 3, /*isAdd*/ true);
42957   case 22:
42958     // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
42959     return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
42960                        combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
42961   case 19:
42962     // mul x, 19 => add ((shl (mul x, 9), 1), x)
42963     return combineMulShlAddOrSub(9, 1, /*isAdd*/ true);
42964   case 37:
42965     // mul x, 37 => add ((shl (mul x, 9), 2), x)
42966     return combineMulShlAddOrSub(9, 2, /*isAdd*/ true);
42967   case 73:
42968     // mul x, 73 => add ((shl (mul x, 9), 3), x)
42969     return combineMulShlAddOrSub(9, 3, /*isAdd*/ true);
42970   case 13:
42971     // mul x, 13 => add ((shl (mul x, 3), 2), x)
42972     return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
42973   case 23:
42974     // mul x, 23 => sub ((shl (mul x, 3), 3), x)
42975     return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
42976   case 26:
42977     // mul x, 26 => add ((mul (mul x, 5), 5), x)
42978     return combineMulMulAddOrSub(5, 5, /*isAdd*/ true);
42979   case 28:
42980     // mul x, 28 => add ((mul (mul x, 9), 3), x)
42981     return combineMulMulAddOrSub(9, 3, /*isAdd*/ true);
42982   case 29:
42983     // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
42984     return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
42985                        combineMulMulAddOrSub(9, 3, /*isAdd*/ true));
42986   }
42987 
42988   // Another trick. If this is a power of 2 plus 2/4/8, we can use a shift
42989   // followed by a single LEA.
42990   // First check if this is a sum of two powers of 2 because that's easy.
42991   // Then count the trailing zeros to find the smaller power of 2 (the LEA scale).
42992   // TODO: We can do this even without LEA at a cost of two shifts and an add.
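  // For example, MulAmt == 20 (16 + 4) gives ShiftAmt == 4 and ScaleShift == 2,
  // producing (X << 4) + (X << 2); the scaled term can then be folded into an
  // LEA (base + 4*index), so the multiply becomes a shift plus a single LEA.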
42993   if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
42994     unsigned ScaleShift = countTrailingZeros(MulAmt);
42995     if (ScaleShift >= 1 && ScaleShift < 4) {
42996       unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
42997       SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
42998                                    DAG.getConstant(ShiftAmt, DL, MVT::i8));
42999       SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
43000                                    DAG.getConstant(ScaleShift, DL, MVT::i8));
43001       return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2);
43002     }
43003   }
43004 
43005   return SDValue();
43006 }
43007 
43008 // If the upper 17 bits of each element are zero then we can use PMADDWD,
43009 // which is always at least as quick as PMULLD, except on KNL.
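// With the upper 17 bits clear, each i32 lane holds a value below 2^15, so the
// high i16 half of the lane is zero and the low i16 half is non-negative. The
// per-lane PMADDWD result (lo*lo + hi*hi) then reduces to a single signed
// 16x16->32 product that equals the full 32-bit multiply.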
43010 static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
43011                                    const X86Subtarget &Subtarget) {
43012   if (!Subtarget.hasSSE2())
43013     return SDValue();
43014 
43015   if (Subtarget.isPMADDWDSlow())
43016     return SDValue();
43017 
43018   EVT VT = N->getValueType(0);
43019 
43020   // Only support vXi32 vectors.
43021   if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
43022     return SDValue();
43023 
43024   // Make sure the type is legal or will be widened to a legal type.
43025   if (VT != MVT::v2i32 && !DAG.getTargetLoweringInfo().isTypeLegal(VT))
43026     return SDValue();
43027 
43028   MVT WVT = MVT::getVectorVT(MVT::i16, 2 * VT.getVectorNumElements());
43029 
43030   // Without BWI, we would need to split v32i16.
43031   if (WVT == MVT::v32i16 && !Subtarget.hasBWI())
43032     return SDValue();
43033 
43034   SDValue N0 = N->getOperand(0);
43035   SDValue N1 = N->getOperand(1);
43036 
43037   // If we are zero extending in two steps without SSE4.1, it's better to
43038   // reduce the vmul width instead.
43039   if (!Subtarget.hasSSE41() &&
43040       (N0.getOpcode() == ISD::ZERO_EXTEND &&
43041        N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
43042       (N1.getOpcode() == ISD::ZERO_EXTEND &&
43043        N1.getOperand(0).getScalarValueSizeInBits() <= 8))
43044     return SDValue();
43045 
43046   APInt Mask17 = APInt::getHighBitsSet(32, 17);
43047   if (!DAG.MaskedValueIsZero(N1, Mask17) ||
43048       !DAG.MaskedValueIsZero(N0, Mask17))
43049     return SDValue();
43050 
43051   // Use SplitOpsAndApply to handle AVX splitting.
43052   auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
43053                            ArrayRef<SDValue> Ops) {
43054     MVT OpVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
43055     return DAG.getNode(X86ISD::VPMADDWD, DL, OpVT, Ops);
43056   };
43057   return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
43058                           { DAG.getBitcast(WVT, N0), DAG.getBitcast(WVT, N1) },
43059                           PMADDWDBuilder);
43060 }
43061 
43062 static SDValue combineMulToPMULDQ(SDNode *N, SelectionDAG &DAG,
43063                                   const X86Subtarget &Subtarget) {
43064   if (!Subtarget.hasSSE2())
43065     return SDValue();
43066 
43067   EVT VT = N->getValueType(0);
43068 
43069   // Only support vXi64 vectors.
43070   if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 ||
43071       VT.getVectorNumElements() < 2 ||
43072       !isPowerOf2_32(VT.getVectorNumElements()))
43073     return SDValue();
43074 
43075   SDValue N0 = N->getOperand(0);
43076   SDValue N1 = N->getOperand(1);
43077 
43078   // PMULDQ returns the 64-bit result of the signed multiplication of the
43079   // lower 32 bits. We can lower with this if the sign bits stretch that far.
43080   if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
43081       DAG.ComputeNumSignBits(N1) > 32) {
43082     auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
43083                             ArrayRef<SDValue> Ops) {
43084       return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops);
43085     };
43086     return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
43087                             PMULDQBuilder, /*CheckBWI*/false);
43088   }
43089 
43090   // If the upper bits are zero we can use a single pmuludq.
43091   APInt Mask = APInt::getHighBitsSet(64, 32);
43092   if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) {
43093     auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
43094                              ArrayRef<SDValue> Ops) {
43095       return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops);
43096     };
43097     return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
43098                             PMULUDQBuilder, /*CheckBWI*/false);
43099   }
43100 
43101   return SDValue();
43102 }
43103 
43104 /// Optimize a single multiply with constant into two operations in order to
43105 /// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
43106 static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
43107                           TargetLowering::DAGCombinerInfo &DCI,
43108                           const X86Subtarget &Subtarget) {
43109   EVT VT = N->getValueType(0);
43110 
43111   if (SDValue V = combineMulToPMADDWD(N, DAG, Subtarget))
43112     return V;
43113 
43114   if (SDValue V = combineMulToPMULDQ(N, DAG, Subtarget))
43115     return V;
43116 
43117   if (DCI.isBeforeLegalize() && VT.isVector())
43118     return reduceVMULWidth(N, DAG, Subtarget);
43119 
43120   if (!MulConstantOptimization)
43121     return SDValue();
43122   // An imul is usually smaller than the alternative sequence.
43123   if (DAG.getMachineFunction().getFunction().hasMinSize())
43124     return SDValue();
43125 
43126   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
43127     return SDValue();
43128 
43129   if (VT != MVT::i64 && VT != MVT::i32)
43130     return SDValue();
43131 
43132   ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
43133   if (!C)
43134     return SDValue();
43135   if (isPowerOf2_64(C->getZExtValue()))
43136     return SDValue();
43137 
43138   int64_t SignMulAmt = C->getSExtValue();
43139   assert(SignMulAmt != INT64_MIN && "Int min should have been handled!");
43140   uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;
43141 
43142   SDLoc DL(N);
43143   if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) {
43144     SDValue NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
43145                                  DAG.getConstant(AbsMulAmt, DL, VT));
43146     if (SignMulAmt < 0)
43147       NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
43148                            NewMul);
43149 
43150     return NewMul;
43151   }
43152 
43153   uint64_t MulAmt1 = 0;
43154   uint64_t MulAmt2 = 0;
43155   if ((AbsMulAmt % 9) == 0) {
43156     MulAmt1 = 9;
43157     MulAmt2 = AbsMulAmt / 9;
43158   } else if ((AbsMulAmt % 5) == 0) {
43159     MulAmt1 = 5;
43160     MulAmt2 = AbsMulAmt / 5;
43161   } else if ((AbsMulAmt % 3) == 0) {
43162     MulAmt1 = 3;
43163     MulAmt2 = AbsMulAmt / 3;
43164   }
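  // For example, for a positive amount, 45 = 9 * 5 becomes two MUL_IMMs
  // (typically two LEAs), and 40 = 5 * 8 typically becomes a shift by 3 plus a
  // MUL_IMM by 5 (SHL + LEA).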
43165 
43166   SDValue NewMul;
43167   // For negative multiply amounts, only allow MulAmt2 to be a power of 2.
43168   if (MulAmt2 &&
43169       (isPowerOf2_64(MulAmt2) ||
43170        (SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) {
43171 
43172     if (isPowerOf2_64(MulAmt2) &&
43173         !(SignMulAmt >= 0 && N->hasOneUse() &&
43174           N->use_begin()->getOpcode() == ISD::ADD))
43175       // If the second multiplier is a power of 2, issue it first. We want the
43176       // multiply by 3, 5, or 9 to be folded into the addressing mode unless the
43177       // lone use is an add. Only do this for positive multiply amounts, since
43178       // the negate would prevent it from being used as an addressing mode anyway.
43179       std::swap(MulAmt1, MulAmt2);
43180 
43181     if (isPowerOf2_64(MulAmt1))
43182       NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
43183                            DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
43184     else
43185       NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
43186                            DAG.getConstant(MulAmt1, DL, VT));
43187 
43188     if (isPowerOf2_64(MulAmt2))
43189       NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
43190                            DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
43191     else
43192       NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
43193                            DAG.getConstant(MulAmt2, DL, VT));
43194 
43195     // Negate the result.
43196     if (SignMulAmt < 0)
43197       NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
43198                            NewMul);
43199   } else if (!Subtarget.slowLEA())
43200     NewMul = combineMulSpecial(C->getZExtValue(), N, DAG, VT, DL);
43201 
43202   if (!NewMul) {
43203     assert(C->getZExtValue() != 0 &&
43204            C->getZExtValue() != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&
43205            "Both cases that could cause potential overflows should have "
43206            "already been handled.");
43207     if (isPowerOf2_64(AbsMulAmt - 1)) {
43208       // (mul x, 2^N + 1) => (add (shl x, N), x)
43209       NewMul = DAG.getNode(
43210           ISD::ADD, DL, VT, N->getOperand(0),
43211           DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
43212                       DAG.getConstant(Log2_64(AbsMulAmt - 1), DL,
43213                                       MVT::i8)));
43214       // To negate, subtract the number from zero
43215       if (SignMulAmt < 0)
43216         NewMul = DAG.getNode(ISD::SUB, DL, VT,
43217                              DAG.getConstant(0, DL, VT), NewMul);
43218     } else if (isPowerOf2_64(AbsMulAmt + 1)) {
43219       // (mul x, 2^N - 1) => (sub (shl x, N), x)
43220       NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
43221                            DAG.getConstant(Log2_64(AbsMulAmt + 1),
43222                                            DL, MVT::i8));
43223       // To negate, reverse the operands of the subtract.
43224       if (SignMulAmt < 0)
43225         NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul);
43226       else
43227         NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
43228     } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2)) {
43229       // (mul x, 2^N + 2) => (add (add (shl x, N), x), x)
43230       NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
43231                            DAG.getConstant(Log2_64(AbsMulAmt - 2),
43232                                            DL, MVT::i8));
43233       NewMul = DAG.getNode(ISD::ADD, DL, VT, NewMul, N->getOperand(0));
43234       NewMul = DAG.getNode(ISD::ADD, DL, VT, NewMul, N->getOperand(0));
43235     } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2)) {
43236       // (mul x, 2^N - 2) => (sub (sub (shl x, N), x), x)
43237       NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
43238                            DAG.getConstant(Log2_64(AbsMulAmt + 2),
43239                                            DL, MVT::i8));
43240       NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
43241       NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
43242     }
43243   }
43244 
43245   return NewMul;
43246 }
43247 
43248 // Try to form a MULHU or MULHS node by looking for
43249 // (srl (mul ext, ext), 16)
43250 // TODO: This is X86 specific because we want to be able to handle wide types
43251 // before type legalization. But we can only do it if the vector will be
43252 // legalized via widening/splitting. Type legalization can't handle promotion
43253 // of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
43254 // combiner.
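// For example, (srl (mul (zext v8i16 X), (zext v8i16 Y)), 16) with a v8i32
// result becomes (zext (mulhu X, Y)), which can be selected as PMULHUW.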
43255 static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG,
43256                                    const X86Subtarget &Subtarget) {
43257   assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
43258            "SRL or SRA node is required here!");
43259   SDLoc DL(N);
43260 
43261   // Only do this with SSE4.1. On earlier targets reduceVMULWidth will expand
43262   // the multiply.
43263   if (!Subtarget.hasSSE41())
43264     return SDValue();
43265 
43266   // The operation feeding into the shift must be a multiply.
43267   SDValue ShiftOperand = N->getOperand(0);
43268   if (ShiftOperand.getOpcode() != ISD::MUL || !ShiftOperand.hasOneUse())
43269     return SDValue();
43270 
43271   // Input type should be at least vXi32.
43272   EVT VT = N->getValueType(0);
43273   if (!VT.isVector() || VT.getVectorElementType().getSizeInBits() < 32)
43274     return SDValue();
43275 
43276   // Need a shift by 16.
43277   APInt ShiftAmt;
43278   if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), ShiftAmt) ||
43279       ShiftAmt != 16)
43280     return SDValue();
43281 
43282   SDValue LHS = ShiftOperand.getOperand(0);
43283   SDValue RHS = ShiftOperand.getOperand(1);
43284 
43285   unsigned ExtOpc = LHS.getOpcode();
43286   if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
43287       RHS.getOpcode() != ExtOpc)
43288     return SDValue();
43289 
43290   // Peek through the extends.
43291   LHS = LHS.getOperand(0);
43292   RHS = RHS.getOperand(0);
43293 
43294   // Ensure the input types match.
43295   EVT MulVT = LHS.getValueType();
43296   if (MulVT.getVectorElementType() != MVT::i16 || RHS.getValueType() != MulVT)
43297     return SDValue();
43298 
43299   unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
43300   SDValue Mulh = DAG.getNode(Opc, DL, MulVT, LHS, RHS);
43301 
43302   ExtOpc = N->getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
43303   return DAG.getNode(ExtOpc, DL, VT, Mulh);
43304 }
43305 
43306 static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
43307   SDValue N0 = N->getOperand(0);
43308   SDValue N1 = N->getOperand(1);
43309   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
43310   EVT VT = N0.getValueType();
43311 
43312   // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
43313   // since the result of setcc_c is all zeros or all ones.
43314   if (VT.isInteger() && !VT.isVector() &&
43315       N1C && N0.getOpcode() == ISD::AND &&
43316       N0.getOperand(1).getOpcode() == ISD::Constant) {
43317     SDValue N00 = N0.getOperand(0);
43318     APInt Mask = N0.getConstantOperandAPInt(1);
43319     Mask <<= N1C->getAPIntValue();
43320     bool MaskOK = false;
43321     // We can handle cases concerning bit-widening nodes containing setcc_c if
43322     // we carefully interrogate the mask to make sure the transform is
43323     // semantics preserving.
43324     // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
43325     // of the underlying setcc_c operation if the setcc_c was zero extended.
43326     // Consider the following example:
43327     //   zext(setcc_c)                 -> i32 0x0000FFFF
43328     //   c1                            -> i32 0x0000FFFF
43329     //   c2                            -> i32 0x00000001
43330     //   (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
43331     //   (and setcc_c, (c1 << c2))     -> i32 0x0000FFFE
43332     if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
43333       MaskOK = true;
43334     } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
43335                N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
43336       MaskOK = true;
43337     } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
43338                 N00.getOpcode() == ISD::ANY_EXTEND) &&
43339                N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
43340       MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
43341     }
43342     if (MaskOK && Mask != 0) {
43343       SDLoc DL(N);
43344       return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
43345     }
43346   }
43347 
43348   // Hardware support for vector shifts is sparse, which makes us scalarize the
43349   // vector operations in many cases. Also, on Sandy Bridge ADD is faster than
43350   // shl.
43351   // (shl V, 1) -> add V,V
43352   if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
43353     if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
43354       assert(N0.getValueType().isVector() && "Invalid vector shift type");
43355       // We shift all of the values by one. In many cases we do not have
43356       // hardware support for this operation. This is better expressed as an ADD
43357       // of two values.
43358       if (N1SplatC->isOne())
43359         return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
43360     }
43361 
43362   return SDValue();
43363 }
43364 
43365 static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG,
43366                                            const X86Subtarget &Subtarget) {
43367   SDValue N0 = N->getOperand(0);
43368   SDValue N1 = N->getOperand(1);
43369   EVT VT = N0.getValueType();
43370   unsigned Size = VT.getSizeInBits();
43371 
43372   if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
43373     return V;
43374 
43375   // fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
43376   // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
43377   // into (lshr, (sext (a), SarConst - [56,48,32,24,16]))
43378   // depending on sign of (SarConst - [56,48,32,24,16])
43379 
43380   // Sign extensions on X86 are MOVSX instructions. They have the same code
43381   // size as the SHIFTs above (only a shift by 1 has smaller code size).
43382   // However, the MOVs have two advantages over a SHIFT:
43383   // 1. A MOV can write to a register that differs from the source.
43384   // 2. A MOV accepts memory operands.
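  // For example, on i64: (sra (shl X, 56), 58) becomes
  // (sra (sign_extend_inreg X, i8), 2), i.e. a sign extension (MOVSX) followed
  // by a cheap SAR by 2.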
43385 
43386   if (VT.isVector() || N1.getOpcode() != ISD::Constant ||
43387       N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
43388       N0.getOperand(1).getOpcode() != ISD::Constant)
43389     return SDValue();
43390 
43391   SDValue N00 = N0.getOperand(0);
43392   SDValue N01 = N0.getOperand(1);
43393   APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
43394   APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
43395   EVT CVT = N1.getValueType();
43396 
43397   if (SarConst.isNegative())
43398     return SDValue();
43399 
43400   for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
43401     unsigned ShiftSize = SVT.getSizeInBits();
43402     // Skip types without a corresponding sext/zext and ShlConst values that
43403     // are not one of [56,48,32,24,16].
43404     if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
43405       continue;
43406     SDLoc DL(N);
43407     SDValue NN =
43408         DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
43409     SarConst = SarConst - (Size - ShiftSize);
43410     if (SarConst == 0)
43411       return NN;
43412     else if (SarConst.isNegative())
43413       return DAG.getNode(ISD::SHL, DL, VT, NN,
43414                          DAG.getConstant(-SarConst, DL, CVT));
43415     else
43416       return DAG.getNode(ISD::SRA, DL, VT, NN,
43417                          DAG.getConstant(SarConst, DL, CVT));
43418   }
43419   return SDValue();
43420 }
43421 
43422 static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
43423                                         TargetLowering::DAGCombinerInfo &DCI,
43424                                         const X86Subtarget &Subtarget) {
43425   SDValue N0 = N->getOperand(0);
43426   SDValue N1 = N->getOperand(1);
43427   EVT VT = N0.getValueType();
43428 
43429   if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
43430     return V;
43431 
43432   // Only do this on the last DAG combine as it can interfere with other
43433   // combines.
43434   if (!DCI.isAfterLegalizeDAG())
43435     return SDValue();
43436 
43437   // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
43438   // TODO: This is a generic DAG combine that became an x86-only combine to
43439   // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
43440   // and-not ('andn').
43441   if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
43442     return SDValue();
43443 
43444   auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
43445   auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
43446   if (!ShiftC || !AndC)
43447     return SDValue();
43448 
43449   // If we can shrink the constant mask below 8-bits or 32-bits, then this
43450   // transform should reduce code size. It may also enable secondary transforms
43451   // from improved known-bits analysis or instruction selection.
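  // For example, (srl (and X, 0x7F0), 4) becomes (and (srl X, 4), 0x7F); the
  // new mask fits in 8 bits and gets a shorter immediate encoding.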
43452   APInt MaskVal = AndC->getAPIntValue();
43453 
43454   // If this can be matched by a zero extend, don't optimize.
43455   if (MaskVal.isMask()) {
43456     unsigned TO = MaskVal.countTrailingOnes();
43457     if (TO >= 8 && isPowerOf2_32(TO))
43458       return SDValue();
43459   }
43460 
43461   APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
43462   unsigned OldMaskSize = MaskVal.getMinSignedBits();
43463   unsigned NewMaskSize = NewMaskVal.getMinSignedBits();
43464   if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
43465       (OldMaskSize > 32 && NewMaskSize <= 32)) {
43466     // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
43467     SDLoc DL(N);
43468     SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
43469     SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
43470     return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
43471   }
43472   return SDValue();
43473 }
43474 
43475 static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG,
43476                                          const X86Subtarget &Subtarget) {
43477   unsigned Opcode = N->getOpcode();
43478   assert(isHorizOp(Opcode) && "Unexpected hadd/hsub/pack opcode");
43479 
43480   SDLoc DL(N);
43481   EVT VT = N->getValueType(0);
43482   SDValue N0 = N->getOperand(0);
43483   SDValue N1 = N->getOperand(1);
43484   EVT SrcVT = N0.getValueType();
43485 
43486   SDValue BC0 =
43487       N->isOnlyUserOf(N0.getNode()) ? peekThroughOneUseBitcasts(N0) : N0;
43488   SDValue BC1 =
43489       N->isOnlyUserOf(N1.getNode()) ? peekThroughOneUseBitcasts(N1) : N1;
43490 
43491   // Attempt to fold HOP(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
43492   // to SHUFFLE(HOP(LOSUBVECTOR(X),HISUBVECTOR(X))), this is mainly for
43493   // truncation trees that help us avoid lane crossing shuffles.
43494   // TODO: There's a lot more we can do for PACK/HADD style shuffle combines.
43495   // TODO: We don't handle vXf64 shuffles yet.
43496   if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32 &&
43497       BC0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
43498       BC1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
43499       BC0.getOperand(0) == BC1.getOperand(0) &&
43500       BC0.getOperand(0).getValueType().is256BitVector() &&
43501       BC0.getConstantOperandAPInt(1) == 0 &&
43502       BC1.getConstantOperandAPInt(1) ==
43503           BC0.getValueType().getVectorNumElements()) {
43504     SmallVector<SDValue> ShuffleOps;
43505     SmallVector<int> ShuffleMask, ScaledMask;
43506     SDValue Vec = peekThroughBitcasts(BC0.getOperand(0));
43507     if (getTargetShuffleInputs(Vec, ShuffleOps, ShuffleMask, DAG)) {
43508       resolveTargetShuffleInputsAndMask(ShuffleOps, ShuffleMask);
43509       // To keep the HOP LHS/RHS coherency, we must be able to scale the unary
43510       // shuffle to a v4X64 width - we can probably relax this in the future.
43511       if (!isAnyZero(ShuffleMask) && ShuffleOps.size() == 1 &&
43512           ShuffleOps[0].getValueType().is256BitVector() &&
43513           scaleShuffleElements(ShuffleMask, 4, ScaledMask)) {
43514         SDValue Lo, Hi;
43515         MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
43516         std::tie(Lo, Hi) = DAG.SplitVector(ShuffleOps[0], DL);
43517         Lo = DAG.getBitcast(SrcVT, Lo);
43518         Hi = DAG.getBitcast(SrcVT, Hi);
43519         SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi);
43520         Res = DAG.getBitcast(ShufVT, Res);
43521         Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ScaledMask);
43522         return DAG.getBitcast(VT, Res);
43523       }
43524     }
43525   }
43526 
43527   // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(Z,W)) -> SHUFFLE(HOP()).
43528   if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
43529     // If either/both ops are a shuffle that can scale to v2x64,
43530     // then see if we can perform this as a v4x32 post shuffle.
43531     SmallVector<SDValue> Ops0, Ops1;
43532     SmallVector<int> Mask0, Mask1, ScaledMask0, ScaledMask1;
43533     bool IsShuf0 =
43534         getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
43535         scaleShuffleElements(Mask0, 2, ScaledMask0) &&
43536         all_of(Ops0, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
43537     bool IsShuf1 =
43538         getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
43539         scaleShuffleElements(Mask1, 2, ScaledMask1) &&
43540         all_of(Ops1, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
43541     if (IsShuf0 || IsShuf1) {
43542       if (!IsShuf0) {
43543         Ops0.assign({BC0});
43544         ScaledMask0.assign({0, 1});
43545       }
43546       if (!IsShuf1) {
43547         Ops1.assign({BC1});
43548         ScaledMask1.assign({0, 1});
43549       }
43550 
43551       SDValue LHS, RHS;
43552       int PostShuffle[4] = {-1, -1, -1, -1};
43553       auto FindShuffleOpAndIdx = [&](int M, int &Idx, ArrayRef<SDValue> Ops) {
43554         if (M < 0)
43555           return true;
43556         Idx = M % 2;
43557         SDValue Src = Ops[M / 2];
43558         if (!LHS || LHS == Src) {
43559           LHS = Src;
43560           return true;
43561         }
43562         if (!RHS || RHS == Src) {
43563           Idx += 2;
43564           RHS = Src;
43565           return true;
43566         }
43567         return false;
43568       };
43569       if (FindShuffleOpAndIdx(ScaledMask0[0], PostShuffle[0], Ops0) &&
43570           FindShuffleOpAndIdx(ScaledMask0[1], PostShuffle[1], Ops0) &&
43571           FindShuffleOpAndIdx(ScaledMask1[0], PostShuffle[2], Ops1) &&
43572           FindShuffleOpAndIdx(ScaledMask1[1], PostShuffle[3], Ops1)) {
43573         LHS = DAG.getBitcast(SrcVT, LHS);
43574         RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
43575         MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
43576         SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
43577         Res = DAG.getBitcast(ShufVT, Res);
43578         Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, PostShuffle);
43579         return DAG.getBitcast(VT, Res);
43580       }
43581     }
43582   }
43583 
43584   // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y)).
43585   if (VT.is256BitVector() && Subtarget.hasInt256()) {
43586     SmallVector<int> Mask0, Mask1;
43587     SmallVector<SDValue> Ops0, Ops1;
43588     SmallVector<int, 2> ScaledMask0, ScaledMask1;
43589     if (getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
43590         getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
43591         !Ops0.empty() && !Ops1.empty() &&
43592         all_of(Ops0,
43593                [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
43594         all_of(Ops1,
43595                [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
43596         scaleShuffleElements(Mask0, 2, ScaledMask0) &&
43597         scaleShuffleElements(Mask1, 2, ScaledMask1)) {
43598       SDValue Op00 = peekThroughBitcasts(Ops0.front());
43599       SDValue Op10 = peekThroughBitcasts(Ops1.front());
43600       SDValue Op01 = peekThroughBitcasts(Ops0.back());
43601       SDValue Op11 = peekThroughBitcasts(Ops1.back());
43602       if ((Op00 == Op11) && (Op01 == Op10)) {
43603         std::swap(Op10, Op11);
43604         ShuffleVectorSDNode::commuteMask(ScaledMask1);
43605       }
43606       if ((Op00 == Op10) && (Op01 == Op11)) {
43607         SmallVector<int, 4> ShuffleMask;
43608         ShuffleMask.append(ScaledMask0.begin(), ScaledMask0.end());
43609         ShuffleMask.append(ScaledMask1.begin(), ScaledMask1.end());
43610         MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
43611         SDValue Res = DAG.getNode(Opcode, DL, VT, DAG.getBitcast(SrcVT, Op00),
43612                                   DAG.getBitcast(SrcVT, Op01));
43613         Res = DAG.getBitcast(ShufVT, Res);
43614         Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask);
43615         return DAG.getBitcast(VT, Res);
43616       }
43617     }
43618   }
43619 
43620   return SDValue();
43621 }
43622 
43623 static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
43624                                  TargetLowering::DAGCombinerInfo &DCI,
43625                                  const X86Subtarget &Subtarget) {
43626   unsigned Opcode = N->getOpcode();
43627   assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
43628          "Unexpected pack opcode");
43629 
43630   EVT VT = N->getValueType(0);
43631   SDValue N0 = N->getOperand(0);
43632   SDValue N1 = N->getOperand(1);
43633   unsigned NumDstElts = VT.getVectorNumElements();
43634   unsigned DstBitsPerElt = VT.getScalarSizeInBits();
43635   unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
43636   assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
43637          N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
43638          "Unexpected PACKSS/PACKUS input type");
43639 
43640   bool IsSigned = (X86ISD::PACKSS == Opcode);
43641 
43642   // Constant Folding.
43643   APInt UndefElts0, UndefElts1;
43644   SmallVector<APInt, 32> EltBits0, EltBits1;
43645   if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) &&
43646       (N1.isUndef() || N->isOnlyUserOf(N1.getNode())) &&
43647       getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) &&
43648       getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) {
43649     unsigned NumLanes = VT.getSizeInBits() / 128;
43650     unsigned NumSrcElts = NumDstElts / 2;
43651     unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
43652     unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
43653 
43654     APInt Undefs(NumDstElts, 0);
43655     SmallVector<APInt, 32> Bits(NumDstElts, APInt::getNullValue(DstBitsPerElt));
43656     for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
43657       for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
43658         unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
43659         auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
43660         auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);
43661 
43662         if (UndefElts[SrcIdx]) {
43663           Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
43664           continue;
43665         }
43666 
43667         APInt &Val = EltBits[SrcIdx];
43668         if (IsSigned) {
43669           // PACKSS: Truncate signed value with signed saturation.
43670           // Source values less than dst minint are saturated to minint.
43671           // Source values greater than dst maxint are saturated to maxint.
43672           if (Val.isSignedIntN(DstBitsPerElt))
43673             Val = Val.trunc(DstBitsPerElt);
43674           else if (Val.isNegative())
43675             Val = APInt::getSignedMinValue(DstBitsPerElt);
43676           else
43677             Val = APInt::getSignedMaxValue(DstBitsPerElt);
43678         } else {
43679           // PACKUS: Truncate signed value with unsigned saturation.
43680           // Source values less than zero are saturated to zero.
43681           // Source values greater than dst maxuint are saturated to maxuint.
43682           if (Val.isIntN(DstBitsPerElt))
43683             Val = Val.trunc(DstBitsPerElt);
43684           else if (Val.isNegative())
43685             Val = APInt::getNullValue(DstBitsPerElt);
43686           else
43687             Val = APInt::getAllOnesValue(DstBitsPerElt);
43688         }
43689         Bits[Lane * NumDstEltsPerLane + Elt] = Val;
43690       }
43691     }
43692 
43693     return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
43694   }
43695 
43696   // Try to fold PACK(SHUFFLE(),SHUFFLE()) -> SHUFFLE(PACK()).
43697   if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
43698     return V;
43699 
43700   // Try to combine a PACKUSWB/PACKSSWB-implemented truncate with a regular
43701   // truncate to create a larger truncate.
43702   if (Subtarget.hasAVX512() &&
43703       N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 &&
43704       N0.getOperand(0).getValueType() == MVT::v8i32) {
43705     if ((IsSigned && DAG.ComputeNumSignBits(N0) > 8) ||
43706         (!IsSigned &&
43707          DAG.MaskedValueIsZero(N0, APInt::getHighBitsSet(16, 8)))) {
43708       if (Subtarget.hasVLX())
43709         return DAG.getNode(X86ISD::VTRUNC, SDLoc(N), VT, N0.getOperand(0));
43710 
43711       // Widen input to v16i32 so we can truncate that.
43712       SDLoc dl(N);
43713       SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i32,
43714                                    N0.getOperand(0), DAG.getUNDEF(MVT::v8i32));
43715       return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Concat);
43716     }
43717   }
43718 
43719   // Try to fold PACK(EXTEND(X),EXTEND(Y)) -> CONCAT(X,Y) subvectors.
43720   if (VT.is128BitVector()) {
43721     unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
43722     SDValue Src0, Src1;
43723     if (N0.getOpcode() == ExtOpc &&
43724         N0.getOperand(0).getValueType().is64BitVector() &&
43725         N0.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
43726       Src0 = N0.getOperand(0);
43727     }
43728     if (N1.getOpcode() == ExtOpc &&
43729         N1.getOperand(0).getValueType().is64BitVector() &&
43730         N1.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
43731       Src1 = N1.getOperand(0);
43732     }
43733     if ((Src0 || N0.isUndef()) && (Src1 || N1.isUndef())) {
43734       assert((Src0 || Src1) && "Found PACK(UNDEF,UNDEF)");
43735       Src0 = Src0 ? Src0 : DAG.getUNDEF(Src1.getValueType());
43736       Src1 = Src1 ? Src1 : DAG.getUNDEF(Src0.getValueType());
43737       return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Src0, Src1);
43738     }
43739   }
43740 
43741   // Attempt to combine as shuffle.
43742   SDValue Op(N, 0);
43743   if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
43744     return Res;
43745 
43746   return SDValue();
43747 }
43748 
43749 static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG,
43750                                     TargetLowering::DAGCombinerInfo &DCI,
43751                                     const X86Subtarget &Subtarget) {
43752   assert((X86ISD::HADD == N->getOpcode() || X86ISD::FHADD == N->getOpcode() ||
43753           X86ISD::HSUB == N->getOpcode() || X86ISD::FHSUB == N->getOpcode()) &&
43754          "Unexpected horizontal add/sub opcode");
43755 
43756   if (!shouldUseHorizontalOp(true, DAG, Subtarget)) {
43757     // For slow-hop targets, if we have a hop with a single op, see if we already
43758     // have another user that we can reuse and shuffle the result.
43759     MVT VT = N->getSimpleValueType(0);
43760     SDValue LHS = N->getOperand(0);
43761     SDValue RHS = N->getOperand(1);
43762     if (VT.is128BitVector() && LHS == RHS) {
43763       for (SDNode *User : LHS->uses()) {
43764         if (User != N && User->getOpcode() == N->getOpcode()) {
43765           MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
43766           if (User->getOperand(0) == LHS && !User->getOperand(1).isUndef()) {
43767             return DAG.getBitcast(
43768                 VT,
43769                 DAG.getVectorShuffle(ShufVT, SDLoc(N),
43770                                      DAG.getBitcast(ShufVT, SDValue(User, 0)),
43771                                      DAG.getUNDEF(ShufVT), {0, 1, 0, 1}));
43772           }
43773           if (User->getOperand(1) == LHS && !User->getOperand(0).isUndef()) {
43774             return DAG.getBitcast(
43775                 VT,
43776                 DAG.getVectorShuffle(ShufVT, SDLoc(N),
43777                                      DAG.getBitcast(ShufVT, SDValue(User, 0)),
43778                                      DAG.getUNDEF(ShufVT), {2, 3, 2, 3}));
43779           }
43780         }
43781       }
43782     }
43783 
43784     // HOP(HOP'(X,X),HOP'(Y,Y)) -> HOP(PERMUTE(HOP'(X,Y)),PERMUTE(HOP'(X,Y))).
43785     if (LHS != RHS && LHS.getOpcode() == N->getOpcode() &&
43786         LHS.getOpcode() == RHS.getOpcode() &&
43787         LHS.getValueType() == RHS.getValueType()) {
43788       SDValue LHS0 = LHS.getOperand(0);
43789       SDValue RHS0 = LHS.getOperand(1);
43790       SDValue LHS1 = RHS.getOperand(0);
43791       SDValue RHS1 = RHS.getOperand(1);
43792       if ((LHS0 == RHS0 || LHS0.isUndef() || RHS0.isUndef()) &&
43793           (LHS1 == RHS1 || LHS1.isUndef() || RHS1.isUndef())) {
43794         SDLoc DL(N);
43795         SDValue Res = DAG.getNode(LHS.getOpcode(), DL, LHS.getValueType(),
43796                                   LHS0.isUndef() ? RHS0 : LHS0,
43797                                   LHS1.isUndef() ? RHS1 : LHS1);
43798         MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
43799         Res = DAG.getBitcast(ShufVT, Res);
43800         SDValue NewLHS =
43801             DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
43802                         getV4X86ShuffleImm8ForMask({0, 1, 0, 1}, DL, DAG));
43803         SDValue NewRHS =
43804             DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
43805                         getV4X86ShuffleImm8ForMask({2, 3, 2, 3}, DL, DAG));
43806         DAG.ReplaceAllUsesOfValueWith(LHS, DAG.getBitcast(VT, NewLHS));
43807         DAG.ReplaceAllUsesOfValueWith(RHS, DAG.getBitcast(VT, NewRHS));
43808         return SDValue(N, 0);
43809       }
43810     }
43811   }
43812 
43813   // Try to fold HOP(SHUFFLE(),SHUFFLE()) -> SHUFFLE(HOP()).
43814   if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
43815     return V;
43816 
43817   return SDValue();
43818 }
43819 
43820 static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG,
43821                                      TargetLowering::DAGCombinerInfo &DCI,
43822                                      const X86Subtarget &Subtarget) {
43823   assert((X86ISD::VSHL == N->getOpcode() || X86ISD::VSRA == N->getOpcode() ||
43824           X86ISD::VSRL == N->getOpcode()) &&
43825          "Unexpected shift opcode");
43826   EVT VT = N->getValueType(0);
43827   SDValue N0 = N->getOperand(0);
43828   SDValue N1 = N->getOperand(1);
43829 
43830   // Shift zero -> zero.
43831   if (ISD::isBuildVectorAllZeros(N0.getNode()))
43832     return DAG.getConstant(0, SDLoc(N), VT);
43833 
43834   // Detect constant shift amounts.
43835   APInt UndefElts;
43836   SmallVector<APInt, 32> EltBits;
43837   if (getTargetConstantBitsFromNode(N1, 64, UndefElts, EltBits, true, false)) {
43838     unsigned X86Opc = getTargetVShiftUniformOpcode(N->getOpcode(), false);
43839     return getTargetVShiftByConstNode(X86Opc, SDLoc(N), VT.getSimpleVT(), N0,
43840                                       EltBits[0].getZExtValue(), DAG);
43841   }
43842 
43843   APInt KnownUndef, KnownZero;
43844   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43845   APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
43846   if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
43847                                      KnownZero, DCI))
43848     return SDValue(N, 0);
43849 
43850   return SDValue();
43851 }
43852 
43853 static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
43854                                      TargetLowering::DAGCombinerInfo &DCI,
43855                                      const X86Subtarget &Subtarget) {
43856   unsigned Opcode = N->getOpcode();
43857   assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
43858           X86ISD::VSRLI == Opcode) &&
43859          "Unexpected shift opcode");
43860   bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
43861   EVT VT = N->getValueType(0);
43862   SDValue N0 = N->getOperand(0);
43863   unsigned NumBitsPerElt = VT.getScalarSizeInBits();
43864   assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
43865          "Unexpected value type");
43866   assert(N->getOperand(1).getValueType() == MVT::i8 &&
43867          "Unexpected shift amount type");
43868 
43869   // Out of range logical bit shifts are guaranteed to be zero.
43870   // Out of range arithmetic bit shifts splat the sign bit.
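  // For example, a VSRAI of v4i32 by 40 is treated as a shift by 31, while a
  // VSRLI of v4i32 by 40 folds to zero.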
43871   unsigned ShiftVal = N->getConstantOperandVal(1);
43872   if (ShiftVal >= NumBitsPerElt) {
43873     if (LogicalShift)
43874       return DAG.getConstant(0, SDLoc(N), VT);
43875     ShiftVal = NumBitsPerElt - 1;
43876   }
43877 
43878   // (shift X, 0) -> X
43879   if (!ShiftVal)
43880     return N0;
43881 
43882   // (shift 0, C) -> 0
43883   if (ISD::isBuildVectorAllZeros(N0.getNode()))
43884     // N0 is all zeros or undef. We guarantee that the bits shifted into the
43885     // result are all zeros, not undef.
43886     return DAG.getConstant(0, SDLoc(N), VT);
43887 
43888   // (VSRAI -1, C) -> -1
43889   if (!LogicalShift && ISD::isBuildVectorAllOnes(N0.getNode()))
43890     // N0 is all ones or undef. We guarantee that the bits shifted into the
43891     // result are all ones, not undef.
43892     return DAG.getConstant(-1, SDLoc(N), VT);
43893 
43894   // (shift (shift X, C2), C1) -> (shift X, (C1 + C2))
43895   if (Opcode == N0.getOpcode()) {
43896     unsigned ShiftVal2 = cast<ConstantSDNode>(N0.getOperand(1))->getZExtValue();
43897     unsigned NewShiftVal = ShiftVal + ShiftVal2;
43898     if (NewShiftVal >= NumBitsPerElt) {
43899       // Out of range logical bit shifts are guaranteed to be zero.
43900       // Out of range arithmetic bit shifts splat the sign bit.
43901       if (LogicalShift)
43902         return DAG.getConstant(0, SDLoc(N), VT);
43903       NewShiftVal = NumBitsPerElt - 1;
43904     }
43905     return DAG.getNode(Opcode, SDLoc(N), VT, N0.getOperand(0),
43906                        DAG.getTargetConstant(NewShiftVal, SDLoc(N), MVT::i8));
43907   }
43908 
43909   // We can decode 'whole byte' logical bit shifts as shuffles.
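  // For example, a VSRLI of v4i32 by 8 moves each lane down by one byte and
  // zeroes the top byte, so it can be merged into surrounding byte shuffles.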
43910   if (LogicalShift && (ShiftVal % 8) == 0) {
43911     SDValue Op(N, 0);
43912     if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
43913       return Res;
43914   }
43915 
43916   // Constant Folding.
43917   APInt UndefElts;
43918   SmallVector<APInt, 32> EltBits;
43919   if (N->isOnlyUserOf(N0.getNode()) &&
43920       getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) {
43921     assert(EltBits.size() == VT.getVectorNumElements() &&
43922            "Unexpected shift value type");
43923     // Undef elements need to fold to 0. It's possible SimplifyDemandedBits
43924     // created an undef input due to no input bits being demanded, but the user
43925     // still expects 0 in the other bits.
43926     for (unsigned i = 0, e = EltBits.size(); i != e; ++i) {
43927       APInt &Elt = EltBits[i];
43928       if (UndefElts[i])
43929         Elt = 0;
43930       else if (X86ISD::VSHLI == Opcode)
43931         Elt <<= ShiftVal;
43932       else if (X86ISD::VSRAI == Opcode)
43933         Elt.ashrInPlace(ShiftVal);
43934       else
43935         Elt.lshrInPlace(ShiftVal);
43936     }
43937     // Reset undef elements since they were zeroed above.
43938     UndefElts = 0;
43939     return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
43940   }
43941 
43942   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43943   if (TLI.SimplifyDemandedBits(SDValue(N, 0),
43944                                APInt::getAllOnesValue(NumBitsPerElt), DCI))
43945     return SDValue(N, 0);
43946 
43947   return SDValue();
43948 }
43949 
43950 static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
43951                                    TargetLowering::DAGCombinerInfo &DCI,
43952                                    const X86Subtarget &Subtarget) {
43953   EVT VT = N->getValueType(0);
43954   assert(((N->getOpcode() == X86ISD::PINSRB && VT == MVT::v16i8) ||
43955           (N->getOpcode() == X86ISD::PINSRW && VT == MVT::v8i16) ||
43956           N->getOpcode() == ISD::INSERT_VECTOR_ELT) &&
43957          "Unexpected vector insertion");
43958 
43959   if (N->getOpcode() == X86ISD::PINSRB || N->getOpcode() == X86ISD::PINSRW) {
43960     unsigned NumBitsPerElt = VT.getScalarSizeInBits();
43961     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43962     if (TLI.SimplifyDemandedBits(SDValue(N, 0),
43963                                  APInt::getAllOnesValue(NumBitsPerElt), DCI))
43964       return SDValue(N, 0);
43965   }
43966 
43967   // Attempt to combine insertion patterns to a shuffle.
43968   if (VT.isSimple() && DCI.isAfterLegalizeDAG()) {
43969     SDValue Op(N, 0);
43970     if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
43971       return Res;
43972   }
43973 
43974   return SDValue();
43975 }
43976 
43977 /// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
43978 /// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
43979 /// OR -> CMPNEQSS.
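/// For example, a scalar ordered-equal FP compare is typically lowered to
/// (and (setcc COND_E, FCMP x, y), (setcc COND_NP, FCMP x, y)); this combine
/// rewrites that pattern to test bit 0 of a CMPEQSS/CMPEQSD mask instead.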
43980 static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
43981                                    TargetLowering::DAGCombinerInfo &DCI,
43982                                    const X86Subtarget &Subtarget) {
43983   unsigned opcode;
43984 
43985   // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
43986   // we're requiring SSE2 for both.
43987   if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
43988     SDValue N0 = N->getOperand(0);
43989     SDValue N1 = N->getOperand(1);
43990     SDValue CMP0 = N0.getOperand(1);
43991     SDValue CMP1 = N1.getOperand(1);
43992     SDLoc DL(N);
43993 
43994     // The SETCCs should both refer to the same CMP.
43995     if (CMP0.getOpcode() != X86ISD::FCMP || CMP0 != CMP1)
43996       return SDValue();
43997 
43998     SDValue CMP00 = CMP0->getOperand(0);
43999     SDValue CMP01 = CMP0->getOperand(1);
44000     EVT     VT    = CMP00.getValueType();
44001 
44002     if (VT == MVT::f32 || VT == MVT::f64) {
44003       bool ExpectingFlags = false;
44004       // Check for any users that want flags:
44005       for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
44006            !ExpectingFlags && UI != UE; ++UI)
44007         switch (UI->getOpcode()) {
44008         default:
44009         case ISD::BR_CC:
44010         case ISD::BRCOND:
44011         case ISD::SELECT:
44012           ExpectingFlags = true;
44013           break;
44014         case ISD::CopyToReg:
44015         case ISD::SIGN_EXTEND:
44016         case ISD::ZERO_EXTEND:
44017         case ISD::ANY_EXTEND:
44018           break;
44019         }
44020 
44021       if (!ExpectingFlags) {
44022         enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
44023         enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
44024 
44025         if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
44026           X86::CondCode tmp = cc0;
44027           cc0 = cc1;
44028           cc1 = tmp;
44029         }
44030 
44031         if ((cc0 == X86::COND_E  && cc1 == X86::COND_NP) ||
44032             (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
44033           // FIXME: need symbolic constants for these magic numbers.
44034           // See X86ATTInstPrinter.cpp:printSSECC().
44035           unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
44036           if (Subtarget.hasAVX512()) {
44037             SDValue FSetCC =
44038                 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
44039                             DAG.getTargetConstant(x86cc, DL, MVT::i8));
44040             // Need to fill with zeros to ensure the bitcast will produce zeros
44041             // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
44042             SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
44043                                       DAG.getConstant(0, DL, MVT::v16i1),
44044                                       FSetCC, DAG.getIntPtrConstant(0, DL));
44045             return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
44046                                       N->getSimpleValueType(0));
44047           }
44048           SDValue OnesOrZeroesF =
44049               DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00,
44050                           CMP01, DAG.getTargetConstant(x86cc, DL, MVT::i8));
44051 
44052           bool is64BitFP = (CMP00.getValueType() == MVT::f64);
44053           MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
44054 
44055           if (is64BitFP && !Subtarget.is64Bit()) {
44056             // On a 32-bit target, we cannot bitcast the 64-bit float to a
44057             // 64-bit integer, since that's not a legal type. Since
44058             // OnesOrZeroesF is all ones or all zeroes, we don't need all the
44059             // bits, but can do this little dance to extract the lowest 32 bits
44060             // and work with those going forward.
44061             SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
44062                                            OnesOrZeroesF);
44063             SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
44064             OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
44065                                         Vector32, DAG.getIntPtrConstant(0, DL));
44066             IntVT = MVT::i32;
44067           }
44068 
44069           SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
44070           SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
44071                                       DAG.getConstant(1, DL, IntVT));
44072           SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
44073                                               ANDed);
44074           return OneBitOfTruth;
44075         }
44076       }
44077     }
44078   }
44079   return SDValue();
44080 }
44081 
44082 /// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
44083 static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
44084   assert(N->getOpcode() == ISD::AND);
44085 
44086   MVT VT = N->getSimpleValueType(0);
44087   if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
44088     return SDValue();
44089 
44090   SDValue X, Y;
44091   SDValue N0 = N->getOperand(0);
44092   SDValue N1 = N->getOperand(1);
44093 
44094   auto GetNot = [&VT, &DAG](SDValue V) {
44095     // Basic X = NOT(Y) detection.
44096     if (SDValue Not = IsNOT(V, DAG))
44097       return Not;
44098     // Fold BROADCAST(NOT(Y)) -> BROADCAST(Y).
44099     if (V.getOpcode() == X86ISD::VBROADCAST) {
44100       SDValue Src = V.getOperand(0);
44101       EVT SrcVT = Src.getValueType();
44102       if (!SrcVT.isVector())
44103         return SDValue();
44104       if (SDValue Not = IsNOT(Src, DAG))
44105         return DAG.getNode(X86ISD::VBROADCAST, SDLoc(V), VT,
44106                            DAG.getBitcast(SrcVT, Not));
44107     }
44108     return SDValue();
44109   };
44110 
44111   if (SDValue Not = GetNot(N0)) {
44112     X = Not;
44113     Y = N1;
44114   } else if (SDValue Not = GetNot(N1)) {
44115     X = Not;
44116     Y = N0;
44117   } else
44118     return SDValue();
44119 
44120   X = DAG.getBitcast(VT, X);
44121   Y = DAG.getBitcast(VT, Y);
44122   return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y);
44123 }
44124 
44125 // Try to widen AND, OR and XOR nodes to VT in order to remove casts around
44126 // logical operations, like in the example below.
44127 //   or (and (truncate x), (truncate y)),
44128 //      (xor (truncate z), (build_vector (constants)))
44129 // Given a target type \p VT, we generate
44130 //   or (and x, y), (xor z, zext(build_vector (constants)))
44131 // given that x, y and z are of type \p VT. We can do so if each operand is
44132 // either a truncate from VT or recursively promotable; the right-hand operand
44133 // may also be a vector of constants (which gets zero-extended).
44134 static SDValue PromoteMaskArithmetic(SDNode *N, EVT VT, SelectionDAG &DAG,
44135                                      unsigned Depth) {
44136   // Limit recursion to avoid excessive compile times.
44137   if (Depth >= SelectionDAG::MaxRecursionDepth)
44138     return SDValue();
44139 
44140   if (N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND &&
44141       N->getOpcode() != ISD::OR)
44142     return SDValue();
44143 
44144   SDValue N0 = N->getOperand(0);
44145   SDValue N1 = N->getOperand(1);
44146   SDLoc DL(N);
44147 
44148   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44149   if (!TLI.isOperationLegalOrPromote(N->getOpcode(), VT))
44150     return SDValue();
44151 
44152   if (SDValue NN0 = PromoteMaskArithmetic(N0.getNode(), VT, DAG, Depth + 1))
44153     N0 = NN0;
44154   else {
44155     // The left side has to be a trunc.
44156     if (N0.getOpcode() != ISD::TRUNCATE)
44157       return SDValue();
44158 
44159     // The type of the truncated inputs.
44160     if (N0.getOperand(0).getValueType() != VT)
44161       return SDValue();
44162 
44163     N0 = N0.getOperand(0);
44164   }
44165 
44166   if (SDValue NN1 = PromoteMaskArithmetic(N1.getNode(), VT, DAG, Depth + 1))
44167     N1 = NN1;
44168   else {
44169     // The right side has to be a 'trunc' or a constant vector.
44170     bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
44171                     N1.getOperand(0).getValueType() == VT;
44172     if (!RHSTrunc && !ISD::isBuildVectorOfConstantSDNodes(N1.getNode()))
44173       return SDValue();
44174 
44175     if (RHSTrunc)
44176       N1 = N1.getOperand(0);
44177     else
44178       N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1);
44179   }
44180 
44181   return DAG.getNode(N->getOpcode(), DL, VT, N0, N1);
44182 }
44183 
44184 // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
44185 // register. In most cases we actually compare or select YMM-sized registers
44186 // and mixing the two types creates horrible code. This method optimizes
44187 // some of the transition sequences.
44188 // Even with AVX-512 this is still useful for removing casts around logical
44189 // operations on vXi1 mask types.
44190 static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG,
44191                                      const X86Subtarget &Subtarget) {
44192   EVT VT = N->getValueType(0);
44193   assert(VT.isVector() && "Expected vector type");
44194 
44195   SDLoc DL(N);
44196   assert((N->getOpcode() == ISD::ANY_EXTEND ||
44197           N->getOpcode() == ISD::ZERO_EXTEND ||
44198           N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
44199 
44200   SDValue Narrow = N->getOperand(0);
44201   EVT NarrowVT = Narrow.getValueType();
44202 
44203   // Generate the wide operation.
44204   SDValue Op = PromoteMaskArithmetic(Narrow.getNode(), VT, DAG, 0);
44205   if (!Op)
44206     return SDValue();
44207   switch (N->getOpcode()) {
44208   default: llvm_unreachable("Unexpected opcode");
44209   case ISD::ANY_EXTEND:
44210     return Op;
44211   case ISD::ZERO_EXTEND:
44212     return DAG.getZeroExtendInReg(Op, DL, NarrowVT);
44213   case ISD::SIGN_EXTEND:
44214     return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
44215                        Op, DAG.getValueType(NarrowVT));
44216   }
44217 }
44218 
44219 static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode) {
44220   unsigned FPOpcode;
44221   switch (Opcode) {
44222   default: llvm_unreachable("Unexpected input node for FP logic conversion");
44223   case ISD::AND: FPOpcode = X86ISD::FAND; break;
44224   case ISD::OR:  FPOpcode = X86ISD::FOR;  break;
44225   case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
44226   }
44227   return FPOpcode;
44228 }
44229 
44230 /// If both input operands of a logic op are being cast from floating point
44231 /// types, try to convert this into a floating point logic node to avoid
44232 /// unnecessary moves from SSE to integer registers.
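/// For example (an illustrative shape, not tied to the code below):
///   and (bitcast f32 %a to i32), (bitcast f32 %b to i32)
/// becomes
///   bitcast (X86ISD::FAND %a, %b) to i32
/// so the values can stay in XMM registers instead of bouncing through GPRs.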
44233 static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
44234                                         const X86Subtarget &Subtarget) {
44235   EVT VT = N->getValueType(0);
44236   SDValue N0 = N->getOperand(0);
44237   SDValue N1 = N->getOperand(1);
44238   SDLoc DL(N);
44239 
44240   if (N0.getOpcode() != ISD::BITCAST || N1.getOpcode() != ISD::BITCAST)
44241     return SDValue();
44242 
44243   SDValue N00 = N0.getOperand(0);
44244   SDValue N10 = N1.getOperand(0);
44245   EVT N00Type = N00.getValueType();
44246   EVT N10Type = N10.getValueType();
44247 
44248   // Ensure that both types are the same and are legal scalar fp types.
44249   if (N00Type != N10Type ||
44250       !((Subtarget.hasSSE1() && N00Type == MVT::f32) ||
44251         (Subtarget.hasSSE2() && N00Type == MVT::f64)))
44252     return SDValue();
44253 
44254   unsigned FPOpcode = convertIntLogicToFPLogicOpcode(N->getOpcode());
44255   SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
44256   return DAG.getBitcast(VT, FPLogic);
44257 }
44258 
44259 // Attempt to fold BITOP(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(BITOP(X,Y))
44260 // to reduce XMM->GPR traffic.
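// Illustrative example (hypothetical values): for two v4f32 compares,
//   and (movmsk (cmpps %a, %b)), (movmsk (cmpps %c, %d))
// becomes
//   movmsk (and (cmpps %a, %b), (cmpps %c, %d))
// which needs a single XMM->GPR transfer instead of two.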
44261 static SDValue combineBitOpWithMOVMSK(SDNode *N, SelectionDAG &DAG) {
44262   unsigned Opc = N->getOpcode();
44263   assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
44264          "Unexpected bit opcode");
44265 
44266   SDValue N0 = N->getOperand(0);
44267   SDValue N1 = N->getOperand(1);
44268 
44269   // Both operands must be single use MOVMSK.
44270   if (N0.getOpcode() != X86ISD::MOVMSK || !N0.hasOneUse() ||
44271       N1.getOpcode() != X86ISD::MOVMSK || !N1.hasOneUse())
44272     return SDValue();
44273 
44274   SDValue Vec0 = N0.getOperand(0);
44275   SDValue Vec1 = N1.getOperand(0);
44276   EVT VecVT0 = Vec0.getValueType();
44277   EVT VecVT1 = Vec1.getValueType();
44278 
44279   // Both MOVMSK operands must be from vectors of the same size and same element
44280   // size, but it's OK for them to differ between fp and int.
44281   if (VecVT0.getSizeInBits() != VecVT1.getSizeInBits() ||
44282       VecVT0.getScalarSizeInBits() != VecVT1.getScalarSizeInBits())
44283     return SDValue();
44284 
44285   SDLoc DL(N);
44286   unsigned VecOpc =
44287       VecVT0.isFloatingPoint() ? convertIntLogicToFPLogicOpcode(Opc) : Opc;
44288   SDValue Result =
44289       DAG.getNode(VecOpc, DL, VecVT0, Vec0, DAG.getBitcast(VecVT0, Vec1));
44290   return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
44291 }
44292 
44293 /// If this is a zero/all-bits result that is bitwise-anded with a low-bits
44294 /// mask (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
44295 /// with a shift-right to eliminate loading the vector constant mask value.
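/// A worked example (illustrative only): if Op0 is a v4i32 value known to be
/// all-sign-bits per lane (e.g. a vector SETCC result), then
///   and Op0, <1, 1, 1, 1>
/// keeps just the low bit of each lane, which is the same as
///   X86ISD::VSRLI Op0, 31
/// and no constant-pool load of the mask is needed.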
44296 static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
44297                                      const X86Subtarget &Subtarget) {
44298   SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
44299   SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
44300   EVT VT0 = Op0.getValueType();
44301   EVT VT1 = Op1.getValueType();
44302 
44303   if (VT0 != VT1 || !VT0.isSimple() || !VT0.isInteger())
44304     return SDValue();
44305 
44306   APInt SplatVal;
44307   if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) ||
44308       !SplatVal.isMask())
44309     return SDValue();
44310 
44311   // Don't prevent creation of ANDN.
44312   if (isBitwiseNot(Op0))
44313     return SDValue();
44314 
44315   if (!SupportedVectorShiftWithImm(VT0.getSimpleVT(), Subtarget, ISD::SRL))
44316     return SDValue();
44317 
44318   unsigned EltBitWidth = VT0.getScalarSizeInBits();
44319   if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
44320     return SDValue();
44321 
44322   SDLoc DL(N);
44323   unsigned ShiftVal = SplatVal.countTrailingOnes();
44324   SDValue ShAmt = DAG.getTargetConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
44325   SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
44326   return DAG.getBitcast(N->getValueType(0), Shift);
44327 }
44328 
44329 // Get the index node from the lowered DAG of a GEP IR instruction with one
44330 // indexing dimension.
44331 static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
44332   if (Ld->isIndexed())
44333     return SDValue();
44334 
44335   SDValue Base = Ld->getBasePtr();
44336 
44337   if (Base.getOpcode() != ISD::ADD)
44338     return SDValue();
44339 
44340   SDValue ShiftedIndex = Base.getOperand(0);
44341 
44342   if (ShiftedIndex.getOpcode() != ISD::SHL)
44343     return SDValue();
44344 
44345   return ShiftedIndex.getOperand(0);
44346 
44347 }
44348 
44349 static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
44350   if (Subtarget.hasBMI2() && VT.isScalarInteger()) {
44351     switch (VT.getSizeInBits()) {
44352     default: return false;
44353     case 64: return Subtarget.is64Bit();
44354     case 32: return true;
44355     }
44356   }
44357   return false;
44358 }
44359 
44360 // This function recognizes cases where the X86 BZHI instruction can replace an
44361 // 'and-load' sequence.
44362 // When loading an integer value from an array of constants defined as
44363 // follows:
44364 //
44365 //   int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
44366 //
44367 // and then applying a bitwise AND of the result with another input, the
44368 // sequence is equivalent to performing BZHI (zero high bits) on that input,
44369 // using the same index as the load.
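// A source-level sketch of the shape this matches (names are hypothetical):
//   static const unsigned mask_table[32] = {0x0, 0x1, 0x3, 0x7, /* ... */};
//   unsigned keep_low(unsigned x, unsigned i) { return x & mask_table[i]; }
// Since mask_table[i] == (1u << i) - 1, the AND keeps the low i bits of x,
// which is what BZHI computes, so the table load can be dropped entirely.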
44370 static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
44371                                     const X86Subtarget &Subtarget) {
44372   MVT VT = Node->getSimpleValueType(0);
44373   SDLoc dl(Node);
44374 
44375   // Check if subtarget has BZHI instruction for the node's type
44376   if (!hasBZHI(Subtarget, VT))
44377     return SDValue();
44378 
44379   // Try matching the pattern for both operands.
44380   for (unsigned i = 0; i < 2; i++) {
44381     SDValue N = Node->getOperand(i);
44382     LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode());
44383 
44384     // Bail out if the operand is not a load instruction.
44385     if (!Ld)
44386       return SDValue();
44387 
44388     const Value *MemOp = Ld->getMemOperand()->getValue();
44389 
44390     if (!MemOp)
44391       return SDValue();
44392 
44393     if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
44394       if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
44395         if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
44396 
44397           Constant *Init = GV->getInitializer();
44398           Type *Ty = Init->getType();
44399           if (!isa<ConstantDataArray>(Init) ||
44400               !Ty->getArrayElementType()->isIntegerTy() ||
44401               Ty->getArrayElementType()->getScalarSizeInBits() !=
44402                   VT.getSizeInBits() ||
44403               Ty->getArrayNumElements() >
44404                   Ty->getArrayElementType()->getScalarSizeInBits())
44405             continue;
44406 
44407           // Check if the array's constant elements are suitable to our case.
44408           uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
44409           bool ConstantsMatch = true;
44410           for (uint64_t j = 0; j < ArrayElementCount; j++) {
44411             auto *Elem = cast<ConstantInt>(Init->getAggregateElement(j));
44412             if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
44413               ConstantsMatch = false;
44414               break;
44415             }
44416           }
44417           if (!ConstantsMatch)
44418             continue;
44419 
44420           // Do the transformation (for a 32-bit type):
44421           // -> (and (load arr[idx]), inp)
44422           // <- (and inp, (srl 0xFFFFFFFF, (sub 32, idx)))
44423           //    which will then be selected as a single BZHI instruction.
44424           SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);
44425           SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32);
44426 
44427           // Get the Node which indexes into the array.
44428           SDValue Index = getIndexFromUnindexedLoad(Ld);
44429           if (!Index)
44430             return SDValue();
44431           Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32);
44432 
44433           SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index);
44434           Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub);
44435 
44436           SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
44437           SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
44438 
44439           return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
44440         }
44441       }
44442     }
44443   }
44444   return SDValue();
44445 }
44446 
44447 // Look for (and (bitcast (vXi1 (concat_vectors (vYi1 setcc), undef))), C)
44448 // where C is a mask containing the same number of bits as the setcc and
44449 // where the setcc will freely zero the upper bits of the k-register. We can
44450 // replace the undef in the concat with 0s and remove the AND. This mainly
44451 // helps with v2i1/v4i1 setcc being cast to scalar.
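// Illustrative example (widths chosen for exposition): a v2i1 setcc that is
// concatenated with undefs up to v8i1, bitcast to i8 and then ANDed with 3
// only keeps the two setcc bits anyway, so the concat can use zero vectors
// instead of undef and the AND with 3 becomes unnecessary.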
44452 static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG,
44453                                              const X86Subtarget &Subtarget) {
44454   assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");
44455 
44456   EVT VT = N->getValueType(0);
44457 
44458   // Make sure this is an AND with a constant. We will check the value of the
44459   // constant later.
44460   if (!isa<ConstantSDNode>(N->getOperand(1)))
44461     return SDValue();
44462 
44463   // This is implied by the ConstantSDNode.
44464   assert(!VT.isVector() && "Expected scalar VT!");
44465 
44466   if (N->getOperand(0).getOpcode() != ISD::BITCAST ||
44467       !N->getOperand(0).hasOneUse() ||
44468       !N->getOperand(0).getOperand(0).hasOneUse())
44469     return SDValue();
44470 
44471   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44472   SDValue Src = N->getOperand(0).getOperand(0);
44473   EVT SrcVT = Src.getValueType();
44474   if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::i1 ||
44475       !TLI.isTypeLegal(SrcVT))
44476     return SDValue();
44477 
44478   if (Src.getOpcode() != ISD::CONCAT_VECTORS)
44479     return SDValue();
44480 
44481   // We only care about the first subvector of the concat; we expect the
44482   // other subvectors to be ignored due to the AND if we make the change.
44483   SDValue SubVec = Src.getOperand(0);
44484   EVT SubVecVT = SubVec.getValueType();
44485 
44486   // First subvector should be a setcc with a legal result type. The RHS of the
44487   // AND should be a mask with this many bits.
44488   if (SubVec.getOpcode() != ISD::SETCC || !TLI.isTypeLegal(SubVecVT) ||
44489       !N->getConstantOperandAPInt(1).isMask(SubVecVT.getVectorNumElements()))
44490     return SDValue();
44491 
44492   EVT SetccVT = SubVec.getOperand(0).getValueType();
44493   if (!TLI.isTypeLegal(SetccVT) ||
44494       !(Subtarget.hasVLX() || SetccVT.is512BitVector()))
44495     return SDValue();
44496 
44497   if (!(Subtarget.hasBWI() || SetccVT.getScalarSizeInBits() >= 32))
44498     return SDValue();
44499 
44500   // We passed all the checks. Rebuild the concat_vectors with zeroes
44501   // and cast it back to VT.
44502   SDLoc dl(N);
44503   SmallVector<SDValue, 4> Ops(Src.getNumOperands(),
44504                               DAG.getConstant(0, dl, SubVecVT));
44505   Ops[0] = SubVec;
44506   SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT,
44507                                Ops);
44508   return DAG.getBitcast(VT, Concat);
44509 }
44510 
44511 static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
44512                           TargetLowering::DAGCombinerInfo &DCI,
44513                           const X86Subtarget &Subtarget) {
44514   EVT VT = N->getValueType(0);
44515 
44516   // If this is SSE1 only convert to FAND to avoid scalarization.
44517   if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
44518     return DAG.getBitcast(
44519         MVT::v4i32, DAG.getNode(X86ISD::FAND, SDLoc(N), MVT::v4f32,
44520                                 DAG.getBitcast(MVT::v4f32, N->getOperand(0)),
44521                                 DAG.getBitcast(MVT::v4f32, N->getOperand(1))));
44522   }
44523 
44524   // Use a 32-bit and+zext if upper bits known zero.
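  // (An illustrative instance: AND'ing two i64 values whose upper 32 bits are
  // known zero can be done as a 32-bit AND; on x86-64 the 32-bit operation
  // implicitly zero-extends its result, so the ZERO_EXTEND is free.)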
44525   if (VT == MVT::i64 && Subtarget.is64Bit() &&
44526       !isa<ConstantSDNode>(N->getOperand(1))) {
44527     APInt HiMask = APInt::getHighBitsSet(64, 32);
44528     if (DAG.MaskedValueIsZero(N->getOperand(1), HiMask) ||
44529         DAG.MaskedValueIsZero(N->getOperand(0), HiMask)) {
44530       SDLoc dl(N);
44531       SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N->getOperand(0));
44532       SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N->getOperand(1));
44533       return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
44534                          DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS));
44535     }
44536   }
44537 
44538   // Match all-of bool scalar reductions into a bitcast/movmsk + cmp.
44539   // TODO: Support multiple SrcOps.
44540   if (VT == MVT::i1) {
44541     SmallVector<SDValue, 2> SrcOps;
44542     SmallVector<APInt, 2> SrcPartials;
44543     if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps, &SrcPartials) &&
44544         SrcOps.size() == 1) {
44545       SDLoc dl(N);
44546       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44547       unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
44548       EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
44549       SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
44550       if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
44551         Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
44552       if (Mask) {
44553         assert(SrcPartials[0].getBitWidth() == NumElts &&
44554                "Unexpected partial reduction mask");
44555         SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
44556         Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
44557         return DAG.getSetCC(dl, MVT::i1, Mask, PartialBits, ISD::SETEQ);
44558       }
44559     }
44560   }
44561 
44562   if (SDValue V = combineScalarAndWithMaskSetcc(N, DAG, Subtarget))
44563     return V;
44564 
44565   if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
44566     return R;
44567 
44568   if (DCI.isBeforeLegalizeOps())
44569     return SDValue();
44570 
44571   if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
44572     return R;
44573 
44574   if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
44575     return FPLogic;
44576 
44577   if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
44578     return R;
44579 
44580   if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
44581     return ShiftRight;
44582 
44583   if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
44584     return R;
44585 
44586   // Attempt to recursively combine a bitmask AND with shuffles.
44587   if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
44588     SDValue Op(N, 0);
44589     if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
44590       return Res;
44591   }
44592 
44593   // Attempt to combine a scalar bitmask AND with an extracted shuffle.
44594   if ((VT.getScalarSizeInBits() % 8) == 0 &&
44595       N->getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
44596       isa<ConstantSDNode>(N->getOperand(0).getOperand(1))) {
44597     SDValue BitMask = N->getOperand(1);
44598     SDValue SrcVec = N->getOperand(0).getOperand(0);
44599     EVT SrcVecVT = SrcVec.getValueType();
44600 
44601     // Check that the constant bitmask masks whole bytes.
44602     APInt UndefElts;
44603     SmallVector<APInt, 64> EltBits;
44604     if (VT == SrcVecVT.getScalarType() &&
44605         N->getOperand(0)->isOnlyUserOf(SrcVec.getNode()) &&
44606         getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
44607         llvm::all_of(EltBits, [](const APInt &M) {
44608           return M.isNullValue() || M.isAllOnesValue();
44609         })) {
44610       unsigned NumElts = SrcVecVT.getVectorNumElements();
44611       unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
44612       unsigned Idx = N->getOperand(0).getConstantOperandVal(1);
44613 
44614       // Create a root shuffle mask from the byte mask and the extracted index.
44615       SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
44616       for (unsigned i = 0; i != Scale; ++i) {
44617         if (UndefElts[i])
44618           continue;
44619         int VecIdx = Scale * Idx + i;
44620         ShuffleMask[VecIdx] =
44621             EltBits[i].isNullValue() ? SM_SentinelZero : VecIdx;
44622       }
44623 
44624       if (SDValue Shuffle = combineX86ShufflesRecursively(
44625               {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 1,
44626               X86::MaxShuffleCombineDepth,
44627               /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
44628         return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), VT, Shuffle,
44629                            N->getOperand(0).getOperand(1));
44630     }
44631   }
44632 
44633   return SDValue();
44634 }
44635 
44636 // Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y))
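// A note on the VPTERNLOG path below (a sketch, assuming the usual imm8
// indexing where the first/second/third source select bit 2/1/0 of the
// table index): immediate 0xCA encodes the bitwise select
//   result = (src1 & src2) | (~src1 & src3)
// so with src1 = the constant mask C, src2 = X and src3 = Y this computes
// OR(AND(X,C),ANDNP(C,Y)) in a single instruction.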
44637 static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG,
44638                                      const X86Subtarget &Subtarget) {
44639   assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
44640 
44641   MVT VT = N->getSimpleValueType(0);
44642   if (!VT.isVector() || (VT.getScalarSizeInBits() % 8) != 0)
44643     return SDValue();
44644 
44645   SDValue N0 = peekThroughBitcasts(N->getOperand(0));
44646   SDValue N1 = peekThroughBitcasts(N->getOperand(1));
44647   if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
44648     return SDValue();
44649 
44650   // On XOP we'll lower to PCMOV so accept one use. With AVX512, we can use
44651   // VPTERNLOG. Otherwise only do this if either mask has multiple uses already.
44652   bool UseVPTERNLOG = (Subtarget.hasAVX512() && VT.is512BitVector()) ||
44653                       Subtarget.hasVLX();
44654   if (!(Subtarget.hasXOP() || UseVPTERNLOG ||
44655         !N0.getOperand(1).hasOneUse() || !N1.getOperand(1).hasOneUse()))
44656     return SDValue();
44657 
44658   // Attempt to extract constant byte masks.
44659   APInt UndefElts0, UndefElts1;
44660   SmallVector<APInt, 32> EltBits0, EltBits1;
44661   if (!getTargetConstantBitsFromNode(N0.getOperand(1), 8, UndefElts0, EltBits0,
44662                                      false, false))
44663     return SDValue();
44664   if (!getTargetConstantBitsFromNode(N1.getOperand(1), 8, UndefElts1, EltBits1,
44665                                      false, false))
44666     return SDValue();
44667 
44668   for (unsigned i = 0, e = EltBits0.size(); i != e; ++i) {
44669     // TODO - add UNDEF elts support.
44670     if (UndefElts0[i] || UndefElts1[i])
44671       return SDValue();
44672     if (EltBits0[i] != ~EltBits1[i])
44673       return SDValue();
44674   }
44675 
44676   SDLoc DL(N);
44677 
44678   if (UseVPTERNLOG) {
44679     // Emit a VPTERNLOG node directly.
44680     SDValue A = DAG.getBitcast(VT, N0.getOperand(1));
44681     SDValue B = DAG.getBitcast(VT, N0.getOperand(0));
44682     SDValue C = DAG.getBitcast(VT, N1.getOperand(0));
44683     SDValue Imm = DAG.getTargetConstant(0xCA, DL, MVT::i8);
44684     return DAG.getNode(X86ISD::VPTERNLOG, DL, VT, A, B, C, Imm);
44685   }
44686 
44687   SDValue X = N->getOperand(0);
44688   SDValue Y =
44689       DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)),
44690                   DAG.getBitcast(VT, N1.getOperand(0)));
44691   return DAG.getNode(ISD::OR, DL, VT, X, Y);
44692 }
44693 
44694 // Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
44695 static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
44696   if (N->getOpcode() != ISD::OR)
44697     return false;
44698 
44699   SDValue N0 = N->getOperand(0);
44700   SDValue N1 = N->getOperand(1);
44701 
44702   // Canonicalize AND to LHS.
44703   if (N1.getOpcode() == ISD::AND)
44704     std::swap(N0, N1);
44705 
44706   // Attempt to match OR(AND(M,Y),ANDNP(M,X)).
44707   if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
44708     return false;
44709 
44710   Mask = N1.getOperand(0);
44711   X = N1.getOperand(1);
44712 
44713   // Check to see if the mask appeared in both the AND and ANDNP.
44714   if (N0.getOperand(0) == Mask)
44715     Y = N0.getOperand(1);
44716   else if (N0.getOperand(1) == Mask)
44717     Y = N0.getOperand(0);
44718   else
44719     return false;
44720 
44721   // TODO: Attempt to match against AND(XOR(-1,M),Y) as well; waiting for the
44722   // ANDNP combine allows other combines to happen that prevent matching.
44723   return true;
44724 }
44725 
44726 // Try to fold:
44727 //   (or (and (m, y), (pandn m, x)))
44728 // into:
44729 //   (vselect m, x, y)
44730 // As a special case, try to fold:
44731 //   (or (and (m, (sub 0, x)), (pandn m, x)))
44732 // into:
44733 //   (sub (xor X, M), M)
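// Why the special case is sound (a quick sketch): the mask M is required to
// be all-ones or all-zeros per element. If M == -1 then (xor X, M) - M is
// (~X) + 1 == -X, and if M == 0 it is X - 0 == X, which matches selecting
// (sub 0, x) vs. x element-wise.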
44734 static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
44735                                             const X86Subtarget &Subtarget) {
44736   assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
44737 
44738   EVT VT = N->getValueType(0);
44739   if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
44740         (VT.is256BitVector() && Subtarget.hasInt256())))
44741     return SDValue();
44742 
44743   SDValue X, Y, Mask;
44744   if (!matchLogicBlend(N, X, Y, Mask))
44745     return SDValue();
44746 
44747   // Validate that X, Y, and Mask are bitcasts, and see through them.
44748   Mask = peekThroughBitcasts(Mask);
44749   X = peekThroughBitcasts(X);
44750   Y = peekThroughBitcasts(Y);
44751 
44752   EVT MaskVT = Mask.getValueType();
44753   unsigned EltBits = MaskVT.getScalarSizeInBits();
44754 
44755   // TODO: Attempt to handle floating point cases as well?
44756   if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
44757     return SDValue();
44758 
44759   SDLoc DL(N);
44760 
44761   // Attempt to combine to conditional negate: (sub (xor X, M), M)
44762   if (SDValue Res = combineLogicBlendIntoConditionalNegate(VT, Mask, X, Y, DL,
44763                                                            DAG, Subtarget))
44764     return Res;
44765 
44766   // PBLENDVB is only available on SSE 4.1.
44767   if (!Subtarget.hasSSE41())
44768     return SDValue();
44769 
44770   // If we have VPTERNLOG we should prefer that since PBLENDVB is multiple uops.
44771   if (Subtarget.hasVLX())
44772     return SDValue();
44773 
44774   MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8;
44775 
44776   X = DAG.getBitcast(BlendVT, X);
44777   Y = DAG.getBitcast(BlendVT, Y);
44778   Mask = DAG.getBitcast(BlendVT, Mask);
44779   Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
44780   return DAG.getBitcast(VT, Mask);
44781 }
44782 
44783 // Helper function for combineOrCmpEqZeroToCtlzSrl
44784 // Transforms:
44785 //   seteq(cmp x, 0)
44786 //   into:
44787 //   srl(ctlz x), log2(bitsize(x))
44788 // Input pattern is checked by caller.
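// A quick justification (illustrative, for a 32-bit x): ctlz(x) lies in
// [0, 32] and equals 32 only when x == 0, so (ctlz(x) >> 5) is 1 exactly
// when x == 0 and 0 otherwise, matching seteq(cmp x, 0).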
44789 static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy,
44790                                           SelectionDAG &DAG) {
44791   SDValue Cmp = Op.getOperand(1);
44792   EVT VT = Cmp.getOperand(0).getValueType();
44793   unsigned Log2b = Log2_32(VT.getSizeInBits());
44794   SDLoc dl(Op);
44795   SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
44796   // The result of the shift is true or false, and on X86, the 32-bit
44797   // encoding of shr and lzcnt is more desirable.
44798   SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
44799   SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
44800                             DAG.getConstant(Log2b, dl, MVT::i8));
44801   return DAG.getZExtOrTrunc(Scc, dl, ExtTy);
44802 }
44803 
44804 // Try to transform:
44805 //   zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
44806 //   into:
44807 //   srl(or(ctlz(x), ctlz(y)), log2(bitsize(x)))
44808 // Will also attempt to match more generic cases, eg:
44809 //   zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
44810 // Only applies if the target supports the FastLZCNT feature.
44811 static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
44812                                            TargetLowering::DAGCombinerInfo &DCI,
44813                                            const X86Subtarget &Subtarget) {
44814   if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
44815     return SDValue();
44816 
44817   auto isORCandidate = [](SDValue N) {
44818     return (N->getOpcode() == ISD::OR && N->hasOneUse());
44819   };
44820 
44821   // Check the zero extend is extending to 32-bit or more. The code generated by
44822   // srl(ctlz) for 16-bit or less variants of the pattern would require extra
44823   // instructions to clear the upper bits.
44824   if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
44825       !isORCandidate(N->getOperand(0)))
44826     return SDValue();
44827 
44828   // Check the node matches: setcc(eq, cmp 0)
44829   auto isSetCCCandidate = [](SDValue N) {
44830     return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
44831            X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
44832            N->getOperand(1).getOpcode() == X86ISD::CMP &&
44833            isNullConstant(N->getOperand(1).getOperand(1)) &&
44834            N->getOperand(1).getValueType().bitsGE(MVT::i32);
44835   };
44836 
44837   SDNode *OR = N->getOperand(0).getNode();
44838   SDValue LHS = OR->getOperand(0);
44839   SDValue RHS = OR->getOperand(1);
44840 
44841   // Save nodes matching or(or, setcc(eq, cmp 0)).
44842   SmallVector<SDNode *, 2> ORNodes;
44843   while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
44844           (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
44845     ORNodes.push_back(OR);
44846     OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
44847     LHS = OR->getOperand(0);
44848     RHS = OR->getOperand(1);
44849   }
44850 
44851   // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
44852   if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
44853       !isORCandidate(SDValue(OR, 0)))
44854     return SDValue();
44855 
44856   // We have an or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern; try to lower it
44857   // to
44858   // or(srl(ctlz),srl(ctlz)).
44859   // The dag combiner can then fold it into:
44860   // srl(or(ctlz, ctlz)).
44861   EVT VT = OR->getValueType(0);
44862   SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG);
44863   SDValue Ret, NewRHS;
44864   if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG)))
44865     Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS);
44866 
44867   if (!Ret)
44868     return SDValue();
44869 
44870   // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
44871   while (ORNodes.size() > 0) {
44872     OR = ORNodes.pop_back_val();
44873     LHS = OR->getOperand(0);
44874     RHS = OR->getOperand(1);
44875     // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
44876     if (RHS->getOpcode() == ISD::OR)
44877       std::swap(LHS, RHS);
44878     NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);
44879     if (!NewRHS)
44880       return SDValue();
44881     Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS);
44882   }
44883 
44884   if (Ret)
44885     Ret = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
44886 
44887   return Ret;
44888 }
44889 
44890 static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
44891                          TargetLowering::DAGCombinerInfo &DCI,
44892                          const X86Subtarget &Subtarget) {
44893   SDValue N0 = N->getOperand(0);
44894   SDValue N1 = N->getOperand(1);
44895   EVT VT = N->getValueType(0);
44896 
44897   // If this is SSE1 only convert to FOR to avoid scalarization.
44898   if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
44899     return DAG.getBitcast(MVT::v4i32,
44900                           DAG.getNode(X86ISD::FOR, SDLoc(N), MVT::v4f32,
44901                                       DAG.getBitcast(MVT::v4f32, N0),
44902                                       DAG.getBitcast(MVT::v4f32, N1)));
44903   }
44904 
44905   // Match any-of bool scalar reductions into a bitcast/movmsk + cmp.
44906   // TODO: Support multiple SrcOps.
44907   if (VT == MVT::i1) {
44908     SmallVector<SDValue, 2> SrcOps;
44909     SmallVector<APInt, 2> SrcPartials;
44910     if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps, &SrcPartials) &&
44911         SrcOps.size() == 1) {
44912       SDLoc dl(N);
44913       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44914       unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
44915       EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
44916       SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
44917       if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
44918         Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
44919       if (Mask) {
44920         assert(SrcPartials[0].getBitWidth() == NumElts &&
44921                "Unexpected partial reduction mask");
44922         SDValue ZeroBits = DAG.getConstant(0, dl, MaskVT);
44923         SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
44924         Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
44925         return DAG.getSetCC(dl, MVT::i1, Mask, ZeroBits, ISD::SETNE);
44926       }
44927     }
44928   }
44929 
44930   if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
44931     return R;
44932 
44933   if (DCI.isBeforeLegalizeOps())
44934     return SDValue();
44935 
44936   if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
44937     return R;
44938 
44939   if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
44940     return FPLogic;
44941 
44942   if (SDValue R = canonicalizeBitSelect(N, DAG, Subtarget))
44943     return R;
44944 
44945   if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
44946     return R;
44947 
44948   // Combine OR(X,KSHIFTL(Y,Elts/2)) -> CONCAT_VECTORS(X,Y) == KUNPCK(X,Y).
44949   // Combine OR(KSHIFTL(X,Elts/2),Y) -> CONCAT_VECTORS(Y,X) == KUNPCK(Y,X).
44950   // iff the upper elements of the non-shifted arg are zero.
44951   // KUNPCK requires 16+ bool vector elements.
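  // (Illustrative shape: for v32i1, OR(X, KSHIFTL(Y, 16)) with the upper 16
  // lanes of X known zero is just the concatenation of X's low half and Y's
  // low half, which is what the KUNPCK instructions produce.)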
44952   if (N0.getOpcode() == X86ISD::KSHIFTL || N1.getOpcode() == X86ISD::KSHIFTL) {
44953     unsigned NumElts = VT.getVectorNumElements();
44954     unsigned HalfElts = NumElts / 2;
44955     APInt UpperElts = APInt::getHighBitsSet(NumElts, HalfElts);
44956     if (NumElts >= 16 && N1.getOpcode() == X86ISD::KSHIFTL &&
44957         N1.getConstantOperandAPInt(1) == HalfElts &&
44958         DAG.MaskedValueIsZero(N0, APInt(1, 1), UpperElts)) {
44959       SDLoc dl(N);
44960       return DAG.getNode(
44961           ISD::CONCAT_VECTORS, dl, VT,
44962           extractSubVector(N0, 0, DAG, dl, HalfElts),
44963           extractSubVector(N1.getOperand(0), 0, DAG, dl, HalfElts));
44964     }
44965     if (NumElts >= 16 && N0.getOpcode() == X86ISD::KSHIFTL &&
44966         N0.getConstantOperandAPInt(1) == HalfElts &&
44967         DAG.MaskedValueIsZero(N1, APInt(1, 1), UpperElts)) {
44968       SDLoc dl(N);
44969       return DAG.getNode(
44970           ISD::CONCAT_VECTORS, dl, VT,
44971           extractSubVector(N1, 0, DAG, dl, HalfElts),
44972           extractSubVector(N0.getOperand(0), 0, DAG, dl, HalfElts));
44973     }
44974   }
44975 
44976   // Attempt to recursively combine an OR of shuffles.
44977   if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
44978     SDValue Op(N, 0);
44979     if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
44980       return Res;
44981   }
44982 
44983   return SDValue();
44984 }
44985 
44986 /// Try to turn tests against the signbit in the form of:
44987 ///   XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
44988 /// into:
44989 ///   SETGT(X, -1)
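/// A worked instance (illustrative, for a 32-bit X): trunc(srl(X, 31)) is 1
/// exactly when X is negative, so xoring it with 1 yields "X is
/// non-negative", which is the same predicate as SETGT(X, -1).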
44990 static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
44991   // This is only worth doing if the output type is i8 or i1.
44992   EVT ResultType = N->getValueType(0);
44993   if (ResultType != MVT::i8 && ResultType != MVT::i1)
44994     return SDValue();
44995 
44996   SDValue N0 = N->getOperand(0);
44997   SDValue N1 = N->getOperand(1);
44998 
44999   // We should be performing an xor against a truncated shift.
45000   if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
45001     return SDValue();
45002 
45003   // Make sure we are performing an xor against one.
45004   if (!isOneConstant(N1))
45005     return SDValue();
45006 
45007   // SetCC on x86 zero extends so only act on this if it's a logical shift.
45008   SDValue Shift = N0.getOperand(0);
45009   if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
45010     return SDValue();
45011 
45012   // Make sure we are truncating from one of i16, i32 or i64.
45013   EVT ShiftTy = Shift.getValueType();
45014   if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
45015     return SDValue();
45016 
45017   // Make sure the shift amount extracts the sign bit.
45018   if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
45019       Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1))
45020     return SDValue();
45021 
45022   // Create a greater-than comparison against -1.
45023   // N.B. Using SETGE against 0 works but we want a canonical-looking
45024   // comparison; using SETGT matches up with what TranslateX86CC does.
45025   SDLoc DL(N);
45026   SDValue ShiftOp = Shift.getOperand(0);
45027   EVT ShiftOpTy = ShiftOp.getValueType();
45028   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45029   EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
45030                                                *DAG.getContext(), ResultType);
45031   SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
45032                               DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
45033   if (SetCCResultType != ResultType)
45034     Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
45035   return Cond;
45036 }
45037 
45038 /// Turn vector tests of the signbit in the form of:
45039 ///   xor (sra X, elt_size(X)-1), -1
45040 /// into:
45041 ///   pcmpgt X, -1
45042 ///
45043 /// This should be called before type legalization because the pattern may not
45044 /// persist after that.
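/// Sketch of the reasoning (element-wise, for a 32-bit lane): sra(X, 31)
/// yields -1 for negative lanes and 0 otherwise, so xoring with -1 gives -1
/// exactly for the non-negative lanes, which is what pcmpgt X, -1 produces.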
45045 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
45046                                          const X86Subtarget &Subtarget) {
45047   EVT VT = N->getValueType(0);
45048   if (!VT.isSimple())
45049     return SDValue();
45050 
45051   switch (VT.getSimpleVT().SimpleTy) {
45052   default: return SDValue();
45053   case MVT::v16i8:
45054   case MVT::v8i16:
45055   case MVT::v4i32:
45056   case MVT::v2i64: if (!Subtarget.hasSSE2()) return SDValue(); break;
45057   case MVT::v32i8:
45058   case MVT::v16i16:
45059   case MVT::v8i32:
45060   case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
45061   }
45062 
45063   // There must be a shift right algebraic before the xor, and the xor must be a
45064   // 'not' operation.
45065   SDValue Shift = N->getOperand(0);
45066   SDValue Ones = N->getOperand(1);
45067   if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
45068       !ISD::isBuildVectorAllOnes(Ones.getNode()))
45069     return SDValue();
45070 
45071   // The shift should be smearing the sign bit across each vector element.
45072   auto *ShiftAmt =
45073       isConstOrConstSplat(Shift.getOperand(1), /*AllowUndefs*/ true);
45074   if (!ShiftAmt ||
45075       ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1))
45076     return SDValue();
45077 
45078   // Create a greater-than comparison against -1. We don't use the more obvious
45079   // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
45080   return DAG.getSetCC(SDLoc(N), VT, Shift.getOperand(0), Ones, ISD::SETGT);
45081 }
45082 
45083 /// Detect patterns of truncation with unsigned saturation:
45084 ///
45085 /// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
45086 ///   Return the source value x to be truncated or SDValue() if the pattern was
45087 ///   not matched.
45088 ///
45089 /// 2. (truncate (smin (smax (x, C1), C2)) to dest_type),
45090 ///   where C1 >= 0 and C2 is unsigned max of destination type.
45091 ///
45092 ///    (truncate (smax (smin (x, C2), C1)) to dest_type)
45093 ///   where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2.
45094 ///
45095 ///   These two patterns are equivalent to:
45096 ///   (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type)
45097 ///   So return the smax(x, C1) value to be truncated or SDValue() if the
45098 ///   pattern was not matched.
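/// For instance (illustrative widths), truncating i32 to i8 with unsigned
/// saturation matches (truncate (umin (x, 255)) to i8): 255 is the i8
/// unsigned max, i.e. an 8-bit mask, which is what the C2.isMask(...) check
/// below verifies (8 bits in this example).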
45099 static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
45100                                  const SDLoc &DL) {
45101   EVT InVT = In.getValueType();
45102 
45103   // Saturation with truncation. We truncate from InVT to VT.
45104   assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() &&
45105          "Unexpected types for truncate operation");
45106 
45107   // Match min/max and return limit value as a parameter.
45108   auto MatchMinMax = [](SDValue V, unsigned Opcode, APInt &Limit) -> SDValue {
45109     if (V.getOpcode() == Opcode &&
45110         ISD::isConstantSplatVector(V.getOperand(1).getNode(), Limit))
45111       return V.getOperand(0);
45112     return SDValue();
45113   };
45114 
45115   APInt C1, C2;
45116   if (SDValue UMin = MatchMinMax(In, ISD::UMIN, C2))
45117     // C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
45118     // the element size of the destination type.
45119     if (C2.isMask(VT.getScalarSizeInBits()))
45120       return UMin;
45121 
45122   if (SDValue SMin = MatchMinMax(In, ISD::SMIN, C2))
45123     if (MatchMinMax(SMin, ISD::SMAX, C1))
45124       if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()))
45125         return SMin;
45126 
45127   if (SDValue SMax = MatchMinMax(In, ISD::SMAX, C1))
45128     if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, C2))
45129       if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) &&
45130           C2.uge(C1)) {
45131         return DAG.getNode(ISD::SMAX, DL, InVT, SMin, In.getOperand(1));
45132       }
45133 
45134   return SDValue();
45135 }
45136 
45137 /// Detect patterns of truncation with signed saturation:
45138 /// (truncate (smin ((smax (x, signed_min_of_dest_type)),
45139 ///                  signed_max_of_dest_type)) to dest_type)
45140 /// or:
45141 /// (truncate (smax ((smin (x, signed_max_of_dest_type)),
45142 ///                  signed_min_of_dest_type)) to dest_type).
45143 /// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type].
45144 /// Return the source value to be truncated or SDValue() if the pattern was not
45145 /// matched.
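/// For example (illustrative widths), an i32 -> i8 signed-saturating truncate
/// matches (truncate (smin (smax (x, -128), 127)) to i8); with MatchPackUS
/// the accepted clamp range is instead [0, 255].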
45146 static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
45147   unsigned NumDstBits = VT.getScalarSizeInBits();
45148   unsigned NumSrcBits = In.getScalarValueSizeInBits();
45149   assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");
45150 
45151   auto MatchMinMax = [](SDValue V, unsigned Opcode,
45152                         const APInt &Limit) -> SDValue {
45153     APInt C;
45154     if (V.getOpcode() == Opcode &&
45155         ISD::isConstantSplatVector(V.getOperand(1).getNode(), C) && C == Limit)
45156       return V.getOperand(0);
45157     return SDValue();
45158   };
45159 
45160   APInt SignedMax, SignedMin;
45161   if (MatchPackUS) {
45162     SignedMax = APInt::getAllOnesValue(NumDstBits).zext(NumSrcBits);
45163     SignedMin = APInt(NumSrcBits, 0);
45164   } else {
45165     SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
45166     SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);
45167   }
45168 
45169   if (SDValue SMin = MatchMinMax(In, ISD::SMIN, SignedMax))
45170     if (SDValue SMax = MatchMinMax(SMin, ISD::SMAX, SignedMin))
45171       return SMax;
45172 
45173   if (SDValue SMax = MatchMinMax(In, ISD::SMAX, SignedMin))
45174     if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, SignedMax))
45175       return SMin;
45176 
45177   return SDValue();
45178 }
45179 
45180 static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
45181                                       SelectionDAG &DAG,
45182                                       const X86Subtarget &Subtarget) {
45183   if (!Subtarget.hasSSE2() || !VT.isVector())
45184     return SDValue();
45185 
45186   EVT SVT = VT.getVectorElementType();
45187   EVT InVT = In.getValueType();
45188   EVT InSVT = InVT.getVectorElementType();
45189 
45190   // If we're clamping a signed 32-bit vector to 0-255 and the 32-bit vector is
45191   // split across two registers, we can use a packusdw+perm to clamp to 0-65535
45192   // and concatenate at the same time. Then we can use a final vpmovuswb to
45193   // clip to 0-255.
45194   if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
45195       InVT == MVT::v16i32 && VT == MVT::v16i8) {
45196     if (auto USatVal = detectSSatPattern(In, VT, true)) {
45197       // Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB.
45198       SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal,
45199                                            DL, DAG, Subtarget);
45200       assert(Mid && "Failed to pack!");
45201       return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Mid);
45202     }
45203   }
45204 
45205   // vXi32 truncate instructions are available with AVX512F.
45206   // vXi16 truncate instructions are only available with AVX512BW.
45207   // For 256-bit or smaller vectors, we require VLX.
45208   // FIXME: We could widen truncates to 512 to remove the VLX restriction.
45209   // If the result type is 256 bits or larger and we have disabled 512-bit
45210   // registers, we should go ahead and use the pack instructions if possible.
45211   bool PreferAVX512 = ((Subtarget.hasAVX512() && InSVT == MVT::i32) ||
45212                        (Subtarget.hasBWI() && InSVT == MVT::i16)) &&
45213                       (InVT.getSizeInBits() > 128) &&
45214                       (Subtarget.hasVLX() || InVT.getSizeInBits() > 256) &&
45215                       !(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256);
45216 
45217   if (isPowerOf2_32(VT.getVectorNumElements()) && !PreferAVX512 &&
45218       VT.getSizeInBits() >= 64 &&
45219       (SVT == MVT::i8 || SVT == MVT::i16) &&
45220       (InSVT == MVT::i16 || InSVT == MVT::i32)) {
45221     if (auto USatVal = detectSSatPattern(In, VT, true)) {
45222       // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
45223       // Only do this when the result is at least 64 bits, or we'd be leaving
45224       // dangling PACKSSDW nodes.
45225       if (SVT == MVT::i8 && InSVT == MVT::i32) {
45226         EVT MidVT = VT.changeVectorElementType(MVT::i16);
45227         SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
45228                                              DAG, Subtarget);
45229         assert(Mid && "Failed to pack!");
45230         SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
45231                                            Subtarget);
45232         assert(V && "Failed to pack!");
45233         return V;
45234       } else if (SVT == MVT::i8 || Subtarget.hasSSE41())
45235         return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
45236                                       Subtarget);
45237     }
45238     if (auto SSatVal = detectSSatPattern(In, VT))
45239       return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
45240                                     Subtarget);
45241   }
45242 
45243   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45244   if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 &&
45245       Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI())) {
45246     unsigned TruncOpc = 0;
45247     SDValue SatVal;
45248     if (auto SSatVal = detectSSatPattern(In, VT)) {
45249       SatVal = SSatVal;
45250       TruncOpc = X86ISD::VTRUNCS;
45251     } else if (auto USatVal = detectUSatPattern(In, VT, DAG, DL)) {
45252       SatVal = USatVal;
45253       TruncOpc = X86ISD::VTRUNCUS;
45254     }
45255     if (SatVal) {
45256       unsigned ResElts = VT.getVectorNumElements();
45257       // If the input type is less than 512 bits and we don't have VLX, we need
45258       // to widen to 512 bits.
45259       if (!Subtarget.hasVLX() && !InVT.is512BitVector()) {
45260         unsigned NumConcats = 512 / InVT.getSizeInBits();
45261         ResElts *= NumConcats;
45262         SmallVector<SDValue, 4> ConcatOps(NumConcats, DAG.getUNDEF(InVT));
45263         ConcatOps[0] = SatVal;
45264         InVT = EVT::getVectorVT(*DAG.getContext(), InSVT,
45265                                 NumConcats * InVT.getVectorNumElements());
45266         SatVal = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, ConcatOps);
45267       }
45268       // Widen the result if it's narrower than 128 bits.
45269       if (ResElts * SVT.getSizeInBits() < 128)
45270         ResElts = 128 / SVT.getSizeInBits();
45271       EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), SVT, ResElts);
45272       SDValue Res = DAG.getNode(TruncOpc, DL, TruncVT, SatVal);
45273       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
45274                          DAG.getIntPtrConstant(0, DL));
45275     }
45276   }
45277 
45278   return SDValue();
45279 }
45280 
45281 /// This function detects the AVG pattern between vectors of unsigned i8/i16,
45282 /// which is c = (a + b + 1) / 2, and replaces this operation with the
45283 /// efficient X86ISD::AVG instruction.
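/// A small numeric check of why the wider intermediate type in the matched
/// pattern matters (values illustrative): for a = 254, b = 255 the exact
/// average is (254 + 255 + 1) / 2 = 255, but the intermediate sum 510 needs
/// 9 bits, which is why the pattern zero-extends before adding; PAVGB/PAVGW
/// produce the same rounded result without the explicit widening.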
45284 static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
45285                                 const X86Subtarget &Subtarget,
45286                                 const SDLoc &DL) {
45287   if (!VT.isVector())
45288     return SDValue();
45289   EVT InVT = In.getValueType();
45290   unsigned NumElems = VT.getVectorNumElements();
45291 
45292   EVT ScalarVT = VT.getVectorElementType();
45293   if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) && NumElems >= 2))
45294     return SDValue();
45295 
45296   // InScalarVT is the intermediate type in the AVG pattern and it should be
45297   // wider than the original input type (i8/i16).
45298   EVT InScalarVT = InVT.getVectorElementType();
45299   if (InScalarVT.getFixedSizeInBits() <= ScalarVT.getFixedSizeInBits())
45300     return SDValue();
45301 
45302   if (!Subtarget.hasSSE2())
45303     return SDValue();
45304 
45305   // Detect the following pattern:
45306   //
45307   //   %1 = zext <N x i8> %a to <N x i32>
45308   //   %2 = zext <N x i8> %b to <N x i32>
45309   //   %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
45310   //   %4 = add nuw nsw <N x i32> %3, %2
45311   //   %5 = lshr <N x i32> %4, <i32 1 x N>
45312   //   %6 = trunc <N x i32> %5 to <N x i8>
45313   //
45314   // In AVX512, the last instruction can also be a trunc store.
45315   if (In.getOpcode() != ISD::SRL)
45316     return SDValue();
45317 
45318   // A lambda checking that the given SDValue is a constant vector and that each
45319   // element is in the range [Min, Max].
45320   auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
45321     return ISD::matchUnaryPredicate(V, [Min, Max](ConstantSDNode *C) {
45322       return !(C->getAPIntValue().ult(Min) || C->getAPIntValue().ugt(Max));
45323     });
45324   };
45325 
45326   // Check if each element of the vector is right-shifted by one.
45327   SDValue LHS = In.getOperand(0);
45328   SDValue RHS = In.getOperand(1);
45329   if (!IsConstVectorInRange(RHS, 1, 1))
45330     return SDValue();
45331   if (LHS.getOpcode() != ISD::ADD)
45332     return SDValue();
45333 
45334   // Detect a pattern of a + b + 1 where the order doesn't matter.
45335   SDValue Operands[3];
45336   Operands[0] = LHS.getOperand(0);
45337   Operands[1] = LHS.getOperand(1);
45338 
45339   auto AVGBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
45340                        ArrayRef<SDValue> Ops) {
45341     return DAG.getNode(X86ISD::AVG, DL, Ops[0].getValueType(), Ops);
45342   };
45343 
45344   auto AVGSplitter = [&](SDValue Op0, SDValue Op1) {
45345     // Pad to a power-of-2 vector, split+apply and extract the original vector.
45346     unsigned NumElemsPow2 = PowerOf2Ceil(NumElems);
45347     EVT Pow2VT = EVT::getVectorVT(*DAG.getContext(), ScalarVT, NumElemsPow2);
45348     if (NumElemsPow2 != NumElems) {
45349       SmallVector<SDValue, 32> Ops0(NumElemsPow2, DAG.getUNDEF(ScalarVT));
45350       SmallVector<SDValue, 32> Ops1(NumElemsPow2, DAG.getUNDEF(ScalarVT));
45351       for (unsigned i = 0; i != NumElems; ++i) {
45352         SDValue Idx = DAG.getIntPtrConstant(i, DL);
45353         Ops0[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Op0, Idx);
45354         Ops1[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Op1, Idx);
45355       }
45356       Op0 = DAG.getBuildVector(Pow2VT, DL, Ops0);
45357       Op1 = DAG.getBuildVector(Pow2VT, DL, Ops1);
45358     }
45359     SDValue Res =
45360         SplitOpsAndApply(DAG, Subtarget, DL, Pow2VT, {Op0, Op1}, AVGBuilder);
45361     if (NumElemsPow2 == NumElems)
45362       return Res;
45363     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
45364                        DAG.getIntPtrConstant(0, DL));
45365   };
45366 
45367   // Take care of the case when one of the operands is a constant vector whose
45368   // elements are all in the range [1, 256] (or [1, 65536] for i16).
45369   if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
45370       Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
45371       Operands[0].getOperand(0).getValueType() == VT) {
45372     // The pattern is detected. Subtract one from the constant vector, then
45373     // demote it and emit X86ISD::AVG instruction.
45374     SDValue VecOnes = DAG.getConstant(1, DL, InVT);
45375     Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
45376     Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
45377     return AVGSplitter(Operands[0].getOperand(0), Operands[1]);
45378   }
45379 
45380   // Matches 'add like' patterns: add(Op0,Op1) + zext(or(Op0,Op1)).
45381   // Match the or case only if its 'add-like' - can be replaced by an add.
45382   auto FindAddLike = [&](SDValue V, SDValue &Op0, SDValue &Op1) {
45383     if (ISD::ADD == V.getOpcode()) {
45384       Op0 = V.getOperand(0);
45385       Op1 = V.getOperand(1);
45386       return true;
45387     }
45388     if (ISD::ZERO_EXTEND != V.getOpcode())
45389       return false;
45390     V = V.getOperand(0);
45391     if (V.getValueType() != VT || ISD::OR != V.getOpcode() ||
45392         !DAG.haveNoCommonBitsSet(V.getOperand(0), V.getOperand(1)))
45393       return false;
45394     Op0 = V.getOperand(0);
45395     Op1 = V.getOperand(1);
45396     return true;
45397   };
45398 
45399   SDValue Op0, Op1;
45400   if (FindAddLike(Operands[0], Op0, Op1))
45401     std::swap(Operands[0], Operands[1]);
45402   else if (!FindAddLike(Operands[1], Op0, Op1))
45403     return SDValue();
45404   Operands[2] = Op0;
45405   Operands[1] = Op1;
45406 
45407   // Now we have three operands of two additions. Check that one of them is a
45408   // constant vector with ones, and the other two can be promoted from i8/i16.
45409   for (int i = 0; i < 3; ++i) {
45410     if (!IsConstVectorInRange(Operands[i], 1, 1))
45411       continue;
45412     std::swap(Operands[i], Operands[2]);
45413 
45414     // Check if Operands[0] and Operands[1] are results of type promotion.
45415     for (int j = 0; j < 2; ++j)
45416       if (Operands[j].getValueType() != VT) {
45417         if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
45418             Operands[j].getOperand(0).getValueType() != VT)
45419           return SDValue();
45420         Operands[j] = Operands[j].getOperand(0);
45421       }
45422 
45423     // The pattern is detected, emit X86ISD::AVG instruction(s).
45424     return AVGSplitter(Operands[0], Operands[1]);
45425   }
45426 
45427   return SDValue();
45428 }
45429 
45430 static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
45431                            TargetLowering::DAGCombinerInfo &DCI,
45432                            const X86Subtarget &Subtarget) {
45433   LoadSDNode *Ld = cast<LoadSDNode>(N);
45434   EVT RegVT = Ld->getValueType(0);
45435   EVT MemVT = Ld->getMemoryVT();
45436   SDLoc dl(Ld);
45437   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45438 
45439   // For chips with slow 32-byte unaligned loads, break the 32-byte operation
45440   // into two 16-byte operations. Also split non-temporal aligned loads on
45441   // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
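  // e.g. a 32-byte v8f32 load may become two 16-byte v4f32 loads from Ptr and
  // Ptr+16 that are recombined with CONCAT_VECTORS.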
45442   ISD::LoadExtType Ext = Ld->getExtensionType();
45443   bool Fast;
45444   if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
45445       Ext == ISD::NON_EXTLOAD &&
45446       ((Ld->isNonTemporal() && !Subtarget.hasInt256() &&
45447         Ld->getAlignment() >= 16) ||
45448        (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
45449                                *Ld->getMemOperand(), &Fast) &&
45450         !Fast))) {
45451     unsigned NumElems = RegVT.getVectorNumElements();
45452     if (NumElems < 2)
45453       return SDValue();
45454 
45455     unsigned HalfOffset = 16;
45456     SDValue Ptr1 = Ld->getBasePtr();
45457     SDValue Ptr2 =
45458         DAG.getMemBasePlusOffset(Ptr1, TypeSize::Fixed(HalfOffset), dl);
45459     EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
45460                                   NumElems / 2);
45461     SDValue Load1 =
45462         DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(),
45463                     Ld->getOriginalAlign(),
45464                     Ld->getMemOperand()->getFlags());
45465     SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2,
45466                                 Ld->getPointerInfo().getWithOffset(HalfOffset),
45467                                 Ld->getOriginalAlign(),
45468                                 Ld->getMemOperand()->getFlags());
45469     SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
45470                              Load1.getValue(1), Load2.getValue(1));
45471 
45472     SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
45473     return DCI.CombineTo(N, NewVec, TF, true);
45474   }
45475 
45476   // Bool vector load - attempt to cast to an integer, as we have good
45477   // (vXiY *ext(vXi1 bitcast(iX))) handling.
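  // e.g. a v16i1 load can be rebuilt as an i16 load plus a bitcast to v16i1,
  // provided i16 is a legal type.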
45478   if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() &&
45479       RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) {
45480     unsigned NumElts = RegVT.getVectorNumElements();
45481     EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
45482     if (TLI.isTypeLegal(IntVT)) {
45483       SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
45484                                     Ld->getPointerInfo(),
45485                                     Ld->getOriginalAlign(),
45486                                     Ld->getMemOperand()->getFlags());
45487       SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad);
45488       return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true);
45489     }
45490   }
45491 
45492   // If this load is also broadcast as a subvector to a wider type, then just
45493   // extract the lowest subvector.
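  // e.g. if the same pointer also feeds a wider X86ISD::SUBV_BROADCAST_LOAD of
  // this memory, reuse that node's value and extract its low bits instead of
  // performing a second load.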
45494   if (Ext == ISD::NON_EXTLOAD && Subtarget.hasAVX() && Ld->isSimple() &&
45495       (RegVT.is128BitVector() || RegVT.is256BitVector())) {
45496     SDValue Ptr = Ld->getBasePtr();
45497     SDValue Chain = Ld->getChain();
45498     for (SDNode *User : Ptr->uses()) {
45499       if (User != N && User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
45500           cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
45501           cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
45502           cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
45503               MemVT.getSizeInBits() &&
45504           !User->hasAnyUseOfValue(1) &&
45505           User->getValueSizeInBits(0).getFixedSize() >
45506               RegVT.getFixedSizeInBits()) {
45507         SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
45508                                            RegVT.getSizeInBits());
45509         Extract = DAG.getBitcast(RegVT, Extract);
45510         return DCI.CombineTo(N, Extract, SDValue(User, 1));
45511       }
45512     }
45513   }
45514 
45515   // Cast ptr32 and ptr64 pointers to the default address space before a load.
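  // e.g. a load through a ptr32_uptr pointer on x86-64 is rebuilt as a load
  // through an addrspacecast of the pointer to the default address space.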
45516   unsigned AddrSpace = Ld->getAddressSpace();
45517   if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
45518       AddrSpace == X86AS::PTR32_UPTR) {
45519     MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
45520     if (PtrVT != Ld->getBasePtr().getSimpleValueType()) {
45521       SDValue Cast =
45522           DAG.getAddrSpaceCast(dl, PtrVT, Ld->getBasePtr(), AddrSpace, 0);
45523       return DAG.getLoad(RegVT, dl, Ld->getChain(), Cast, Ld->getPointerInfo(),
45524                          Ld->getOriginalAlign(),
45525                          Ld->getMemOperand()->getFlags());
45526     }
45527   }
45528 
45529   return SDValue();
45530 }
45531 
45532 /// If V is a build vector of boolean constants and exactly one of those
45533 /// constants is true, return the operand index of that true element.
45534 /// Otherwise, return -1.
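/// For example, <i1 0, i1 1, i1 0, i1 0> returns 1, while a vector with two or
/// more true elements (or a non-constant, non-undef element) returns -1.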
45535 static int getOneTrueElt(SDValue V) {
45536   // This needs to be a build vector of booleans.
45537   // TODO: Checking for the i1 type matches the IR definition for the mask,
45538   // but the mask check could be loosened to i8 or other types. That might
45539   // also require checking more than 'allOnesValue'; eg, the x86 HW
45540   // instructions only require that the MSB is set for each mask element.
45541   // The ISD::MSTORE comments/definition do not specify how the mask operand
45542   // is formatted.
45543   auto *BV = dyn_cast<BuildVectorSDNode>(V);
45544   if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
45545     return -1;
45546 
45547   int TrueIndex = -1;
45548   unsigned NumElts = BV->getValueType(0).getVectorNumElements();
45549   for (unsigned i = 0; i < NumElts; ++i) {
45550     const SDValue &Op = BV->getOperand(i);
45551     if (Op.isUndef())
45552       continue;
45553     auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
45554     if (!ConstNode)
45555       return -1;
45556     if (ConstNode->getAPIntValue().countTrailingOnes() >= 1) {
45557       // If we already found a one, this is too many.
45558       if (TrueIndex >= 0)
45559         return -1;
45560       TrueIndex = i;
45561     }
45562   }
45563   return TrueIndex;
45564 }
45565 
45566 /// Given a masked memory load/store operation, return true if it has one mask
45567 /// bit set. If it has one mask bit set, then also return the memory address of
45568 /// the scalar element to load/store, the vector index to insert/extract that
45569 /// scalar element, and the alignment for the scalar memory access.
45570 static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
45571                                          SelectionDAG &DAG, SDValue &Addr,
45572                                          SDValue &Index, Align &Alignment,
45573                                          unsigned &Offset) {
45574   int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
45575   if (TrueMaskElt < 0)
45576     return false;
45577 
45578   // Get the address of the one scalar element that is specified by the mask
45579   // using the appropriate offset from the base pointer.
45580   EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
45581   Offset = 0;
45582   Addr = MaskedOp->getBasePtr();
45583   if (TrueMaskElt != 0) {
45584     Offset = TrueMaskElt * EltVT.getStoreSize();
45585     Addr = DAG.getMemBasePlusOffset(Addr, TypeSize::Fixed(Offset),
45586                                     SDLoc(MaskedOp));
45587   }
45588 
45589   Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
45590   Alignment = commonAlignment(MaskedOp->getOriginalAlign(),
45591                               EltVT.getStoreSize());
45592   return true;
45593 }
45594 
45595 /// If exactly one element of the mask is set for a non-extending masked load,
45596 /// it is a scalar load and vector insert.
45597 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
45598 /// mask have already been optimized in IR, so we don't bother with those here.
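/// For example, a v4f32 masked load whose mask is <0,0,1,0> becomes a scalar
/// f32 load from BasePtr+8 inserted into the pass-through vector at index 2.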
45599 static SDValue
45600 reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
45601                              TargetLowering::DAGCombinerInfo &DCI,
45602                              const X86Subtarget &Subtarget) {
45603   assert(ML->isUnindexed() && "Unexpected indexed masked load!");
45604   // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
45605   // However, some target hooks may need to be added to know when the transform
45606   // is profitable. Endianness would also have to be considered.
45607 
45608   SDValue Addr, VecIndex;
45609   Align Alignment;
45610   unsigned Offset;
45611   if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment, Offset))
45612     return SDValue();
45613 
45614   // Load the one scalar element that is specified by the mask using the
45615   // appropriate offset from the base pointer.
45616   SDLoc DL(ML);
45617   EVT VT = ML->getValueType(0);
45618   EVT EltVT = VT.getVectorElementType();
45619 
45620   EVT CastVT = VT;
45621   if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
45622     EltVT = MVT::f64;
45623     CastVT = VT.changeVectorElementType(EltVT);
45624   }
45625 
45626   SDValue Load =
45627       DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
45628                   ML->getPointerInfo().getWithOffset(Offset),
45629                   Alignment, ML->getMemOperand()->getFlags());
45630 
45631   SDValue PassThru = DAG.getBitcast(CastVT, ML->getPassThru());
45632 
45633   // Insert the loaded element into the appropriate place in the vector.
45634   SDValue Insert =
45635       DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, CastVT, PassThru, Load, VecIndex);
45636   Insert = DAG.getBitcast(VT, Insert);
45637   return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
45638 }
45639 
45640 static SDValue
45641 combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
45642                               TargetLowering::DAGCombinerInfo &DCI) {
45643   assert(ML->isUnindexed() && "Unexpected indexed masked load!");
45644   if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
45645     return SDValue();
45646 
45647   SDLoc DL(ML);
45648   EVT VT = ML->getValueType(0);
45649 
45650   // If we are loading the first and last elements of a vector, it is safe and
45651   // always faster to load the whole vector. Replace the masked load with a
45652   // vector load and select.
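  // e.g. a v4f32 masked load with mask <1,0,0,1> becomes a plain v4f32 load
  // blended with the pass-through value via a vselect.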
45653   unsigned NumElts = VT.getVectorNumElements();
45654   BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
45655   bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
45656   bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
45657   if (LoadFirstElt && LoadLastElt) {
45658     SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
45659                                 ML->getMemOperand());
45660     SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd,
45661                                   ML->getPassThru());
45662     return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
45663   }
45664 
45665   // Convert a masked load with a constant mask into a masked load and a select.
45666   // This allows the select operation to use a faster kind of select instruction
45667   // (for example, vblendvps -> vblendps).
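  // e.g. masked_load(Ptr, Mask, PassThru)
  //        -> select(Mask, masked_load(Ptr, Mask, undef), PassThru)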
45668 
45669   // Don't try this if the pass-through operand is already undefined. That would
45670   // cause an infinite loop because that's what we're about to create.
45671   if (ML->getPassThru().isUndef())
45672     return SDValue();
45673 
45674   if (ISD::isBuildVectorAllZeros(ML->getPassThru().getNode()))
45675     return SDValue();
45676 
45677   // The new masked load has an undef pass-through operand. The select uses the
45678   // original pass-through operand.
45679   SDValue NewML = DAG.getMaskedLoad(
45680       VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(),
45681       DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(),
45682       ML->getAddressingMode(), ML->getExtensionType());
45683   SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML,
45684                                 ML->getPassThru());
45685 
45686   return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
45687 }
45688 
45689 static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
45690                                  TargetLowering::DAGCombinerInfo &DCI,
45691                                  const X86Subtarget &Subtarget) {
45692   auto *Mld = cast<MaskedLoadSDNode>(N);
45693 
45694   // TODO: Expanding load with constant mask may be optimized as well.
45695   if (Mld->isExpandingLoad())
45696     return SDValue();
45697 
45698   if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
45699     if (SDValue ScalarLoad =
45700             reduceMaskedLoadToScalarLoad(Mld, DAG, DCI, Subtarget))
45701       return ScalarLoad;
45702 
45703     // TODO: Do some AVX512 subsets benefit from this transform?
45704     if (!Subtarget.hasAVX512())
45705       if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
45706         return Blend;
45707   }
45708 
45709   // If the mask value has been legalized to a non-boolean vector, try to
45710   // simplify ops leading up to it. We only demand the MSB of each lane.
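  // The AVX masked load instructions only test the sign bit of each mask
  // element, so e.g. a mask that was sign-extended from a compare result only
  // needs its top bit preserved.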
45711   SDValue Mask = Mld->getMask();
45712   if (Mask.getScalarValueSizeInBits() != 1) {
45713     EVT VT = Mld->getValueType(0);
45714     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45715     APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
45716     if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
45717       if (N->getOpcode() != ISD::DELETED_NODE)
45718         DCI.AddToWorklist(N);
45719       return SDValue(N, 0);
45720     }
45721     if (SDValue NewMask =
45722             TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
45723       return DAG.getMaskedLoad(
45724           VT, SDLoc(N), Mld->getChain(), Mld->getBasePtr(), Mld->getOffset(),
45725           NewMask, Mld->getPassThru(), Mld->getMemoryVT(), Mld->getMemOperand(),
45726           Mld->getAddressingMode(), Mld->getExtensionType());
45727   }
45728 
45729   return SDValue();
45730 }
45731 
45732 /// If exactly one element of the mask is set for a non-truncating masked store,
45733 /// it is a vector extract and scalar store.
45734 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
45735 /// mask have already been optimized in IR, so we don't bother with those here.
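/// For example, a v8i16 masked store whose only set mask element is index 6
/// becomes an extract of element 6 and a scalar i16 store at BasePtr+12.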
45736 static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
45737                                               SelectionDAG &DAG,
45738                                               const X86Subtarget &Subtarget) {
45739   // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
45740   // However, some target hooks may need to be added to know when the transform
45741   // is profitable. Endianness would also have to be considered.
45742 
45743   SDValue Addr, VecIndex;
45744   Align Alignment;
45745   unsigned Offset;
45746   if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment, Offset))
45747     return SDValue();
45748 
45749   // Extract the one scalar element that is actually being stored.
45750   SDLoc DL(MS);
45751   SDValue Value = MS->getValue();
45752   EVT VT = Value.getValueType();
45753   EVT EltVT = VT.getVectorElementType();
45754   if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
45755     EltVT = MVT::f64;
45756     EVT CastVT = VT.changeVectorElementType(EltVT);
45757     Value = DAG.getBitcast(CastVT, Value);
45758   }
45759   SDValue Extract =
45760       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Value, VecIndex);
45761 
45762   // Store that element at the appropriate offset from the base pointer.
45763   return DAG.getStore(MS->getChain(), DL, Extract, Addr,
45764                       MS->getPointerInfo().getWithOffset(Offset),
45765                       Alignment, MS->getMemOperand()->getFlags());
45766 }
45767 
45768 static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
45769                                   TargetLowering::DAGCombinerInfo &DCI,
45770                                   const X86Subtarget &Subtarget) {
45771   MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
45772   if (Mst->isCompressingStore())
45773     return SDValue();
45774 
45775   EVT VT = Mst->getValue().getValueType();
45776   SDLoc dl(Mst);
45777   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45778 
45779   if (Mst->isTruncatingStore())
45780     return SDValue();
45781 
45782   if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG, Subtarget))
45783     return ScalarStore;
45784 
45785   // If the mask value has been legalized to a non-boolean vector, try to
45786   // simplify ops leading up to it. We only demand the MSB of each lane.
45787   SDValue Mask = Mst->getMask();
45788   if (Mask.getScalarValueSizeInBits() != 1) {
45789     APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
45790     if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
45791       if (N->getOpcode() != ISD::DELETED_NODE)
45792         DCI.AddToWorklist(N);
45793       return SDValue(N, 0);
45794     }
45795     if (SDValue NewMask =
45796             TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
45797       return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Mst->getValue(),
45798                                 Mst->getBasePtr(), Mst->getOffset(), NewMask,
45799                                 Mst->getMemoryVT(), Mst->getMemOperand(),
45800                                 Mst->getAddressingMode());
45801   }
45802 
45803   SDValue Value = Mst->getValue();
45804   if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
45805       TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
45806                             Mst->getMemoryVT())) {
45807     return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
45808                               Mst->getBasePtr(), Mst->getOffset(), Mask,
45809                               Mst->getMemoryVT(), Mst->getMemOperand(),
45810                               Mst->getAddressingMode(), true);
45811   }
45812 
45813   return SDValue();
45814 }
45815 
45816 static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
45817                             TargetLowering::DAGCombinerInfo &DCI,
45818                             const X86Subtarget &Subtarget) {
45819   StoreSDNode *St = cast<StoreSDNode>(N);
45820   EVT StVT = St->getMemoryVT();
45821   SDLoc dl(St);
45822   SDValue StoredVal = St->getValue();
45823   EVT VT = StoredVal.getValueType();
45824   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45825 
45826   // Convert a store of vXi1 into a store of iX and a bitcast.
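  // e.g. store <16 x i1> %v  -->  store i16 (bitcast %v to i16).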
45827   if (!Subtarget.hasAVX512() && VT == StVT && VT.isVector() &&
45828       VT.getVectorElementType() == MVT::i1) {
45829 
45830     EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
45831     StoredVal = DAG.getBitcast(NewVT, StoredVal);
45832 
45833     return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
45834                         St->getPointerInfo(), St->getOriginalAlign(),
45835                         St->getMemOperand()->getFlags());
45836   }
45837 
45838   // If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
45839   // This will avoid a copy to k-register.
45840   if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
45841       StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
45842       StoredVal.getOperand(0).getValueType() == MVT::i8) {
45843     SDValue Val = StoredVal.getOperand(0);
45844     // We must store zeros to the unused bits.
45845     Val = DAG.getZeroExtendInReg(Val, dl, MVT::i1);
45846     return DAG.getStore(St->getChain(), dl, Val,
45847                         St->getBasePtr(), St->getPointerInfo(),
45848                         St->getOriginalAlign(),
45849                         St->getMemOperand()->getFlags());
45850   }
45851 
45852   // Widen v2i1/v4i1 stores to v8i1.
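  // e.g. a v2i1 store is widened to v8i1 by concatenating zero vectors, so the
  // unused mask bits are stored as zeros.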
45853   if ((VT == MVT::v1i1 || VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
45854       Subtarget.hasAVX512()) {
45855     unsigned NumConcats = 8 / VT.getVectorNumElements();
45856     // We must store zeros to the unused bits.
45857     SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, VT));
45858     Ops[0] = StoredVal;
45859     StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
45860     return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
45861                         St->getPointerInfo(), St->getOriginalAlign(),
45862                         St->getMemOperand()->getFlags());
45863   }
45864 
45865   // Turn vXi1 stores of constants into a scalar store.
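  // e.g. store <8 x i1> <1,0,1,1,0,0,0,0> becomes store i8 13 (bit i of the
  // scalar holds element i).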
45866   if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
45867        VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
45868       ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
45869     // If it's a v64i1 store without 64-bit support, we need two stores.
45870     if (!DCI.isBeforeLegalize() && VT == MVT::v64i1 && !Subtarget.is64Bit()) {
45871       SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
45872                                       StoredVal->ops().slice(0, 32));
45873       Lo = combinevXi1ConstantToInteger(Lo, DAG);
45874       SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
45875                                       StoredVal->ops().slice(32, 32));
45876       Hi = combinevXi1ConstantToInteger(Hi, DAG);
45877 
45878       SDValue Ptr0 = St->getBasePtr();
45879       SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, TypeSize::Fixed(4), dl);
45880 
45881       SDValue Ch0 =
45882           DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
45883                        St->getOriginalAlign(),
45884                        St->getMemOperand()->getFlags());
45885       SDValue Ch1 =
45886           DAG.getStore(St->getChain(), dl, Hi, Ptr1,
45887                        St->getPointerInfo().getWithOffset(4),
45888                        St->getOriginalAlign(),
45889                        St->getMemOperand()->getFlags());
45890       return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
45891     }
45892 
45893     StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
45894     return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
45895                         St->getPointerInfo(), St->getOriginalAlign(),
45896                         St->getMemOperand()->getFlags());
45897   }
45898 
45899   // If we are saving a 32-byte vector and 32-byte stores are slow, such as on
45900   // Sandy Bridge, perform two 16-byte stores.
45901   bool Fast;
45902   if (VT.is256BitVector() && StVT == VT &&
45903       TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
45904                              *St->getMemOperand(), &Fast) &&
45905       !Fast) {
45906     unsigned NumElems = VT.getVectorNumElements();
45907     if (NumElems < 2)
45908       return SDValue();
45909 
45910     return splitVectorStore(St, DAG);
45911   }
45912 
45913   // Split under-aligned vector non-temporal stores.
45914   if (St->isNonTemporal() && StVT == VT &&
45915       St->getAlignment() < VT.getStoreSize()) {
45916     // ZMM/YMM nt-stores - either it can be stored as a series of shorter
45917     // vectors or the legalizer can scalarize it to use MOVNTI.
45918     if (VT.is256BitVector() || VT.is512BitVector()) {
45919       unsigned NumElems = VT.getVectorNumElements();
45920       if (NumElems < 2)
45921         return SDValue();
45922       return splitVectorStore(St, DAG);
45923     }
45924 
45925     // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
45926     // to use MOVNTI.
45927     if (VT.is128BitVector() && Subtarget.hasSSE2()) {
45928       MVT NTVT = Subtarget.hasSSE4A()
45929                      ? MVT::v2f64
45930                      : (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32);
45931       return scalarizeVectorStore(St, NTVT, DAG);
45932     }
45933   }
45934 
45935   // Try to optimize v16i16->v16i8 truncating stores when BWI is not
45936   // supported, but AVX512F is, by extending to v16i32 and truncating.
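  // e.g. truncstore v16i8 (trunc v16i16 %x) becomes an any_extend of %x to
  // v16i32 followed by a legal v16i32->v16i8 truncating store (vpmovdb).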
45937   if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() &&
45938       St->getValue().getOpcode() == ISD::TRUNCATE &&
45939       St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
45940       TLI.isTruncStoreLegal(MVT::v16i32, MVT::v16i8) &&
45941       St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) {
45942     SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32, St->getValue());
45943     return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
45944                              MVT::v16i8, St->getMemOperand());
45945   }
45946 
45947   // Try to fold a VTRUNCUS or VTRUNCS into a truncating store.
45948   if (!St->isTruncatingStore() && StoredVal.hasOneUse() &&
45949       (StoredVal.getOpcode() == X86ISD::VTRUNCUS ||
45950        StoredVal.getOpcode() == X86ISD::VTRUNCS) &&
45951       TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) {
45952     bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS;
45953     return EmitTruncSStore(IsSigned, St->getChain(),
45954                            dl, StoredVal.getOperand(0), St->getBasePtr(),
45955                            VT, St->getMemOperand(), DAG);
45956   }
45957 
45958   // Try to fold an extract_element(VTRUNC) pattern into a truncating store.
45959   if (!St->isTruncatingStore() && StoredVal.hasOneUse()) {
45960     auto IsExtractedElement = [](SDValue V) {
45961       if (V.getOpcode() == ISD::TRUNCATE && V.getOperand(0).hasOneUse())
45962         V = V.getOperand(0);
45963       unsigned Opc = V.getOpcode();
45964       if (Opc == ISD::EXTRACT_VECTOR_ELT || Opc == X86ISD::PEXTRW) {
45965         if (V.getOperand(0).hasOneUse() && isNullConstant(V.getOperand(1)))
45966           return V.getOperand(0);
45967       }
45968       return SDValue();
45969     };
45970     if (SDValue Extract = IsExtractedElement(StoredVal)) {
45971       SDValue Trunc = peekThroughOneUseBitcasts(Extract);
45972       if (Trunc.getOpcode() == X86ISD::VTRUNC) {
45973         SDValue Src = Trunc.getOperand(0);
45974         MVT DstVT = Trunc.getSimpleValueType();
45975         MVT SrcVT = Src.getSimpleValueType();
45976         unsigned NumSrcElts = SrcVT.getVectorNumElements();
45977         unsigned NumTruncBits = DstVT.getScalarSizeInBits() * NumSrcElts;
45978         MVT TruncVT = MVT::getVectorVT(DstVT.getScalarType(), NumSrcElts);
45979         if (NumTruncBits == VT.getSizeInBits() &&
45980             TLI.isTruncStoreLegal(SrcVT, TruncVT)) {
45981           return DAG.getTruncStore(St->getChain(), dl, Src, St->getBasePtr(),
45982                                    TruncVT, St->getMemOperand());
45983         }
45984       }
45985     }
45986   }
45987 
45988   // Optimize trunc store (of multiple scalars) to shuffle and store.
45989   // First, pack all of the elements in one place. Next, store to memory
45990   // in fewer chunks.
45991   if (St->isTruncatingStore() && VT.isVector()) {
45992     // Check if we can detect an AVG pattern from the truncation. If yes,
45993     // replace the trunc store by a normal store with the result of X86ISD::AVG
45994     // instruction.
45995     if (DCI.isBeforeLegalize() || TLI.isTypeLegal(St->getMemoryVT()))
45996       if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
45997                                          Subtarget, dl))
45998         return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
45999                             St->getPointerInfo(), St->getOriginalAlign(),
46000                             St->getMemOperand()->getFlags());
46001 
46002     if (TLI.isTruncStoreLegal(VT, StVT)) {
46003       if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT()))
46004         return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
46005                                dl, Val, St->getBasePtr(),
46006                                St->getMemoryVT(), St->getMemOperand(), DAG);
46007       if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(),
46008                                           DAG, dl))
46009         return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
46010                                dl, Val, St->getBasePtr(),
46011                                St->getMemoryVT(), St->getMemOperand(), DAG);
46012     }
46013 
46014     return SDValue();
46015   }
46016 
46017   // Cast ptr32 and ptr64 pointers to the default address space before a store.
46018   unsigned AddrSpace = St->getAddressSpace();
46019   if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
46020       AddrSpace == X86AS::PTR32_UPTR) {
46021     MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
46022     if (PtrVT != St->getBasePtr().getSimpleValueType()) {
46023       SDValue Cast =
46024           DAG.getAddrSpaceCast(dl, PtrVT, St->getBasePtr(), AddrSpace, 0);
46025       return DAG.getStore(St->getChain(), dl, StoredVal, Cast,
46026                           St->getPointerInfo(), St->getOriginalAlign(),
46027                           St->getMemOperand()->getFlags(), St->getAAInfo());
46028     }
46029   }
46030 
46031   // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
46032   // the FP state in cases where an emms may be missing.
46033   // A preferable solution to the general problem is to figure out the right
46034   // places to insert EMMS.  This qualifies as a quick hack.
46035 
46036   // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
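  // e.g. on 32-bit x86 with SSE2, (store i64 (load i64 %p)) becomes an f64
  // load/store pair so the copy is a single 64-bit move.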
46037   if (VT.getSizeInBits() != 64)
46038     return SDValue();
46039 
46040   const Function &F = DAG.getMachineFunction().getFunction();
46041   bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
46042   bool F64IsLegal =
46043       !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
46044   if ((VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit()) &&
46045       isa<LoadSDNode>(St->getValue()) &&
46046       cast<LoadSDNode>(St->getValue())->isSimple() &&
46047       St->getChain().hasOneUse() && St->isSimple()) {
46048     LoadSDNode *Ld = cast<LoadSDNode>(St->getValue().getNode());
46049 
46050     if (!ISD::isNormalLoad(Ld))
46051       return SDValue();
46052 
46053     // Avoid the transformation if there are multiple uses of the loaded value.
46054     if (!Ld->hasNUsesOfValue(1, 0))
46055       return SDValue();
46056 
46057     SDLoc LdDL(Ld);
46058     SDLoc StDL(N);
46059     // Lower to a single movq load/store pair.
46060     SDValue NewLd = DAG.getLoad(MVT::f64, LdDL, Ld->getChain(),
46061                                 Ld->getBasePtr(), Ld->getMemOperand());
46062 
46063     // Make sure new load is placed in same chain order.
46064     DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
46065     return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
46066                         St->getMemOperand());
46067   }
46068 
46069   // This is similar to the above case, but here we handle a scalar 64-bit
46070   // integer store that is extracted from a vector on a 32-bit target.
46071   // If we have SSE2, then we can treat it like a floating-point double
46072   // to get past legalization. The execution dependencies fixup pass will
46073   // choose the optimal machine instruction for the store if this really is
46074   // an integer or v2f32 rather than an f64.
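  // e.g. store i64 (extractelement v2i64 %v, 1) becomes
  //      store f64 (extractelement (bitcast %v to v2f64), 1).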
46075   if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
46076       St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
46077     SDValue OldExtract = St->getOperand(1);
46078     SDValue ExtOp0 = OldExtract.getOperand(0);
46079     unsigned VecSize = ExtOp0.getValueSizeInBits();
46080     EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
46081     SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
46082     SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
46083                                      BitCast, OldExtract.getOperand(1));
46084     return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
46085                         St->getPointerInfo(), St->getOriginalAlign(),
46086                         St->getMemOperand()->getFlags());
46087   }
46088 
46089   return SDValue();
46090 }
46091 
46092 static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
46093                                      TargetLowering::DAGCombinerInfo &DCI,
46094                                      const X86Subtarget &Subtarget) {
46095   auto *St = cast<MemIntrinsicSDNode>(N);
46096 
46097   SDValue StoredVal = N->getOperand(1);
46098   MVT VT = StoredVal.getSimpleValueType();
46099   EVT MemVT = St->getMemoryVT();
46100 
46101   // Figure out which elements we demand.
46102   unsigned StElts = MemVT.getSizeInBits() / VT.getScalarSizeInBits();
46103   APInt DemandedElts = APInt::getLowBitsSet(VT.getVectorNumElements(), StElts);
46104 
46105   APInt KnownUndef, KnownZero;
46106   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46107   if (TLI.SimplifyDemandedVectorElts(StoredVal, DemandedElts, KnownUndef,
46108                                      KnownZero, DCI)) {
46109     if (N->getOpcode() != ISD::DELETED_NODE)
46110       DCI.AddToWorklist(N);
46111     return SDValue(N, 0);
46112   }
46113 
46114   return SDValue();
46115 }
46116 
46117 /// Return 'true' if this vector operation is "horizontal"
46118 /// and return the operands for the horizontal operation in LHS and RHS.  A
46119 /// horizontal operation performs the binary operation on successive elements
46120 /// of its first operand, then on successive elements of its second operand,
46121 /// returning the resulting values in a vector.  For example, if
46122 ///   A = < float a0, float a1, float a2, float a3 >
46123 /// and
46124 ///   B = < float b0, float b1, float b2, float b3 >
46125 /// then the result of doing a horizontal operation on A and B is
46126 ///   A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
46127 /// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
46128 /// A horizontal-op B, for some already available A and B, and if so then LHS is
46129 /// set to A, RHS to B, and the routine returns 'true'.
46130 static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS,
46131                               SelectionDAG &DAG, const X86Subtarget &Subtarget,
46132                               bool IsCommutative,
46133                               SmallVectorImpl<int> &PostShuffleMask) {
46134   // If either operand is undef, bail out. The binop should be simplified.
46135   if (LHS.isUndef() || RHS.isUndef())
46136     return false;
46137 
46138   // Look for the following pattern:
46139   //   A = < float a0, float a1, float a2, float a3 >
46140   //   B = < float b0, float b1, float b2, float b3 >
46141   // and
46142   //   LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
46143   //   RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
46144   // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
46145   // which is A horizontal-op B.
46146 
46147   MVT VT = LHS.getSimpleValueType();
46148   assert((VT.is128BitVector() || VT.is256BitVector()) &&
46149          "Unsupported vector type for horizontal add/sub");
46150   unsigned NumElts = VT.getVectorNumElements();
46151 
46152   auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1,
46153                         SmallVectorImpl<int> &ShuffleMask) {
46154     bool UseSubVector = false;
46155     if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
46156         Op.getOperand(0).getValueType().is256BitVector() &&
46157         llvm::isNullConstant(Op.getOperand(1))) {
46158       Op = Op.getOperand(0);
46159       UseSubVector = true;
46160     }
46161     SmallVector<SDValue, 2> SrcOps;
46162     SmallVector<int, 16> SrcMask, ScaledMask;
46163     SDValue BC = peekThroughBitcasts(Op);
46164     if (getTargetShuffleInputs(BC, SrcOps, SrcMask, DAG) &&
46165         !isAnyZero(SrcMask) && all_of(SrcOps, [BC](SDValue Op) {
46166           return Op.getValueSizeInBits() == BC.getValueSizeInBits();
46167         })) {
46168       resolveTargetShuffleInputsAndMask(SrcOps, SrcMask);
46169       if (!UseSubVector && SrcOps.size() <= 2 &&
46170           scaleShuffleElements(SrcMask, NumElts, ScaledMask)) {
46171         N0 = SrcOps.size() > 0 ? SrcOps[0] : SDValue();
46172         N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue();
46173         ShuffleMask.assign(ScaledMask.begin(), ScaledMask.end());
46174       }
46175       if (UseSubVector && SrcOps.size() == 1 &&
46176           scaleShuffleElements(SrcMask, 2 * NumElts, ScaledMask)) {
46177         std::tie(N0, N1) = DAG.SplitVector(SrcOps[0], SDLoc(Op));
46178         ArrayRef<int> Mask = ArrayRef<int>(ScaledMask).slice(0, NumElts);
46179         ShuffleMask.assign(Mask.begin(), Mask.end());
46180       }
46181     }
46182   };
46183 
46184   // View LHS in the form
46185   //   LHS = VECTOR_SHUFFLE A, B, LMask
46186   // If LHS is not a shuffle, then pretend it is the identity shuffle:
46187   //   LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
46188   // NOTE: A default initialized SDValue represents an UNDEF of type VT.
46189   SDValue A, B;
46190   SmallVector<int, 16> LMask;
46191   GetShuffle(LHS, A, B, LMask);
46192 
46193   // Likewise, view RHS in the form
46194   //   RHS = VECTOR_SHUFFLE C, D, RMask
46195   SDValue C, D;
46196   SmallVector<int, 16> RMask;
46197   GetShuffle(RHS, C, D, RMask);
46198 
46199   // At least one of the operands should be a vector shuffle.
46200   unsigned NumShuffles = (LMask.empty() ? 0 : 1) + (RMask.empty() ? 0 : 1);
46201   if (NumShuffles == 0)
46202     return false;
46203 
46204   if (LMask.empty()) {
46205     A = LHS;
46206     for (unsigned i = 0; i != NumElts; ++i)
46207       LMask.push_back(i);
46208   }
46209 
46210   if (RMask.empty()) {
46211     C = RHS;
46212     for (unsigned i = 0; i != NumElts; ++i)
46213       RMask.push_back(i);
46214   }
46215 
46216   // If we have a unary mask, ensure the other op is set to null.
46217   if (isUndefOrInRange(LMask, 0, NumElts))
46218     B = SDValue();
46219   else if (isUndefOrInRange(LMask, NumElts, NumElts * 2))
46220     A = SDValue();
46221 
46222   if (isUndefOrInRange(RMask, 0, NumElts))
46223     D = SDValue();
46224   else if (isUndefOrInRange(RMask, NumElts, NumElts * 2))
46225     C = SDValue();
46226 
46227   // If A and B occur in reverse order in RHS, then canonicalize by commuting
46228   // RHS operands and shuffle mask.
46229   if (A != C) {
46230     std::swap(C, D);
46231     ShuffleVectorSDNode::commuteMask(RMask);
46232   }
46233   // Check that the shuffles are both shuffling the same vectors.
46234   if (!(A == C && B == D))
46235     return false;
46236 
46237   PostShuffleMask.clear();
46238   PostShuffleMask.append(NumElts, SM_SentinelUndef);
46239 
46240   // LHS and RHS are now:
46241   //   LHS = shuffle A, B, LMask
46242   //   RHS = shuffle A, B, RMask
46243   // Check that the masks correspond to performing a horizontal operation.
46244   // AVX defines horizontal add/sub to operate independently on 128-bit lanes,
46245   // so we just repeat the inner loop if this is a 256-bit op.
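  // e.g. a v8f32 HADD produces <a0+a1, a2+a3, b0+b1, b2+b3> in the low 128-bit
  // lane and <a4+a5, a6+a7, b4+b5, b6+b7> in the high lane.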
46246   unsigned Num128BitChunks = VT.getSizeInBits() / 128;
46247   unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
46248   unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
46249   assert((NumEltsPer128BitChunk % 2 == 0) &&
46250          "Vector type should have an even number of elements in each lane");
46251   for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
46252     for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) {
46253       // Ignore undefined components.
46254       int LIdx = LMask[i + j], RIdx = RMask[i + j];
46255       if (LIdx < 0 || RIdx < 0 ||
46256           (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
46257           (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
46258         continue;
46259 
46260       // Check that successive odd/even elements are being operated on. If not,
46261       // this is not a horizontal operation.
46262       if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) &&
46263           !((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative))
46264         return false;
46265 
46266       // Compute the post-shuffle mask index based on where the element
46267       // is stored in the HOP result, and where it needs to be moved to.
46268       int Base = LIdx & ~1u;
46269       int Index = ((Base % NumEltsPer128BitChunk) / 2) +
46270                   ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1));
46271 
46272       // The low half of the 128-bit result must choose from A.
46273       // The high half of the 128-bit result must choose from B,
46274       // unless B is undef. In that case, we are always choosing from A.
46275       if ((B && Base >= (int)NumElts) || (!B && i >= NumEltsPer64BitChunk))
46276         Index += NumEltsPer64BitChunk;
46277       PostShuffleMask[i + j] = Index;
46278     }
46279   }
46280 
46281   SDValue NewLHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
46282   SDValue NewRHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
46283 
46284   bool IsIdentityPostShuffle =
46285       isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0);
46286   if (IsIdentityPostShuffle)
46287     PostShuffleMask.clear();
46288 
46289   // Avoid 128-bit multi lane shuffles if pre-AVX2 and FP (integer will split).
46290   if (!IsIdentityPostShuffle && !Subtarget.hasAVX2() && VT.isFloatingPoint() &&
46291       isMultiLaneShuffleMask(128, VT.getScalarSizeInBits(), PostShuffleMask))
46292     return false;
46293 
46294   // If the source nodes are already used in HorizOps then always accept this.
46295   // Shuffle folding should merge these back together.
46296   bool FoundHorizLHS = llvm::any_of(NewLHS->uses(), [&](SDNode *User) {
46297     return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
46298   });
46299   bool FoundHorizRHS = llvm::any_of(NewRHS->uses(), [&](SDNode *User) {
46300     return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
46301   });
46302   bool ForceHorizOp = FoundHorizLHS && FoundHorizRHS;
46303 
46304   // Assume a SingleSource HOP if we only shuffle one input and don't need to
46305   // shuffle the result.
46306   if (!ForceHorizOp &&
46307       !shouldUseHorizontalOp(NewLHS == NewRHS &&
46308                                  (NumShuffles < 2 || !IsIdentityPostShuffle),
46309                              DAG, Subtarget))
46310     return false;
46311 
46312   LHS = DAG.getBitcast(VT, NewLHS);
46313   RHS = DAG.getBitcast(VT, NewRHS);
46314   return true;
46315 }
46316 
46317 // Try to synthesize horizontal (f)hadd/hsub from (f)adds/subs of shuffles.
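// e.g. fadd (shuffle A, B, <0,2,4,6>), (shuffle A, B, <1,3,5,7>)
//        --> X86ISD::FHADD A, B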
46318 static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
46319                                          const X86Subtarget &Subtarget) {
46320   EVT VT = N->getValueType(0);
46321   unsigned Opcode = N->getOpcode();
46322   bool IsAdd = (Opcode == ISD::FADD) || (Opcode == ISD::ADD);
46323   SmallVector<int, 8> PostShuffleMask;
46324 
46325   switch (Opcode) {
46326   case ISD::FADD:
46327   case ISD::FSUB:
46328     if ((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
46329         (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
46330       SDValue LHS = N->getOperand(0);
46331       SDValue RHS = N->getOperand(1);
46332       auto HorizOpcode = IsAdd ? X86ISD::FHADD : X86ISD::FHSUB;
46333       if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
46334                             PostShuffleMask)) {
46335         SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
46336         if (!PostShuffleMask.empty())
46337           HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
46338                                             DAG.getUNDEF(VT), PostShuffleMask);
46339         return HorizBinOp;
46340       }
46341     }
46342     break;
46343   case ISD::ADD:
46344   case ISD::SUB:
46345     if (Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
46346                                  VT == MVT::v16i16 || VT == MVT::v8i32)) {
46347       SDValue LHS = N->getOperand(0);
46348       SDValue RHS = N->getOperand(1);
46349       auto HorizOpcode = IsAdd ? X86ISD::HADD : X86ISD::HSUB;
46350       if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
46351                             PostShuffleMask)) {
46352         auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
46353                                         ArrayRef<SDValue> Ops) {
46354           return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops);
46355         };
46356         SDValue HorizBinOp = SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
46357                                               {LHS, RHS}, HOpBuilder);
46358         if (!PostShuffleMask.empty())
46359           HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
46360                                             DAG.getUNDEF(VT), PostShuffleMask);
46361         return HorizBinOp;
46362       }
46363     }
46364     break;
46365   }
46366 
46367   return SDValue();
46368 }
46369 
46370 /// Do target-specific dag combines on floating-point adds/subs.
46371 static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
46372                                const X86Subtarget &Subtarget) {
46373   if (SDValue HOp = combineToHorizontalAddSub(N, DAG, Subtarget))
46374     return HOp;
46375   return SDValue();
46376 }
46377 
46378 /// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
46379 /// the codegen.
46380 /// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
46381 /// TODO: This overlaps with the generic combiner's visitTRUNCATE. Remove
46382 ///       anything that is guaranteed to be transformed by DAGCombiner.
46383 static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
46384                                           const X86Subtarget &Subtarget,
46385                                           const SDLoc &DL) {
46386   assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
46387   SDValue Src = N->getOperand(0);
46388   unsigned SrcOpcode = Src.getOpcode();
46389   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46390 
46391   EVT VT = N->getValueType(0);
46392   EVT SrcVT = Src.getValueType();
46393 
46394   auto IsFreeTruncation = [VT](SDValue Op) {
46395     unsigned TruncSizeInBits = VT.getScalarSizeInBits();
46396 
46397     // See if this has been extended from a smaller/equal size to
46398     // the truncation size, allowing a truncation to combine with the extend.
46399     unsigned Opcode = Op.getOpcode();
46400     if ((Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND ||
46401          Opcode == ISD::ZERO_EXTEND) &&
46402         Op.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
46403       return true;
46404 
46405     // See if this is a single use constant which can be constant folded.
46406     // NOTE: We don't peek throught bitcasts here because there is currently
46407     // NOTE: We don't peek through bitcasts here because there is currently
46408     // no support for constant folding truncate+bitcast+vector_of_constants. So
46409     // we'll just end up with a truncate on both operands which will
46410     return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
46411   };
46412 
46413   auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
46414     SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
46415     SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
46416     return DAG.getNode(SrcOpcode, DL, VT, Trunc0, Trunc1);
46417   };
46418 
46419   // Don't combine if the operation has other uses.
46420   if (!Src.hasOneUse())
46421     return SDValue();
46422 
46423   // Only support vector truncation for now.
46424   // TODO: i64 scalar math would benefit as well.
46425   if (!VT.isVector())
46426     return SDValue();
46427 
46428   // In most cases it's only worth pre-truncating if we're only facing the cost
46429   // of one truncation.
46430   // i.e. if one of the inputs will constant fold or the input is repeated.
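  // e.g. trunc (mul (zext X), (zext Y)) to vXi32, where both extensions are
  // from vXi32 or narrower, becomes mul (trunc (zext X)), (trunc (zext Y)) and
  // the truncations fold away with the extensions.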
46431   switch (SrcOpcode) {
46432   case ISD::MUL:
46433     // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
46434     // better to truncate if we have the chance.
46435     if (SrcVT.getScalarType() == MVT::i64 &&
46436         TLI.isOperationLegal(SrcOpcode, VT) &&
46437         !TLI.isOperationLegal(SrcOpcode, SrcVT))
46438       return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
46439     LLVM_FALLTHROUGH;
46440   case ISD::AND:
46441   case ISD::XOR:
46442   case ISD::OR:
46443   case ISD::ADD:
46444   case ISD::SUB: {
46445     SDValue Op0 = Src.getOperand(0);
46446     SDValue Op1 = Src.getOperand(1);
46447     if (TLI.isOperationLegal(SrcOpcode, VT) &&
46448         (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
46449       return TruncateArithmetic(Op0, Op1);
46450     break;
46451   }
46452   }
46453 
46454   return SDValue();
46455 }
46456 
46457 /// Truncate using ISD::AND mask and X86ISD::PACKUS.
46458 /// e.g. trunc <8 x i32> X to <8 x i16> -->
46459 /// MaskX = X & 0xffff (clear high bits to prevent saturation)
46460 /// packus (extract_subv MaskX, 0), (extract_subv MaskX, 1)
46461 static SDValue combineVectorTruncationWithPACKUS(SDNode *N, const SDLoc &DL,
46462                                                  const X86Subtarget &Subtarget,
46463                                                  SelectionDAG &DAG) {
46464   SDValue In = N->getOperand(0);
46465   EVT InVT = In.getValueType();
46466   EVT OutVT = N->getValueType(0);
46467 
46468   APInt Mask = APInt::getLowBitsSet(InVT.getScalarSizeInBits(),
46469                                     OutVT.getScalarSizeInBits());
46470   In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(Mask, DL, InVT));
46471   return truncateVectorWithPACK(X86ISD::PACKUS, OutVT, In, DL, DAG, Subtarget);
46472 }
46473 
46474 /// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
46475 static SDValue combineVectorTruncationWithPACKSS(SDNode *N, const SDLoc &DL,
46476                                                  const X86Subtarget &Subtarget,
46477                                                  SelectionDAG &DAG) {
46478   SDValue In = N->getOperand(0);
46479   EVT InVT = In.getValueType();
46480   EVT OutVT = N->getValueType(0);
46481   In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, InVT, In,
46482                    DAG.getValueType(OutVT));
46483   return truncateVectorWithPACK(X86ISD::PACKSS, OutVT, In, DL, DAG, Subtarget);
46484 }
46485 
46486 /// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
46487 /// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
46488 /// legalization the truncation will be translated into a BUILD_VECTOR with each
46489 /// element that is extracted from a vector and then truncated, and it is
46490 /// difficult to perform this optimization on that form.
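/// For example, a v8i32 -> v8i16 truncation can mask or sign-extend-in-reg the
/// lanes and then pack the two v4i32 halves with PACKUSDW or PACKSSDW, rather
/// than extracting and truncating eight scalars.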
46491 static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
46492                                        const X86Subtarget &Subtarget) {
46493   EVT OutVT = N->getValueType(0);
46494   if (!OutVT.isVector())
46495     return SDValue();
46496 
46497   SDValue In = N->getOperand(0);
46498   if (!In.getValueType().isSimple())
46499     return SDValue();
46500 
46501   EVT InVT = In.getValueType();
46502   unsigned NumElems = OutVT.getVectorNumElements();
46503 
46504   // AVX512 provides fast truncate ops.
46505   if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
46506     return SDValue();
46507 
46508   EVT OutSVT = OutVT.getVectorElementType();
46509   EVT InSVT = InVT.getVectorElementType();
46510   if (!((InSVT == MVT::i16 || InSVT == MVT::i32 || InSVT == MVT::i64) &&
46511         (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
46512         NumElems >= 8))
46513     return SDValue();
46514 
46515   // SSSE3's pshufb results in fewer instructions in the cases below.
46516   if (Subtarget.hasSSSE3() && NumElems == 8 &&
46517       ((OutSVT == MVT::i8 && InSVT != MVT::i64) ||
46518        (InSVT == MVT::i32 && OutSVT == MVT::i16)))
46519     return SDValue();
46520 
46521   SDLoc DL(N);
46522   // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
46523   // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
46524   // truncate 2 x v4i32 to v8i16.
46525   if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
46526     return combineVectorTruncationWithPACKUS(N, DL, Subtarget, DAG);
46527   if (InSVT == MVT::i32)
46528     return combineVectorTruncationWithPACKSS(N, DL, Subtarget, DAG);
46529 
46530   return SDValue();
46531 }
46532 
46533 /// This function transforms vector truncation of 'extended sign-bits' or
46534 /// 'extended zero-bits' values from vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32
46535 /// into X86ISD::PACKSS/PACKUS operations.
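/// For example, truncating a vXi32 comparison result (all sign bits) to vXi16
/// can use PACKSSDW directly, and truncating a value whose upper bits are known
/// zero can use PACKUS.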
46536 static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
46537                                                SelectionDAG &DAG,
46538                                                const X86Subtarget &Subtarget) {
46539   // Requires SSE2.
46540   if (!Subtarget.hasSSE2())
46541     return SDValue();
46542 
46543   if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
46544     return SDValue();
46545 
46546   SDValue In = N->getOperand(0);
46547   if (!In.getValueType().isSimple())
46548     return SDValue();
46549 
46550   MVT VT = N->getValueType(0).getSimpleVT();
46551   MVT SVT = VT.getScalarType();
46552 
46553   MVT InVT = In.getValueType().getSimpleVT();
46554   MVT InSVT = InVT.getScalarType();
46555 
46556   // Check we have a truncation suited for PACKSS/PACKUS.
46557   if (!isPowerOf2_32(VT.getVectorNumElements()))
46558     return SDValue();
46559   if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
46560     return SDValue();
46561   if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
46562     return SDValue();
46563 
46564   // Truncation to sub-128bit vXi32 can be better handled with shuffles.
46565   if (SVT == MVT::i32 && VT.getSizeInBits() < 128)
46566     return SDValue();
46567 
46568   // AVX512 has fast truncate, but if the input is already going to be split,
46569   // there's no harm in trying pack.
46570   if (Subtarget.hasAVX512() &&
46571       !(!Subtarget.useAVX512Regs() && VT.is256BitVector() &&
46572         InVT.is512BitVector())) {
46573     // PACK should still be worth it for 128-bit vectors if the sources were
46574     // originally concatenated from subvectors.
46575     SmallVector<SDValue> ConcatOps;
46576     if (VT.getSizeInBits() > 128 || !collectConcatOps(In.getNode(), ConcatOps))
46577       return SDValue();
46578   }
46579 
46580   unsigned NumPackedSignBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
46581   unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
46582 
46583   // Use PACKUS if the input has zero-bits that extend all the way to the
46584   // packed/truncated value. e.g. masks, zext_in_reg, etc.
46585   KnownBits Known = DAG.computeKnownBits(In);
46586   unsigned NumLeadingZeroBits = Known.countMinLeadingZeros();
46587   if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedZeroBits))
46588     return truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget);
46589 
46590   // Use PACKSS if the input has sign-bits that extend all the way to the
46591   // packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
46592   unsigned NumSignBits = DAG.ComputeNumSignBits(In);
46593 
46594   // Don't use PACKSS for vXi64 -> vXi32 truncations unless we're dealing with
46595   // a sign splat. ComputeNumSignBits struggles to see through BITCASTs later
46596   // on and combines/simplifications can't then use it.
46597   if (SVT == MVT::i32 && NumSignBits != InSVT.getSizeInBits())
46598     return SDValue();
46599 
46600   unsigned MinSignBits = InSVT.getSizeInBits() - NumPackedSignBits;
46601   if (NumSignBits > MinSignBits)
46602     return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);
46603 
46604   // If we have an srl that only generates sign bits that we will discard in
46605   // the truncation then we can use PACKSS by converting the srl to an sra.
46606   // SimplifyDemandedBits often relaxes sra to srl so we need to reverse it.
46607   if (In.getOpcode() == ISD::SRL && N->isOnlyUserOf(In.getNode()))
46608     if (const APInt *ShAmt = DAG.getValidShiftAmountConstant(
46609             In, APInt::getAllOnesValue(VT.getVectorNumElements()))) {
46610       if (*ShAmt == MinSignBits) {
46611         SDValue NewIn = DAG.getNode(ISD::SRA, DL, InVT, In->ops());
46612         return truncateVectorWithPACK(X86ISD::PACKSS, VT, NewIn, DL, DAG,
46613                                       Subtarget);
46614       }
46615     }
46616 
46617   return SDValue();
46618 }
46619 
46620 // Try to form a MULHU or MULHS node by looking for
46621 // (trunc (srl (mul ext, ext), 16))
46622 // TODO: This is X86 specific because we want to be able to handle wide types
46623 // before type legalization. But we can only do it if the vector will be
46624 // legalized via widening/splitting. Type legalization can't handle promotion
46625 // of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
46626 // combiner.
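// For example (illustrative):
//   (v8i16 (trunc (srl (mul (zext v8i16 X to v8i32),
//                            (zext v8i16 Y to v8i32)), 16)))
// becomes (v8i16 (mulhu X, Y)), which selects to PMULHUW.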
46627 static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
46628                             SelectionDAG &DAG, const X86Subtarget &Subtarget) {
46629   // First instruction should be a right shift of a multiply.
46630   if (Src.getOpcode() != ISD::SRL ||
46631       Src.getOperand(0).getOpcode() != ISD::MUL)
46632     return SDValue();
46633 
46634   if (!Subtarget.hasSSE2())
46635     return SDValue();
46636 
46637   // Only handle vXi16 types that are at least 128-bits unless they will be
46638   // widened.
46639   if (!VT.isVector() || VT.getVectorElementType() != MVT::i16)
46640     return SDValue();
46641 
46642   // Input type should be at least vXi32.
46643   EVT InVT = Src.getValueType();
46644   if (InVT.getVectorElementType().getSizeInBits() < 32)
46645     return SDValue();
46646 
46647   // Need a shift by 16.
46648   APInt ShiftAmt;
46649   if (!ISD::isConstantSplatVector(Src.getOperand(1).getNode(), ShiftAmt) ||
46650       ShiftAmt != 16)
46651     return SDValue();
46652 
46653   SDValue LHS = Src.getOperand(0).getOperand(0);
46654   SDValue RHS = Src.getOperand(0).getOperand(1);
46655 
46656   unsigned ExtOpc = LHS.getOpcode();
46657   if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
46658       RHS.getOpcode() != ExtOpc)
46659     return SDValue();
46660 
46661   // Peek through the extends.
46662   LHS = LHS.getOperand(0);
46663   RHS = RHS.getOperand(0);
46664 
46665   // Ensure the input types match.
46666   if (LHS.getValueType() != VT || RHS.getValueType() != VT)
46667     return SDValue();
46668 
46669   unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
46670   return DAG.getNode(Opc, DL, VT, LHS, RHS);
46671 }
46672 
46673 // Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes
46674 // from one vector with signed bytes from another vector, adds together
46675 // adjacent pairs of 16-bit products, and saturates the result before
46676 // truncating to 16-bits.
46677 //
46678 // Which looks something like this:
46679 // (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))),
46680 //                 (mul (zext (odd elts (i8 A)), (sext (odd elts (i8 B))))))))
46681 static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
46682                                const X86Subtarget &Subtarget,
46683                                const SDLoc &DL) {
46684   if (!VT.isVector() || !Subtarget.hasSSSE3())
46685     return SDValue();
46686 
46687   unsigned NumElems = VT.getVectorNumElements();
46688   EVT ScalarVT = VT.getVectorElementType();
46689   if (ScalarVT != MVT::i16 || NumElems < 8 || !isPowerOf2_32(NumElems))
46690     return SDValue();
46691 
46692   SDValue SSatVal = detectSSatPattern(In, VT);
46693   if (!SSatVal || SSatVal.getOpcode() != ISD::ADD)
46694     return SDValue();
46695 
46696   // Ok this is a signed saturation of an ADD. See if this ADD is adding pairs
46697   // of multiplies from even/odd elements.
46698   SDValue N0 = SSatVal.getOperand(0);
46699   SDValue N1 = SSatVal.getOperand(1);
46700 
46701   if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
46702     return SDValue();
46703 
46704   SDValue N00 = N0.getOperand(0);
46705   SDValue N01 = N0.getOperand(1);
46706   SDValue N10 = N1.getOperand(0);
46707   SDValue N11 = N1.getOperand(1);
46708 
46709   // TODO: Handle constant vectors and use knownbits/computenumsignbits?
46710   // Canonicalize zero_extend to LHS.
46711   if (N01.getOpcode() == ISD::ZERO_EXTEND)
46712     std::swap(N00, N01);
46713   if (N11.getOpcode() == ISD::ZERO_EXTEND)
46714     std::swap(N10, N11);
46715 
46716   // Ensure we have a zero_extend and a sign_extend.
46717   if (N00.getOpcode() != ISD::ZERO_EXTEND ||
46718       N01.getOpcode() != ISD::SIGN_EXTEND ||
46719       N10.getOpcode() != ISD::ZERO_EXTEND ||
46720       N11.getOpcode() != ISD::SIGN_EXTEND)
46721     return SDValue();
46722 
46723   // Peek through the extends.
46724   N00 = N00.getOperand(0);
46725   N01 = N01.getOperand(0);
46726   N10 = N10.getOperand(0);
46727   N11 = N11.getOperand(0);
46728 
46729   // Ensure the extend is from vXi8.
46730   if (N00.getValueType().getVectorElementType() != MVT::i8 ||
46731       N01.getValueType().getVectorElementType() != MVT::i8 ||
46732       N10.getValueType().getVectorElementType() != MVT::i8 ||
46733       N11.getValueType().getVectorElementType() != MVT::i8)
46734     return SDValue();
46735 
46736   // All inputs should be build_vectors.
46737   if (N00.getOpcode() != ISD::BUILD_VECTOR ||
46738       N01.getOpcode() != ISD::BUILD_VECTOR ||
46739       N10.getOpcode() != ISD::BUILD_VECTOR ||
46740       N11.getOpcode() != ISD::BUILD_VECTOR)
46741     return SDValue();
46742 
46743   // N00/N10 are zero extended. N01/N11 are sign extended.
46744 
46745   // For each result element, one multiply must combine the even elements of
46746   // the two source vectors and the other multiply must combine the next odd
46747   // elements of the same two vectors. In other words, for each element i the
46748   // following must be computed:
46750   //  A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
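  // For example (illustrative), for result element i == 1 the two multiplies
  // must use extract indices 2 and 3 from the same pair of source vectors.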
46751   SDValue ZExtIn, SExtIn;
46752   for (unsigned i = 0; i != NumElems; ++i) {
46753     SDValue N00Elt = N00.getOperand(i);
46754     SDValue N01Elt = N01.getOperand(i);
46755     SDValue N10Elt = N10.getOperand(i);
46756     SDValue N11Elt = N11.getOperand(i);
46757     // TODO: Be more tolerant to undefs.
46758     if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
46759         N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
46760         N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
46761         N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
46762       return SDValue();
46763     auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
46764     auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
46765     auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
46766     auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
46767     if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
46768       return SDValue();
46769     unsigned IdxN00 = ConstN00Elt->getZExtValue();
46770     unsigned IdxN01 = ConstN01Elt->getZExtValue();
46771     unsigned IdxN10 = ConstN10Elt->getZExtValue();
46772     unsigned IdxN11 = ConstN11Elt->getZExtValue();
46773     // Add is commutative so indices can be reordered.
46774     if (IdxN00 > IdxN10) {
46775       std::swap(IdxN00, IdxN10);
46776       std::swap(IdxN01, IdxN11);
46777     }
46778     // N0 indices must be the even element. N1 indices must be the next odd element.
46779     if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
46780         IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
46781       return SDValue();
46782     SDValue N00In = N00Elt.getOperand(0);
46783     SDValue N01In = N01Elt.getOperand(0);
46784     SDValue N10In = N10Elt.getOperand(0);
46785     SDValue N11In = N11Elt.getOperand(0);
46786     // First time we find an input capture it.
46787     if (!ZExtIn) {
46788       ZExtIn = N00In;
46789       SExtIn = N01In;
46790     }
46791     if (ZExtIn != N00In || SExtIn != N01In ||
46792         ZExtIn != N10In || SExtIn != N11In)
46793       return SDValue();
46794   }
46795 
46796   auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
46797                          ArrayRef<SDValue> Ops) {
46798     // Shrink by adding truncate nodes and let DAGCombine fold with the
46799     // sources.
46800     EVT InVT = Ops[0].getValueType();
46801     assert(InVT.getScalarType() == MVT::i8 &&
46802            "Unexpected scalar element type");
46803     assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
46804     EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
46805                                  InVT.getVectorNumElements() / 2);
46806     return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]);
46807   };
46808   return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn },
46809                           PMADDBuilder);
46810 }
46811 
46812 static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
46813                                const X86Subtarget &Subtarget) {
46814   EVT VT = N->getValueType(0);
46815   SDValue Src = N->getOperand(0);
46816   SDLoc DL(N);
46817 
46818   // Attempt to pre-truncate inputs to arithmetic ops instead.
46819   if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
46820     return V;
46821 
46822   // Try to detect AVG pattern first.
46823   if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
46824     return Avg;
46825 
46826   // Try to detect PMADD
46827   if (SDValue PMAdd = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL))
46828     return PMAdd;
46829 
46830   // Try to combine truncation with signed/unsigned saturation.
46831   if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
46832     return Val;
46833 
46834   // Try to combine PMULHUW/PMULHW for vXi16.
46835   if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
46836     return V;
46837 
46838   // Detect a truncate to i32 of a bitcast from x86mmx: the bitcast source is
46839   // a direct MMX result, so grab the low 32 bits with MMX_MOVD2W.
46840   if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
46841     SDValue BCSrc = Src.getOperand(0);
46842     if (BCSrc.getValueType() == MVT::x86mmx)
46843       return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
46844   }
46845 
46846   // Try to truncate extended sign/zero bits with PACKSS/PACKUS.
46847   if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
46848     return V;
46849 
46850   return combineVectorTruncation(N, DAG, Subtarget);
46851 }
46852 
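// Combine X86ISD::VTRUNC nodes: fold saturated inputs into VTRUNCS/VTRUNCUS
// and otherwise simplify the demanded bits of the result.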
46853 static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG,
46854                              TargetLowering::DAGCombinerInfo &DCI) {
46855   EVT VT = N->getValueType(0);
46856   SDValue In = N->getOperand(0);
46857   SDLoc DL(N);
46858 
46859   if (auto SSatVal = detectSSatPattern(In, VT))
46860     return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
46861   if (auto USatVal = detectUSatPattern(In, VT, DAG, DL))
46862     return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
46863 
46864   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46865   APInt DemandedMask(APInt::getAllOnesValue(VT.getScalarSizeInBits()));
46866   if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
46867     return SDValue(N, 0);
46868 
46869   return SDValue();
46870 }
46871 
46872 /// Returns the negated value if the node \p N flips sign of FP value.
46873 ///
46874 /// FP-negation node may have different forms: FNEG(x), FXOR (x, 0x80000000)
46875 /// or FSUB(0, x)
46876 /// AVX512F does not have FXOR, so FNEG is lowered as
46877 /// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
46878 /// In this case we go through all bitcasts.
46879 /// This also recognizes splat of a negated value and returns the splat of that
46880 /// value.
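/// For example (illustrative, f32): FXOR(X, bitcast<f32>(0x80000000)) flips
/// only the sign bit of X, so it is recognized here as a negation of X.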
46881 static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
46882   if (N->getOpcode() == ISD::FNEG)
46883     return N->getOperand(0);
46884 
46885   // Don't recurse exponentially.
46886   if (Depth > SelectionDAG::MaxRecursionDepth)
46887     return SDValue();
46888 
46889   unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits();
46890 
46891   SDValue Op = peekThroughBitcasts(SDValue(N, 0));
46892   EVT VT = Op->getValueType(0);
46893 
46894   // Make sure the element size doesn't change.
46895   if (VT.getScalarSizeInBits() != ScalarSize)
46896     return SDValue();
46897 
46898   unsigned Opc = Op.getOpcode();
46899   switch (Opc) {
46900   case ISD::VECTOR_SHUFFLE: {
46901     // For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate
46902     // of this is VECTOR_SHUFFLE(-VEC1, UNDEF).  The mask can be anything here.
46903     if (!Op.getOperand(1).isUndef())
46904       return SDValue();
46905     if (SDValue NegOp0 = isFNEG(DAG, Op.getOperand(0).getNode(), Depth + 1))
46906       if (NegOp0.getValueType() == VT) // FIXME: Can we do better?
46907         return DAG.getVectorShuffle(VT, SDLoc(Op), NegOp0, DAG.getUNDEF(VT),
46908                                     cast<ShuffleVectorSDNode>(Op)->getMask());
46909     break;
46910   }
46911   case ISD::INSERT_VECTOR_ELT: {
46912     // Negate of INSERT_VECTOR_ELT(UNDEF, V, INDEX) is INSERT_VECTOR_ELT(UNDEF,
46913     // -V, INDEX).
46914     SDValue InsVector = Op.getOperand(0);
46915     SDValue InsVal = Op.getOperand(1);
46916     if (!InsVector.isUndef())
46917       return SDValue();
46918     if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode(), Depth + 1))
46919       if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME
46920         return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
46921                            NegInsVal, Op.getOperand(2));
46922     break;
46923   }
46924   case ISD::FSUB:
46925   case ISD::XOR:
46926   case X86ISD::FXOR: {
46927     SDValue Op1 = Op.getOperand(1);
46928     SDValue Op0 = Op.getOperand(0);
46929 
46930     // For XOR and FXOR, we want to check if constant
46931     // bits of Op1 are sign bit masks. For FSUB, we
46932     // have to check if constant bits of Op0 are sign
46933     // bit masks and hence we swap the operands.
46934     if (Opc == ISD::FSUB)
46935       std::swap(Op0, Op1);
46936 
46937     APInt UndefElts;
46938     SmallVector<APInt, 16> EltBits;
46939     // Extract constant bits and see if they are all
46940     // sign bit masks. Ignore the undef elements.
46941     if (getTargetConstantBitsFromNode(Op1, ScalarSize, UndefElts, EltBits,
46942                                       /* AllowWholeUndefs */ true,
46943                                       /* AllowPartialUndefs */ false)) {
46944       for (unsigned I = 0, E = EltBits.size(); I < E; I++)
46945         if (!UndefElts[I] && !EltBits[I].isSignMask())
46946           return SDValue();
46947 
46948       return peekThroughBitcasts(Op0);
46949     }
46950   }
46951   }
46952 
46953   return SDValue();
46954 }
46955 
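// Return the FMA-family opcode obtained by negating the multiply (NegMul),
// the accumulator (NegAcc) and/or the whole result (NegRes). For example
// (illustrative), negating the accumulator turns FMA ((a * b) + c) into
// FMSUB ((a * b) - c).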
46956 static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
46957                                 bool NegRes) {
46958   if (NegMul) {
46959     switch (Opcode) {
46960     default: llvm_unreachable("Unexpected opcode");
46961     case ISD::FMA:              Opcode = X86ISD::FNMADD;        break;
46962     case ISD::STRICT_FMA:       Opcode = X86ISD::STRICT_FNMADD; break;
46963     case X86ISD::FMADD_RND:     Opcode = X86ISD::FNMADD_RND;    break;
46964     case X86ISD::FMSUB:         Opcode = X86ISD::FNMSUB;        break;
46965     case X86ISD::STRICT_FMSUB:  Opcode = X86ISD::STRICT_FNMSUB; break;
46966     case X86ISD::FMSUB_RND:     Opcode = X86ISD::FNMSUB_RND;    break;
46967     case X86ISD::FNMADD:        Opcode = ISD::FMA;              break;
46968     case X86ISD::STRICT_FNMADD: Opcode = ISD::STRICT_FMA;       break;
46969     case X86ISD::FNMADD_RND:    Opcode = X86ISD::FMADD_RND;     break;
46970     case X86ISD::FNMSUB:        Opcode = X86ISD::FMSUB;         break;
46971     case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FMSUB;  break;
46972     case X86ISD::FNMSUB_RND:    Opcode = X86ISD::FMSUB_RND;     break;
46973     }
46974   }
46975 
46976   if (NegAcc) {
46977     switch (Opcode) {
46978     default: llvm_unreachable("Unexpected opcode");
46979     case ISD::FMA:              Opcode = X86ISD::FMSUB;         break;
46980     case ISD::STRICT_FMA:       Opcode = X86ISD::STRICT_FMSUB;  break;
46981     case X86ISD::FMADD_RND:     Opcode = X86ISD::FMSUB_RND;     break;
46982     case X86ISD::FMSUB:         Opcode = ISD::FMA;              break;
46983     case X86ISD::STRICT_FMSUB:  Opcode = ISD::STRICT_FMA;       break;
46984     case X86ISD::FMSUB_RND:     Opcode = X86ISD::FMADD_RND;     break;
46985     case X86ISD::FNMADD:        Opcode = X86ISD::FNMSUB;        break;
46986     case X86ISD::STRICT_FNMADD: Opcode = X86ISD::STRICT_FNMSUB; break;
46987     case X86ISD::FNMADD_RND:    Opcode = X86ISD::FNMSUB_RND;    break;
46988     case X86ISD::FNMSUB:        Opcode = X86ISD::FNMADD;        break;
46989     case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FNMADD; break;
46990     case X86ISD::FNMSUB_RND:    Opcode = X86ISD::FNMADD_RND;    break;
46991     case X86ISD::FMADDSUB:      Opcode = X86ISD::FMSUBADD;      break;
46992     case X86ISD::FMADDSUB_RND:  Opcode = X86ISD::FMSUBADD_RND;  break;
46993     case X86ISD::FMSUBADD:      Opcode = X86ISD::FMADDSUB;      break;
46994     case X86ISD::FMSUBADD_RND:  Opcode = X86ISD::FMADDSUB_RND;  break;
46995     }
46996   }
46997 
46998   if (NegRes) {
46999     switch (Opcode) {
47000     // For accuracy reasons, we never combine fneg and fma under strict FP.
47001     default: llvm_unreachable("Unexpected opcode");
47002     case ISD::FMA:             Opcode = X86ISD::FNMSUB;       break;
47003     case X86ISD::FMADD_RND:    Opcode = X86ISD::FNMSUB_RND;   break;
47004     case X86ISD::FMSUB:        Opcode = X86ISD::FNMADD;       break;
47005     case X86ISD::FMSUB_RND:    Opcode = X86ISD::FNMADD_RND;   break;
47006     case X86ISD::FNMADD:       Opcode = X86ISD::FMSUB;        break;
47007     case X86ISD::FNMADD_RND:   Opcode = X86ISD::FMSUB_RND;    break;
47008     case X86ISD::FNMSUB:       Opcode = ISD::FMA;             break;
47009     case X86ISD::FNMSUB_RND:   Opcode = X86ISD::FMADD_RND;    break;
47010     }
47011   }
47012 
47013   return Opcode;
47014 }
47015 
47016 /// Do target-specific dag combines on floating point negations.
47017 static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
47018                            TargetLowering::DAGCombinerInfo &DCI,
47019                            const X86Subtarget &Subtarget) {
47020   EVT OrigVT = N->getValueType(0);
47021   SDValue Arg = isFNEG(DAG, N);
47022   if (!Arg)
47023     return SDValue();
47024 
47025   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47026   EVT VT = Arg.getValueType();
47027   EVT SVT = VT.getScalarType();
47028   SDLoc DL(N);
47029 
47030   // Let legalize expand this if it isn't a legal type yet.
47031   if (!TLI.isTypeLegal(VT))
47032     return SDValue();
47033 
47034   // If we're negating a FMUL node on a target with FMA, then we can avoid the
47035   // use of a constant by performing (-0 - A*B) instead.
47036   // FIXME: Check rounding control flags as well once it becomes available.
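  // For example (illustrative): FNMSUB(A, B, 0.0) computes -(A * B) - 0.0,
  // which equals -(A * B) whenever signed zeros can be ignored (hence the
  // hasNoSignedZeros check below).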
47037   if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
47038       Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
47039     SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
47040     SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
47041                                   Arg.getOperand(1), Zero);
47042     return DAG.getBitcast(OrigVT, NewNode);
47043   }
47044 
47045   bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
47046   bool LegalOperations = !DCI.isBeforeLegalizeOps();
47047   if (SDValue NegArg =
47048           TLI.getNegatedExpression(Arg, DAG, LegalOperations, CodeSize))
47049     return DAG.getBitcast(OrigVT, NegArg);
47050 
47051   return SDValue();
47052 }
47053 
47054 SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
47055                                                 bool LegalOperations,
47056                                                 bool ForCodeSize,
47057                                                 NegatibleCost &Cost,
47058                                                 unsigned Depth) const {
47059   // fneg patterns are removable even if they have multiple uses.
47060   if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth)) {
47061     Cost = NegatibleCost::Cheaper;
47062     return DAG.getBitcast(Op.getValueType(), Arg);
47063   }
47064 
47065   EVT VT = Op.getValueType();
47066   EVT SVT = VT.getScalarType();
47067   unsigned Opc = Op.getOpcode();
47068   SDNodeFlags Flags = Op.getNode()->getFlags();
47069   switch (Opc) {
47070   case ISD::FMA:
47071   case X86ISD::FMSUB:
47072   case X86ISD::FNMADD:
47073   case X86ISD::FNMSUB:
47074   case X86ISD::FMADD_RND:
47075   case X86ISD::FMSUB_RND:
47076   case X86ISD::FNMADD_RND:
47077   case X86ISD::FNMSUB_RND: {
47078     if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) ||
47079         !(SVT == MVT::f32 || SVT == MVT::f64) ||
47080         !isOperationLegal(ISD::FMA, VT))
47081       break;
47082 
47083     // Don't fold (fneg (fma (fneg x), y, (fneg z))) to (fma x, y, z)
47084     // if it may have signed zeros.
47085     if (!Flags.hasNoSignedZeros())
47086       break;
47087 
47088     // This is always negatible for free but we might be able to remove some
47089     // extra operand negations as well.
47090     SmallVector<SDValue, 4> NewOps(Op.getNumOperands(), SDValue());
47091     for (int i = 0; i != 3; ++i)
47092       NewOps[i] = getCheaperNegatedExpression(
47093           Op.getOperand(i), DAG, LegalOperations, ForCodeSize, Depth + 1);
47094 
47095     bool NegA = !!NewOps[0];
47096     bool NegB = !!NewOps[1];
47097     bool NegC = !!NewOps[2];
47098     unsigned NewOpc = negateFMAOpcode(Opc, NegA != NegB, NegC, true);
47099 
47100     Cost = (NegA || NegB || NegC) ? NegatibleCost::Cheaper
47101                                   : NegatibleCost::Neutral;
47102 
47103     // Fill in the non-negated ops with the original values.
47104     for (int i = 0, e = Op.getNumOperands(); i != e; ++i)
47105       if (!NewOps[i])
47106         NewOps[i] = Op.getOperand(i);
47107     return DAG.getNode(NewOpc, SDLoc(Op), VT, NewOps);
47108   }
47109   case X86ISD::FRCP:
47110     if (SDValue NegOp0 =
47111             getNegatedExpression(Op.getOperand(0), DAG, LegalOperations,
47112                                  ForCodeSize, Cost, Depth + 1))
47113       return DAG.getNode(Opc, SDLoc(Op), VT, NegOp0);
47114     break;
47115   }
47116 
47117   return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
47118                                               ForCodeSize, Cost, Depth);
47119 }
47120 
47121 static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
47122                                  const X86Subtarget &Subtarget) {
47123   MVT VT = N->getSimpleValueType(0);
47124   // If we have integer vector types available, use the integer opcodes.
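  // For example (illustrative): (v4f32 FAND X, Y) is rewritten as
  // (bitcast (v4i32 AND (bitcast X), (bitcast Y))), which selects PAND
  // rather than ANDPS.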
47125   if (!VT.isVector() || !Subtarget.hasSSE2())
47126     return SDValue();
47127 
47128   SDLoc dl(N);
47129 
47130   unsigned IntBits = VT.getScalarSizeInBits();
47131   MVT IntSVT = MVT::getIntegerVT(IntBits);
47132   MVT IntVT = MVT::getVectorVT(IntSVT, VT.getSizeInBits() / IntBits);
47133 
47134   SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
47135   SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
47136   unsigned IntOpcode;
47137   switch (N->getOpcode()) {
47138   default: llvm_unreachable("Unexpected FP logic op");
47139   case X86ISD::FOR:   IntOpcode = ISD::OR; break;
47140   case X86ISD::FXOR:  IntOpcode = ISD::XOR; break;
47141   case X86ISD::FAND:  IntOpcode = ISD::AND; break;
47142   case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
47143   }
47144   SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
47145   return DAG.getBitcast(VT, IntOp);
47146 }
47147 
47148 
47149 /// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
47150 static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
47151   if (N->getOpcode() != ISD::XOR)
47152     return SDValue();
47153 
47154   SDValue LHS = N->getOperand(0);
47155   if (!isOneConstant(N->getOperand(1)) || LHS->getOpcode() != X86ISD::SETCC)
47156     return SDValue();
47157 
47158   X86::CondCode NewCC = X86::GetOppositeBranchCondition(
47159       X86::CondCode(LHS->getConstantOperandVal(0)));
47160   SDLoc DL(N);
47161   return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
47162 }
47163 
47164 static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
47165                           TargetLowering::DAGCombinerInfo &DCI,
47166                           const X86Subtarget &Subtarget) {
47167   SDValue N0 = N->getOperand(0);
47168   SDValue N1 = N->getOperand(1);
47169   EVT VT = N->getValueType(0);
47170 
47171   // If this is SSE1 only convert to FXOR to avoid scalarization.
47172   if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
47173     return DAG.getBitcast(MVT::v4i32,
47174                           DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32,
47175                                       DAG.getBitcast(MVT::v4f32, N0),
47176                                       DAG.getBitcast(MVT::v4f32, N1)));
47177   }
47178 
47179   if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
47180     return Cmp;
47181 
47182   if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
47183     return R;
47184 
47185   if (DCI.isBeforeLegalizeOps())
47186     return SDValue();
47187 
47188   if (SDValue SetCC = foldXor1SetCC(N, DAG))
47189     return SetCC;
47190 
47191   if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
47192     return RV;
47193 
47194   // Fold not(iX bitcast(vXi1)) -> (iX bitcast(not(vec))) for legal boolvecs.
47195   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47196   if (llvm::isAllOnesConstant(N1) && N0.getOpcode() == ISD::BITCAST &&
47197       N0.getOperand(0).getValueType().isVector() &&
47198       N0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
47199       TLI.isTypeLegal(N0.getOperand(0).getValueType()) && N0.hasOneUse()) {
47200     return DAG.getBitcast(VT, DAG.getNOT(SDLoc(N), N0.getOperand(0),
47201                                          N0.getOperand(0).getValueType()));
47202   }
47203 
47204   // Handle AVX512 mask widening.
47205   // Fold not(insert_subvector(undef,sub)) -> insert_subvector(undef,not(sub))
47206   if (ISD::isBuildVectorAllOnes(N1.getNode()) && VT.isVector() &&
47207       VT.getVectorElementType() == MVT::i1 &&
47208       N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.getOperand(0).isUndef() &&
47209       TLI.isTypeLegal(N0.getOperand(1).getValueType())) {
47210     return DAG.getNode(
47211         ISD::INSERT_SUBVECTOR, SDLoc(N), VT, N0.getOperand(0),
47212         DAG.getNOT(SDLoc(N), N0.getOperand(1), N0.getOperand(1).getValueType()),
47213         N0.getOperand(2));
47214   }
47215 
47216   // Fold xor(zext(xor(x,c1)),c2) -> xor(zext(x),xor(zext(c1),c2))
47217   // Fold xor(truncate(xor(x,c1)),c2) -> xor(truncate(x),xor(truncate(c1),c2))
47218   // TODO: Under what circumstances could this be performed in DAGCombine?
47219   if ((N0.getOpcode() == ISD::TRUNCATE || N0.getOpcode() == ISD::ZERO_EXTEND) &&
47220       N0.getOperand(0).getOpcode() == N->getOpcode()) {
47221     SDValue TruncExtSrc = N0.getOperand(0);
47222     auto *N1C = dyn_cast<ConstantSDNode>(N1);
47223     auto *N001C = dyn_cast<ConstantSDNode>(TruncExtSrc.getOperand(1));
47224     if (N1C && !N1C->isOpaque() && N001C && !N001C->isOpaque()) {
47225       SDLoc DL(N);
47226       SDValue LHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(0), DL, VT);
47227       SDValue RHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(1), DL, VT);
47228       return DAG.getNode(ISD::XOR, DL, VT, LHS,
47229                          DAG.getNode(ISD::XOR, DL, VT, RHS, N1));
47230     }
47231   }
47232 
47233   if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
47234     return FPLogic;
47235 
47236   return combineFneg(N, DAG, DCI, Subtarget);
47237 }
47238 
47239 static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG,
47240                             TargetLowering::DAGCombinerInfo &DCI,
47241                             const X86Subtarget &Subtarget) {
47242   EVT VT = N->getValueType(0);
47243   unsigned NumBits = VT.getSizeInBits();
47244 
47245   // TODO - Constant Folding.
47246 
47247   // Simplify the inputs.
47248   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47249   APInt DemandedMask(APInt::getAllOnesValue(NumBits));
47250   if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
47251     return SDValue(N, 0);
47252 
47253   return SDValue();
47254 }
47255 
47256 static bool isNullFPScalarOrVectorConst(SDValue V) {
47257   return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
47258 }
47259 
47260 /// If a value is a scalar FP zero or a vector FP zero (potentially including
47261 /// undefined elements), return a zero constant that may be used to fold away
47262 /// that value. In the case of a vector, the returned constant will not contain
47263 /// undefined elements even if the input parameter does. This makes it suitable
47264 /// to be used as a replacement operand with operations (eg, bitwise-and) where
47265 /// an undef should not propagate.
47266 static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
47267                                         const X86Subtarget &Subtarget) {
47268   if (!isNullFPScalarOrVectorConst(V))
47269     return SDValue();
47270 
47271   if (V.getValueType().isVector())
47272     return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
47273 
47274   return V;
47275 }
47276 
47277 static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
47278                                       const X86Subtarget &Subtarget) {
47279   SDValue N0 = N->getOperand(0);
47280   SDValue N1 = N->getOperand(1);
47281   EVT VT = N->getValueType(0);
47282   SDLoc DL(N);
47283 
47284   // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
47285   if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
47286         (VT == MVT::f64 && Subtarget.hasSSE2()) ||
47287         (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
47288     return SDValue();
47289 
47290   auto isAllOnesConstantFP = [](SDValue V) {
47291     if (V.getSimpleValueType().isVector())
47292       return ISD::isBuildVectorAllOnes(V.getNode());
47293     auto *C = dyn_cast<ConstantFPSDNode>(V);
47294     return C && C->getConstantFPValue()->isAllOnesValue();
47295   };
47296 
47297   // fand (fxor X, -1), Y --> fandn X, Y
47298   if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
47299     return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
47300 
47301   // fand X, (fxor Y, -1) --> fandn Y, X
47302   if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
47303     return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
47304 
47305   return SDValue();
47306 }
47307 
47308 /// Do target-specific dag combines on X86ISD::FAND nodes.
47309 static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
47310                            const X86Subtarget &Subtarget) {
47311   // FAND(0.0, x) -> 0.0
47312   if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
47313     return V;
47314 
47315   // FAND(x, 0.0) -> 0.0
47316   if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
47317     return V;
47318 
47319   if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
47320     return V;
47321 
47322   return lowerX86FPLogicOp(N, DAG, Subtarget);
47323 }
47324 
47325 /// Do target-specific dag combines on X86ISD::FANDN nodes.
47326 static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
47327                             const X86Subtarget &Subtarget) {
47328   // FANDN(0.0, x) -> x
47329   if (isNullFPScalarOrVectorConst(N->getOperand(0)))
47330     return N->getOperand(1);
47331 
47332   // FANDN(x, 0.0) -> 0.0
47333   if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
47334     return V;
47335 
47336   return lowerX86FPLogicOp(N, DAG, Subtarget);
47337 }
47338 
47339 /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
47340 static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
47341                           TargetLowering::DAGCombinerInfo &DCI,
47342                           const X86Subtarget &Subtarget) {
47343   assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
47344 
47345   // F[X]OR(0.0, x) -> x
47346   if (isNullFPScalarOrVectorConst(N->getOperand(0)))
47347     return N->getOperand(1);
47348 
47349   // F[X]OR(x, 0.0) -> x
47350   if (isNullFPScalarOrVectorConst(N->getOperand(1)))
47351     return N->getOperand(0);
47352 
47353   if (SDValue NewVal = combineFneg(N, DAG, DCI, Subtarget))
47354     return NewVal;
47355 
47356   return lowerX86FPLogicOp(N, DAG, Subtarget);
47357 }
47358 
47359 /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
47360 static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
47361   assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
47362 
47363   // FMIN/FMAX are commutative if no NaNs and no negative zeros are allowed.
47364   if (!DAG.getTarget().Options.NoNaNsFPMath ||
47365       !DAG.getTarget().Options.NoSignedZerosFPMath)
47366     return SDValue();
47367 
47368   // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
47369   // into FMINC and FMAXC, which are Commutative operations.
47370   unsigned NewOp = 0;
47371   switch (N->getOpcode()) {
47372     default: llvm_unreachable("unknown opcode");
47373     case X86ISD::FMIN:  NewOp = X86ISD::FMINC; break;
47374     case X86ISD::FMAX:  NewOp = X86ISD::FMAXC; break;
47375   }
47376 
47377   return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
47378                      N->getOperand(0), N->getOperand(1));
47379 }
47380 
47381 static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
47382                                      const X86Subtarget &Subtarget) {
47383   if (Subtarget.useSoftFloat())
47384     return SDValue();
47385 
47386   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47387 
47388   EVT VT = N->getValueType(0);
47389   if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
47390         (Subtarget.hasSSE2() && VT == MVT::f64) ||
47391         (VT.isVector() && TLI.isTypeLegal(VT))))
47392     return SDValue();
47393 
47394   SDValue Op0 = N->getOperand(0);
47395   SDValue Op1 = N->getOperand(1);
47396   SDLoc DL(N);
47397   auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
47398 
47399   // If we don't have to respect NaN inputs, this is a direct translation to x86
47400   // min/max instructions.
47401   if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
47402     return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
47403 
47404   // If one of the operands is known non-NaN use the native min/max instructions
47405   // with the non-NaN input as second operand.
47406   if (DAG.isKnownNeverNaN(Op1))
47407     return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
47408   if (DAG.isKnownNeverNaN(Op0))
47409     return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags());
47410 
47411   // If we have to respect NaN inputs, this takes at least 3 instructions.
47412   // Favor a library call when operating on a scalar and minimizing code size.
47413   if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize())
47414     return SDValue();
47415 
47416   EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
47417                                          VT);
47418 
47419   // There are 4 possibilities involving NaN inputs, and these are the required
47420   // outputs:
47421   //                   Op1
47422   //               Num     NaN
47423   //            ----------------
47424   //       Num  |  Max  |  Op0 |
47425   // Op0        ----------------
47426   //       NaN  |  Op1  |  NaN |
47427   //            ----------------
47428   //
47429   // The SSE FP max/min instructions were not designed for this case, but rather
47430   // to implement:
47431   //   Min = Op1 < Op0 ? Op1 : Op0
47432   //   Max = Op1 > Op0 ? Op1 : Op0
47433   //
47434   // So they always return Op0 if either input is a NaN. However, we can still
47435   // use those instructions for fmaxnum by selecting away a NaN input.
47436 
47437   // If either operand is NaN, the 2nd source operand (Op0) is passed through.
47438   SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
47439   SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
47440 
47441   // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
47442   // are NaN, the NaN value of Op1 is the result.
47443   return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
47444 }
47445 
47446 static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
47447                                    TargetLowering::DAGCombinerInfo &DCI) {
47448   EVT VT = N->getValueType(0);
47449   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47450 
47451   APInt KnownUndef, KnownZero;
47452   APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
47453   if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
47454                                      KnownZero, DCI))
47455     return SDValue(N, 0);
47456 
47457   // Convert a full vector load into vzload when not all bits are needed.
47458   SDValue In = N->getOperand(0);
47459   MVT InVT = In.getSimpleValueType();
47460   if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
47461       ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
47462     assert(InVT.is128BitVector() && "Expected 128-bit input vector");
47463     LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
47464     unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
47465     MVT MemVT = MVT::getIntegerVT(NumBits);
47466     MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
47467     if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
47468       SDLoc dl(N);
47469       SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
47470                                     DAG.getBitcast(InVT, VZLoad));
47471       DCI.CombineTo(N, Convert);
47472       DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
47473       DCI.recursivelyDeleteUnusedNodes(LN);
47474       return SDValue(N, 0);
47475     }
47476   }
47477 
47478   return SDValue();
47479 }
47480 
47481 static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
47482                                      TargetLowering::DAGCombinerInfo &DCI) {
47483   bool IsStrict = N->isTargetStrictFPOpcode();
47484   EVT VT = N->getValueType(0);
47485 
47486   // Convert a full vector load into vzload when not all bits are needed.
47487   SDValue In = N->getOperand(IsStrict ? 1 : 0);
47488   MVT InVT = In.getSimpleValueType();
47489   if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
47490       ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
47491     assert(InVT.is128BitVector() && "Expected 128-bit input vector");
47492     LoadSDNode *LN = cast<LoadSDNode>(In);
47493     unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
47494     MVT MemVT = MVT::getFloatingPointVT(NumBits);
47495     MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
47496     if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
47497       SDLoc dl(N);
47498       if (IsStrict) {
47499         SDValue Convert =
47500             DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
47501                         {N->getOperand(0), DAG.getBitcast(InVT, VZLoad)});
47502         DCI.CombineTo(N, Convert, Convert.getValue(1));
47503       } else {
47504         SDValue Convert =
47505             DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad));
47506         DCI.CombineTo(N, Convert);
47507       }
47508       DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
47509       DCI.recursivelyDeleteUnusedNodes(LN);
47510       return SDValue(N, 0);
47511     }
47512   }
47513 
47514   return SDValue();
47515 }
47516 
47517 /// Do target-specific dag combines on X86ISD::ANDNP nodes.
47518 static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
47519                             TargetLowering::DAGCombinerInfo &DCI,
47520                             const X86Subtarget &Subtarget) {
47521   MVT VT = N->getSimpleValueType(0);
47522 
47523   // ANDNP(0, x) -> x
47524   if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
47525     return N->getOperand(1);
47526 
47527   // ANDNP(x, 0) -> 0
47528   if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode()))
47529     return DAG.getConstant(0, SDLoc(N), VT);
47530 
47531   // Turn ANDNP back to AND if input is inverted.
47532   if (SDValue Not = IsNOT(N->getOperand(0), DAG))
47533     return DAG.getNode(ISD::AND, SDLoc(N), VT, DAG.getBitcast(VT, Not),
47534                        N->getOperand(1));
47535 
47536   // Attempt to recursively combine a bitmask ANDNP with shuffles.
47537   if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
47538     SDValue Op(N, 0);
47539     if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
47540       return Res;
47541   }
47542 
47543   return SDValue();
47544 }
47545 
47546 static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
47547                          TargetLowering::DAGCombinerInfo &DCI) {
47548   SDValue N1 = N->getOperand(1);
47549 
47550   // BT ignores high bits in the bit index operand.
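  // For example (illustrative), with a 32-bit bit index operand only the low
  // 5 bits are demanded here.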
47551   unsigned BitWidth = N1.getValueSizeInBits();
47552   APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
47553   if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(N1, DemandedMask, DCI)) {
47554     if (N->getOpcode() != ISD::DELETED_NODE)
47555       DCI.AddToWorklist(N);
47556     return SDValue(N, 0);
47557   }
47558 
47559   return SDValue();
47560 }
47561 
47562 static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG,
47563                                TargetLowering::DAGCombinerInfo &DCI) {
47564   bool IsStrict = N->getOpcode() == X86ISD::STRICT_CVTPH2PS;
47565   SDValue Src = N->getOperand(IsStrict ? 1 : 0);
47566 
47567   if (N->getValueType(0) == MVT::v4f32 && Src.getValueType() == MVT::v8i16) {
47568     APInt KnownUndef, KnownZero;
47569     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47570     APInt DemandedElts = APInt::getLowBitsSet(8, 4);
47571     if (TLI.SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
47572                                        DCI)) {
47573       if (N->getOpcode() != ISD::DELETED_NODE)
47574         DCI.AddToWorklist(N);
47575       return SDValue(N, 0);
47576     }
47577 
47578     // Convert a full vector load into vzload when not all bits are needed.
47579     if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
47580       LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(IsStrict ? 1 : 0));
47581       if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::i64, MVT::v2i64, DAG)) {
47582         SDLoc dl(N);
47583         if (IsStrict) {
47584           SDValue Convert = DAG.getNode(
47585               N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
47586               {N->getOperand(0), DAG.getBitcast(MVT::v8i16, VZLoad)});
47587           DCI.CombineTo(N, Convert, Convert.getValue(1));
47588         } else {
47589           SDValue Convert = DAG.getNode(N->getOpcode(), dl, MVT::v4f32,
47590                                         DAG.getBitcast(MVT::v8i16, VZLoad));
47591           DCI.CombineTo(N, Convert);
47592         }
47593 
47594         DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
47595         DCI.recursivelyDeleteUnusedNodes(LN);
47596         return SDValue(N, 0);
47597       }
47598     }
47599   }
47600 
47601   return SDValue();
47602 }
47603 
47604 // Try to combine sext_in_reg of a cmov of constants by extending the constants.
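// For example (illustrative):
//   (sext_in_reg (cmov C1, C2, cc), i8)
//     -> (cmov (sext_in_reg C1, i8), (sext_in_reg C2, i8), cc)
// so the extension folds into the constant operands.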
47605 static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
47606   assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
47607 
47608   EVT DstVT = N->getValueType(0);
47609 
47610   SDValue N0 = N->getOperand(0);
47611   SDValue N1 = N->getOperand(1);
47612   EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
47613 
47614   if (ExtraVT != MVT::i8 && ExtraVT != MVT::i16)
47615     return SDValue();
47616 
47617   // Look through single use any_extends / truncs.
47618   SDValue IntermediateBitwidthOp;
47619   if ((N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::TRUNCATE) &&
47620       N0.hasOneUse()) {
47621     IntermediateBitwidthOp = N0;
47622     N0 = N0.getOperand(0);
47623   }
47624 
47625   // See if we have a single use cmov.
47626   if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
47627     return SDValue();
47628 
47629   SDValue CMovOp0 = N0.getOperand(0);
47630   SDValue CMovOp1 = N0.getOperand(1);
47631 
47632   // Make sure both operands are constants.
47633   if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
47634       !isa<ConstantSDNode>(CMovOp1.getNode()))
47635     return SDValue();
47636 
47637   SDLoc DL(N);
47638 
47639   // If we looked through an any_extend/trunc above, add one to the constants.
47640   if (IntermediateBitwidthOp) {
47641     unsigned IntermediateOpc = IntermediateBitwidthOp.getOpcode();
47642     CMovOp0 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp0);
47643     CMovOp1 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp1);
47644   }
47645 
47646   CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp0, N1);
47647   CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp1, N1);
47648 
47649   EVT CMovVT = DstVT;
47650   // We do not want i16 CMOVs. Promote to i32 and truncate afterwards.
47651   if (DstVT == MVT::i16) {
47652     CMovVT = MVT::i32;
47653     CMovOp0 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp0);
47654     CMovOp1 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp1);
47655   }
47656 
47657   SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, CMovVT, CMovOp0, CMovOp1,
47658                              N0.getOperand(2), N0.getOperand(3));
47659 
47660   if (CMovVT != DstVT)
47661     CMov = DAG.getNode(ISD::TRUNCATE, DL, DstVT, CMov);
47662 
47663   return CMov;
47664 }
47665 
47666 static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
47667                                       const X86Subtarget &Subtarget) {
47668   assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
47669 
47670   if (SDValue V = combineSextInRegCmov(N, DAG))
47671     return V;
47672 
47673   EVT VT = N->getValueType(0);
47674   SDValue N0 = N->getOperand(0);
47675   SDValue N1 = N->getOperand(1);
47676   EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
47677   SDLoc dl(N);
47678 
47679   // SIGN_EXTEND_INREG to v4i64 is an expensive operation on both SSE and
47680   // AVX2 since there is no sign-extended shift-right operation on a vector
47681   // with 64-bit elements.
47682   // (sext_in_reg (v4i64 anyext (v4i32 x)), ExtraVT) ->
47683   // (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))
47684   if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
47685                            N0.getOpcode() == ISD::SIGN_EXTEND)) {
47686     SDValue N00 = N0.getOperand(0);
47687 
47688     // EXTLOAD has a better solution on AVX2,
47689     // it may be replaced with X86ISD::VSEXT node.
47690     if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
47691       if (!ISD::isNormalLoad(N00.getNode()))
47692         return SDValue();
47693 
47694     // Attempt to promote any comparison mask ops before moving the
47695     // SIGN_EXTEND_INREG in the way.
47696     if (SDValue Promote = PromoteMaskArithmetic(N0.getNode(), DAG, Subtarget))
47697       return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Promote, N1);
47698 
47699     if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
47700       SDValue Tmp =
47701           DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1);
47702       return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
47703     }
47704   }
47705   return SDValue();
47706 }
47707 
47708 /// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
47709 /// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
47710 /// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
47711 /// opportunities to combine math ops, use an LEA, or use a complex addressing
47712 /// mode. This can eliminate extend, add, and shift instructions.
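/// For example (illustrative): (i64 sext (i32 add nsw X, 42)) becomes
/// (i64 add nsw (sext X), 42), and the new add can then participate in an LEA
/// or a complex addressing mode.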
47713 static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
47714                                    const X86Subtarget &Subtarget) {
47715   if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
47716       Ext->getOpcode() != ISD::ZERO_EXTEND)
47717     return SDValue();
47718 
47719   // TODO: This should be valid for other integer types.
47720   EVT VT = Ext->getValueType(0);
47721   if (VT != MVT::i64)
47722     return SDValue();
47723 
47724   SDValue Add = Ext->getOperand(0);
47725   if (Add.getOpcode() != ISD::ADD)
47726     return SDValue();
47727 
47728   bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
47729   bool NSW = Add->getFlags().hasNoSignedWrap();
47730   bool NUW = Add->getFlags().hasNoUnsignedWrap();
47731 
47732   // We need an 'add nsw' feeding into the 'sext' or an 'add nuw' feeding
47733   // into the 'zext'.
47734   if ((Sext && !NSW) || (!Sext && !NUW))
47735     return SDValue();
47736 
47737   // Having a constant operand to the 'add' ensures that we are not increasing
47738   // the instruction count because the constant is extended for free below.
47739   // A constant operand can also become the displacement field of an LEA.
47740   auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
47741   if (!AddOp1)
47742     return SDValue();
47743 
47744   // Don't make the 'add' bigger if there's no hope of combining it with some
47745   // other 'add' or 'shl' instruction.
47746   // TODO: It may be profitable to generate simpler LEA instructions in place
47747   // of single 'add' instructions, but the cost model for selecting an LEA
47748   // currently has a high threshold.
47749   bool HasLEAPotential = false;
47750   for (auto *User : Ext->uses()) {
47751     if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
47752       HasLEAPotential = true;
47753       break;
47754     }
47755   }
47756   if (!HasLEAPotential)
47757     return SDValue();
47758 
47759   // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
47760   int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
47761   SDValue AddOp0 = Add.getOperand(0);
47762   SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
47763   SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
47764 
47765   // The wider add is guaranteed to not wrap because both operands are
47766   // sign-extended.
47767   SDNodeFlags Flags;
47768   Flags.setNoSignedWrap(NSW);
47769   Flags.setNoUnsignedWrap(NUW);
47770   return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
47771 }
47772 
47773 // If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
47774 // operands and the result of CMOV is not used anywhere else - promote CMOV
47775 // itself instead of promoting its result. This could be beneficial, because:
47776 //     1) X86TargetLowering::EmitLoweredSelect later can do merging of two
47777 //        (or more) pseudo-CMOVs only when they go one-after-another and
47778 //        getting rid of result extension code after CMOV will help that.
47779 //     2) Promotion of constant CMOV arguments is free, hence the
47780 //        {ANY,SIGN,ZERO}_EXTEND will just be deleted.
47781 //     3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3-byte, so this
47782 //        promotion is also good in terms of code-size.
47783 //        (64-bit CMOV is 4-bytes, that's why we don't do 32-bit => 64-bit
47784 //         promotion).
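//
// For example, (zext i32 (X86ISD::CMOV i16 C1, C2, cc, eflags)) can become
// (X86ISD::CMOV i32 C1', C2', cc, eflags) with widened constants C1'/C2',
// so the extension instruction disappears entirely.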
47785 static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
47786   SDValue CMovN = Extend->getOperand(0);
47787   if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse())
47788     return SDValue();
47789 
47790   EVT TargetVT = Extend->getValueType(0);
47791   unsigned ExtendOpcode = Extend->getOpcode();
47792   SDLoc DL(Extend);
47793 
47794   EVT VT = CMovN.getValueType();
47795   SDValue CMovOp0 = CMovN.getOperand(0);
47796   SDValue CMovOp1 = CMovN.getOperand(1);
47797 
47798   if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
47799       !isa<ConstantSDNode>(CMovOp1.getNode()))
47800     return SDValue();
47801 
47802   // Only extend to i32 or i64.
47803   if (TargetVT != MVT::i32 && TargetVT != MVT::i64)
47804     return SDValue();
47805 
47806   // Only extend from i16 unless it's a sign_extend from i32. Zext/aext from i32
47807   // are free.
47808   if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32))
47809     return SDValue();
47810 
47811   // If this is a zero extend to i64, we should only extend to i32 and use a free
47812   // zero extend to finish.
47813   EVT ExtendVT = TargetVT;
47814   if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND)
47815     ExtendVT = MVT::i32;
47816 
47817   CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0);
47818   CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1);
47819 
47820   SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1,
47821                             CMovN.getOperand(2), CMovN.getOperand(3));
47822 
47823   // Finish extending if needed.
47824   if (ExtendVT != TargetVT)
47825     Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res);
47826 
47827   return Res;
47828 }
47829 
47830 // Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
47831 // This is more or less the reverse of combineBitcastvxi1.
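// For example, (v8i16 zext (v8i1 bitcast (i8 %mask))) is rebuilt by
// broadcasting %mask to every vector element, AND'ing each element with its
// bit (1, 2, 4, ..., 128), comparing against that bitmask, and sign- or
// zero-extending the compare result into the final elements.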
47832 static SDValue
47833 combineToExtendBoolVectorInReg(SDNode *N, SelectionDAG &DAG,
47834                                TargetLowering::DAGCombinerInfo &DCI,
47835                                const X86Subtarget &Subtarget) {
47836   unsigned Opcode = N->getOpcode();
47837   if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
47838       Opcode != ISD::ANY_EXTEND)
47839     return SDValue();
47840   if (!DCI.isBeforeLegalizeOps())
47841     return SDValue();
47842   if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
47843     return SDValue();
47844 
47845   SDValue N0 = N->getOperand(0);
47846   EVT VT = N->getValueType(0);
47847   EVT SVT = VT.getScalarType();
47848   EVT InSVT = N0.getValueType().getScalarType();
47849   unsigned EltSizeInBits = SVT.getSizeInBits();
47850 
47851   // Input type must be extending a bool vector (bit-casted from a scalar
47852   // integer) to legal integer types.
47853   if (!VT.isVector())
47854     return SDValue();
47855   if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
47856     return SDValue();
47857   if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
47858     return SDValue();
47859 
47860   SDValue N00 = N0.getOperand(0);
47861   EVT SclVT = N0.getOperand(0).getValueType();
47862   if (!SclVT.isScalarInteger())
47863     return SDValue();
47864 
47865   SDLoc DL(N);
47866   SDValue Vec;
47867   SmallVector<int, 32> ShuffleMask;
47868   unsigned NumElts = VT.getVectorNumElements();
47869   assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
47870 
47871   // Broadcast the scalar integer to the vector elements.
47872   if (NumElts > EltSizeInBits) {
47873     // If the scalar integer is greater than the vector element size, then we
47874     // must split it down into sub-sections for broadcasting. For example:
47875     //   i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
47876     //   i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
47877     assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
47878     unsigned Scale = NumElts / EltSizeInBits;
47879     EVT BroadcastVT =
47880         EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
47881     Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
47882     Vec = DAG.getBitcast(VT, Vec);
47883 
47884     for (unsigned i = 0; i != Scale; ++i)
47885       ShuffleMask.append(EltSizeInBits, i);
47886     Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
47887   } else if (Subtarget.hasAVX2() && NumElts < EltSizeInBits &&
47888              (SclVT == MVT::i8 || SclVT == MVT::i16 || SclVT == MVT::i32)) {
47889     // If we have register broadcast instructions, use the scalar size as the
47890     // element type for the shuffle. Then cast to the wider element type. The
47891     // widened bits won't be used, and this might allow the use of a broadcast
47892     // load.
47893     assert((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale");
47894     unsigned Scale = EltSizeInBits / NumElts;
47895     EVT BroadcastVT =
47896         EVT::getVectorVT(*DAG.getContext(), SclVT, NumElts * Scale);
47897     Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
47898     ShuffleMask.append(NumElts * Scale, 0);
47899     Vec = DAG.getVectorShuffle(BroadcastVT, DL, Vec, Vec, ShuffleMask);
47900     Vec = DAG.getBitcast(VT, Vec);
47901   } else {
47902     // For smaller scalar integers, we can simply any-extend the scalar to the
47903     // vector element size (we don't care about the upper bits) and broadcast it
47904     // to all elements.
47905     SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
47906     Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
47907     ShuffleMask.append(NumElts, 0);
47908     Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
47909   }
47910 
47911   // Now, mask the relevant bit in each element.
47912   SmallVector<SDValue, 32> Bits;
47913   for (unsigned i = 0; i != NumElts; ++i) {
47914     int BitIdx = (i % EltSizeInBits);
47915     APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
47916     Bits.push_back(DAG.getConstant(Bit, DL, SVT));
47917   }
47918   SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
47919   Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
47920 
47921   // Compare against the bitmask and extend the result.
47922   EVT CCVT = VT.changeVectorElementType(MVT::i1);
47923   Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
47924   Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
47925 
47926   // For SEXT, this is now done, otherwise shift the result down for
47927   // zero-extension.
47928   if (Opcode == ISD::SIGN_EXTEND)
47929     return Vec;
47930   return DAG.getNode(ISD::SRL, DL, VT, Vec,
47931                      DAG.getConstant(EltSizeInBits - 1, DL, VT));
47932 }
47933 
47934 // Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm
47935 // result type.
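// For example, on AVX512 targets (v4i32 sext (setcc v4i32 %a, %b, setgt)) can
// be rebuilt as a single v4i32 setcc, which typically lowers to VPCMPGTD and
// produces the sign-extended mask directly in an XMM register rather than
// going through a mask register and a separate extend.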
47936 static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
47937                                const X86Subtarget &Subtarget) {
47938   SDValue N0 = N->getOperand(0);
47939   EVT VT = N->getValueType(0);
47940   SDLoc dl(N);
47941 
47942   // Only do this combine with AVX512 for vector extends.
47943   if (!Subtarget.hasAVX512() || !VT.isVector() || N0.getOpcode() != ISD::SETCC)
47944     return SDValue();
47945 
47946   // Only combine legal element types.
47947   EVT SVT = VT.getVectorElementType();
47948   if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
47949       SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
47950     return SDValue();
47951 
47952   // We can only do this if the vector size is 256 bits or less.
47953   unsigned Size = VT.getSizeInBits();
47954   if (Size > 256 && Subtarget.useAVX512Regs())
47955     return SDValue();
47956 
47957   // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
47958   // those are the only integer compares we have.
47959   ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
47960   if (ISD::isUnsignedIntSetCC(CC))
47961     return SDValue();
47962 
47963   // Only do this combine if the extension will be fully consumed by the setcc.
47964   EVT N00VT = N0.getOperand(0).getValueType();
47965   EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
47966   if (Size != MatchingVecType.getSizeInBits())
47967     return SDValue();
47968 
47969   SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
47970 
47971   if (N->getOpcode() == ISD::ZERO_EXTEND)
47972     Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType());
47973 
47974   return Res;
47975 }
47976 
47977 static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
47978                            TargetLowering::DAGCombinerInfo &DCI,
47979                            const X86Subtarget &Subtarget) {
47980   SDValue N0 = N->getOperand(0);
47981   EVT VT = N->getValueType(0);
47982   SDLoc DL(N);
47983 
47984   // (i32 (sext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
47985   if (!DCI.isBeforeLegalizeOps() &&
47986       N0.getOpcode() == X86ISD::SETCC_CARRY) {
47987     SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, N0->getOperand(0),
47988                                  N0->getOperand(1));
47989     bool ReplaceOtherUses = !N0.hasOneUse();
47990     DCI.CombineTo(N, Setcc);
47991     // Replace other uses with a truncate of the widened setcc_carry.
47992     if (ReplaceOtherUses) {
47993       SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
47994                                   N0.getValueType(), Setcc);
47995       DCI.CombineTo(N0.getNode(), Trunc);
47996     }
47997 
47998     return SDValue(N, 0);
47999   }
48000 
48001   if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
48002     return NewCMov;
48003 
48004   if (!DCI.isBeforeLegalizeOps())
48005     return SDValue();
48006 
48007   if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
48008     return V;
48009 
48010   if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
48011     return V;
48012 
48013   if (VT.isVector()) {
48014     if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
48015       return R;
48016 
48017     if (N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG)
48018       return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0));
48019   }
48020 
48021   if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
48022     return NewAdd;
48023 
48024   return SDValue();
48025 }
48026 
48027 static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
48028                           TargetLowering::DAGCombinerInfo &DCI,
48029                           const X86Subtarget &Subtarget) {
48030   SDLoc dl(N);
48031   EVT VT = N->getValueType(0);
48032   bool IsStrict = N->isStrictFPOpcode() || N->isTargetStrictFPOpcode();
48033 
48034   // Let legalize expand this if it isn't a legal type yet.
48035   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48036   if (!TLI.isTypeLegal(VT))
48037     return SDValue();
48038 
48039   SDValue A = N->getOperand(IsStrict ? 1 : 0);
48040   SDValue B = N->getOperand(IsStrict ? 2 : 1);
48041   SDValue C = N->getOperand(IsStrict ? 3 : 2);
48042 
48043   // If the operation allows fast-math and the target does not support FMA,
48044   // split this into mul+add to avoid libcall(s).
48045   SDNodeFlags Flags = N->getFlags();
48046   if (!IsStrict && Flags.hasAllowReassociation() &&
48047       TLI.isOperationExpand(ISD::FMA, VT)) {
48048     SDValue Fmul = DAG.getNode(ISD::FMUL, dl, VT, A, B, Flags);
48049     return DAG.getNode(ISD::FADD, dl, VT, Fmul, C, Flags);
48050   }
48051 
48052   EVT ScalarVT = VT.getScalarType();
48053   if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA())
48054     return SDValue();
48055 
48056   auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) {
48057     bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
48058     bool LegalOperations = !DCI.isBeforeLegalizeOps();
48059     if (SDValue NegV = TLI.getCheaperNegatedExpression(V, DAG, LegalOperations,
48060                                                        CodeSize)) {
48061       V = NegV;
48062       return true;
48063     }
48064     // Look through extract_vector_elts. If it comes from an FNEG, create a
48065     // new extract from the FNEG input.
48066     if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
48067         isNullConstant(V.getOperand(1))) {
48068       SDValue Vec = V.getOperand(0);
48069       if (SDValue NegV = TLI.getCheaperNegatedExpression(
48070               Vec, DAG, LegalOperations, CodeSize)) {
48071         V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
48072                         NegV, V.getOperand(1));
48073         return true;
48074       }
48075     }
48076 
48077     return false;
48078   };
48079 
48080   // Do not convert the passthru input of scalar intrinsics.
48081   // FIXME: We could allow negations of the lower element only.
48082   bool NegA = invertIfNegative(A);
48083   bool NegB = invertIfNegative(B);
48084   bool NegC = invertIfNegative(C);
48085 
48086   if (!NegA && !NegB && !NegC)
48087     return SDValue();
48088 
48089   unsigned NewOpcode =
48090       negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false);
48091 
48092   // Propagate fast-math-flags to new FMA node.
48093   SelectionDAG::FlagInserter FlagsInserter(DAG, Flags);
48094   if (IsStrict) {
48095     assert(N->getNumOperands() == 4 && "Shouldn't be greater than 4");
48096     return DAG.getNode(NewOpcode, dl, {VT, MVT::Other},
48097                        {N->getOperand(0), A, B, C});
48098   } else {
48099     if (N->getNumOperands() == 4)
48100       return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
48101     return DAG.getNode(NewOpcode, dl, VT, A, B, C);
48102   }
48103 }
48104 
48105 // Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
48106 // Combine FMSUBADD(A, B, FNEG(C)) -> FMADDSUB(A, B, C)
48107 static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
48108                                TargetLowering::DAGCombinerInfo &DCI) {
48109   SDLoc dl(N);
48110   EVT VT = N->getValueType(0);
48111   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48112   bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
48113   bool LegalOperations = !DCI.isBeforeLegalizeOps();
48114 
48115   SDValue N2 = N->getOperand(2);
48116 
48117   SDValue NegN2 =
48118       TLI.getCheaperNegatedExpression(N2, DAG, LegalOperations, CodeSize);
48119   if (!NegN2)
48120     return SDValue();
48121   unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false);
48122 
48123   if (N->getNumOperands() == 4)
48124     return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
48125                        NegN2, N->getOperand(3));
48126   return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
48127                      NegN2);
48128 }
48129 
48130 static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
48131                            TargetLowering::DAGCombinerInfo &DCI,
48132                            const X86Subtarget &Subtarget) {
48133   SDLoc dl(N);
48134   SDValue N0 = N->getOperand(0);
48135   EVT VT = N->getValueType(0);
48136 
48137   // (i32 (aext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
48138   // FIXME: Is this needed? We don't seem to have any tests for it.
48139   if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ANY_EXTEND &&
48140       N0.getOpcode() == X86ISD::SETCC_CARRY) {
48141     SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N0->getOperand(0),
48142                                  N0->getOperand(1));
48143     bool ReplaceOtherUses = !N0.hasOneUse();
48144     DCI.CombineTo(N, Setcc);
48145     // Replace other uses with a truncate of the widened setcc_carry.
48146     if (ReplaceOtherUses) {
48147       SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
48148                                   N0.getValueType(), Setcc);
48149       DCI.CombineTo(N0.getNode(), Trunc);
48150     }
48151 
48152     return SDValue(N, 0);
48153   }
48154 
48155   if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
48156     return NewCMov;
48157 
48158   if (DCI.isBeforeLegalizeOps())
48159     if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
48160       return V;
48161 
48162   if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
48163     return V;
48164 
48165   if (VT.isVector())
48166     if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
48167       return R;
48168 
48169   if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
48170     return NewAdd;
48171 
48172   if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
48173     return R;
48174 
48175   // TODO: Combine with any target/faux shuffle.
48176   if (N0.getOpcode() == X86ISD::PACKUS && N0.getValueSizeInBits() == 128 &&
48177       VT.getScalarSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits()) {
48178     SDValue N00 = N0.getOperand(0);
48179     SDValue N01 = N0.getOperand(1);
48180     unsigned NumSrcEltBits = N00.getScalarValueSizeInBits();
48181     APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2);
48182     if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) &&
48183         (N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) {
48184       return concatSubVectors(N00, N01, DAG, dl);
48185     }
48186   }
48187 
48188   return SDValue();
48189 }
48190 
48191 /// Recursive helper for combineVectorSizedSetCCEquality() to see if we have a
48192 /// recognizable memcmp expansion.
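/// For example, (or (or (xor a, b), (xor c, d)), (xor e, f)) is recognized,
/// while a bare (xor a, b) at the root is rejected.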
48193 static bool isOrXorXorTree(SDValue X, bool Root = true) {
48194   if (X.getOpcode() == ISD::OR)
48195     return isOrXorXorTree(X.getOperand(0), false) &&
48196            isOrXorXorTree(X.getOperand(1), false);
48197   if (Root)
48198     return false;
48199   return X.getOpcode() == ISD::XOR;
48200 }
48201 
48202 /// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp
48203 /// expansion.
48204 template<typename F>
48205 static SDValue emitOrXorXorTree(SDValue X, SDLoc &DL, SelectionDAG &DAG,
48206                                 EVT VecVT, EVT CmpVT, bool HasPT, F SToV) {
48207   SDValue Op0 = X.getOperand(0);
48208   SDValue Op1 = X.getOperand(1);
48209   if (X.getOpcode() == ISD::OR) {
48210     SDValue A = emitOrXorXorTree(Op0, DL, DAG, VecVT, CmpVT, HasPT, SToV);
48211     SDValue B = emitOrXorXorTree(Op1, DL, DAG, VecVT, CmpVT, HasPT, SToV);
48212     if (VecVT != CmpVT)
48213       return DAG.getNode(ISD::OR, DL, CmpVT, A, B);
48214     if (HasPT)
48215       return DAG.getNode(ISD::OR, DL, VecVT, A, B);
48216     return DAG.getNode(ISD::AND, DL, CmpVT, A, B);
48217   } else if (X.getOpcode() == ISD::XOR) {
48218     SDValue A = SToV(Op0);
48219     SDValue B = SToV(Op1);
48220     if (VecVT != CmpVT)
48221       return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETNE);
48222     if (HasPT)
48223       return DAG.getNode(ISD::XOR, DL, VecVT, A, B);
48224     return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
48225   }
48226   llvm_unreachable("Impossible");
48227 }
48228 
48229 /// Try to map a 128-bit or larger integer comparison to vector instructions
48230 /// before type legalization splits it up into chunks.
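/// For example, a 16-byte memcmp-equality pattern such as
///   setcc i128 (load %p), (load %q), eq
/// can be lowered with PCMPEQB + PMOVMSKB + CMP $0xFFFF (or PTEST on SSE4.1)
/// instead of a chain of scalar compares after type legalization.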
48231 static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
48232                                                const X86Subtarget &Subtarget) {
48233   ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
48234   assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
48235 
48236   // We're looking for an oversized integer equality comparison.
48237   SDValue X = SetCC->getOperand(0);
48238   SDValue Y = SetCC->getOperand(1);
48239   EVT OpVT = X.getValueType();
48240   unsigned OpSize = OpVT.getSizeInBits();
48241   if (!OpVT.isScalarInteger() || OpSize < 128)
48242     return SDValue();
48243 
48244   // Ignore a comparison with zero because that gets special treatment in
48245   // EmitTest(). But make an exception for the special case of a pair of
48246   // logically-combined vector-sized operands compared to zero. This pattern may
48247   // be generated by the memcmp expansion pass with oversized integer compares
48248   // (see PR33325).
48249   bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X);
48250   if (isNullConstant(Y) && !IsOrXorXorTreeCCZero)
48251     return SDValue();
48252 
48253   // Don't perform this combine if constructing the vector will be expensive.
48254   auto IsVectorBitCastCheap = [](SDValue X) {
48255     X = peekThroughBitcasts(X);
48256     return isa<ConstantSDNode>(X) || X.getValueType().isVector() ||
48257            X.getOpcode() == ISD::LOAD;
48258   };
48259   if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) &&
48260       !IsOrXorXorTreeCCZero)
48261     return SDValue();
48262 
48263   EVT VT = SetCC->getValueType(0);
48264   SDLoc DL(SetCC);
48265 
48266   // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
48267   // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
48268   // Otherwise use PCMPEQ (plus AND) and mask testing.
48269   if ((OpSize == 128 && Subtarget.hasSSE2()) ||
48270       (OpSize == 256 && Subtarget.hasAVX()) ||
48271       (OpSize == 512 && Subtarget.useAVX512Regs())) {
48272     bool HasPT = Subtarget.hasSSE41();
48273 
48274     // PTEST and MOVMSK are slow on Knights Landing and Knights Mill and widened
48275     // vector registers are essentially free. (Technically, widening registers
48276     // prevents load folding, but the tradeoff is worth it.)
48277     bool PreferKOT = Subtarget.preferMaskRegisters();
48278     bool NeedZExt = PreferKOT && !Subtarget.hasVLX() && OpSize != 512;
48279 
48280     EVT VecVT = MVT::v16i8;
48281     EVT CmpVT = PreferKOT ? MVT::v16i1 : VecVT;
48282     if (OpSize == 256) {
48283       VecVT = MVT::v32i8;
48284       CmpVT = PreferKOT ? MVT::v32i1 : VecVT;
48285     }
48286     EVT CastVT = VecVT;
48287     bool NeedsAVX512FCast = false;
48288     if (OpSize == 512 || NeedZExt) {
48289       if (Subtarget.hasBWI()) {
48290         VecVT = MVT::v64i8;
48291         CmpVT = MVT::v64i1;
48292         if (OpSize == 512)
48293           CastVT = VecVT;
48294       } else {
48295         VecVT = MVT::v16i32;
48296         CmpVT = MVT::v16i1;
48297         CastVT = OpSize == 512 ? VecVT :
48298                  OpSize == 256 ? MVT::v8i32 : MVT::v4i32;
48299         NeedsAVX512FCast = true;
48300       }
48301     }
48302 
48303     auto ScalarToVector = [&](SDValue X) -> SDValue {
48304       bool TmpZext = false;
48305       EVT TmpCastVT = CastVT;
48306       if (X.getOpcode() == ISD::ZERO_EXTEND) {
48307         SDValue OrigX = X.getOperand(0);
48308         unsigned OrigSize = OrigX.getScalarValueSizeInBits();
48309         if (OrigSize < OpSize) {
48310           if (OrigSize == 128) {
48311             TmpCastVT = NeedsAVX512FCast ? MVT::v4i32 : MVT::v16i8;
48312             X = OrigX;
48313             TmpZext = true;
48314           } else if (OrigSize == 256) {
48315             TmpCastVT = NeedsAVX512FCast ? MVT::v8i32 : MVT::v32i8;
48316             X = OrigX;
48317             TmpZext = true;
48318           }
48319         }
48320       }
48321       X = DAG.getBitcast(TmpCastVT, X);
48322       if (!NeedZExt && !TmpZext)
48323         return X;
48324       return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT,
48325                          DAG.getConstant(0, DL, VecVT), X,
48326                          DAG.getVectorIdxConstant(0, DL));
48327     };
48328 
48329     SDValue Cmp;
48330     if (IsOrXorXorTreeCCZero) {
48331       // This is a bitwise-combined equality comparison of 2 pairs of vectors:
48332       // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
48333       // Use 2 vector equality compares and 'and' the results before doing a
48334       // MOVMSK.
48335       Cmp = emitOrXorXorTree(X, DL, DAG, VecVT, CmpVT, HasPT, ScalarToVector);
48336     } else {
48337       SDValue VecX = ScalarToVector(X);
48338       SDValue VecY = ScalarToVector(Y);
48339       if (VecVT != CmpVT) {
48340         Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETNE);
48341       } else if (HasPT) {
48342         Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY);
48343       } else {
48344         Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
48345       }
48346     }
48347     // AVX512 should emit a setcc that will lower to kortest.
48348     if (VecVT != CmpVT) {
48349       EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64 :
48350                    CmpVT == MVT::v32i1 ? MVT::i32 : MVT::i16;
48351       return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp),
48352                           DAG.getConstant(0, DL, KRegVT), CC);
48353     }
48354     if (HasPT) {
48355       SDValue BCCmp = DAG.getBitcast(OpSize == 256 ? MVT::v4i64 : MVT::v2i64,
48356                                      Cmp);
48357       SDValue PT = DAG.getNode(X86ISD::PTEST, DL, MVT::i32, BCCmp, BCCmp);
48358       X86::CondCode X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
48359       SDValue X86SetCC = getSETCC(X86CC, PT, DL, DAG);
48360       return DAG.getNode(ISD::TRUNCATE, DL, VT, X86SetCC.getValue(0));
48361     }
48362     // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
48363     // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
48364     // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
48365     assert(Cmp.getValueType() == MVT::v16i8 &&
48366            "Non 128-bit vector on pre-SSE41 target");
48367     SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
48368     SDValue FFFFs = DAG.getConstant(0xFFFF, DL, MVT::i32);
48369     return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
48370   }
48371 
48372   return SDValue();
48373 }
48374 
48375 static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
48376                             TargetLowering::DAGCombinerInfo &DCI,
48377                             const X86Subtarget &Subtarget) {
48378   const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
48379   const SDValue LHS = N->getOperand(0);
48380   const SDValue RHS = N->getOperand(1);
48381   EVT VT = N->getValueType(0);
48382   EVT OpVT = LHS.getValueType();
48383   SDLoc DL(N);
48384 
48385   if (CC == ISD::SETNE || CC == ISD::SETEQ) {
48386     if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget))
48387       return V;
48388 
48389     if (VT == MVT::i1 && isNullConstant(RHS)) {
48390       SDValue X86CC;
48391       if (SDValue V =
48392               MatchVectorAllZeroTest(LHS, CC, DL, Subtarget, DAG, X86CC))
48393         return DAG.getNode(ISD::TRUNCATE, DL, VT,
48394                            DAG.getNode(X86ISD::SETCC, DL, MVT::i8, X86CC, V));
48395     }
48396 
48397     if (OpVT.isScalarInteger()) {
48398       // cmpeq(or(X,Y),X) --> cmpeq(and(~X,Y),0)
48399       // cmpne(or(X,Y),X) --> cmpne(and(~X,Y),0)
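      // This holds because (X|Y) == X exactly when Y has no bits set outside
      // X, i.e. (~X & Y) == 0; on BMI targets the AND-NOT can then be selected
      // as a single flag-setting ANDN.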
48400       auto MatchOrCmpEq = [&](SDValue N0, SDValue N1) {
48401         if (N0.getOpcode() == ISD::OR && N0->hasOneUse()) {
48402           if (N0.getOperand(0) == N1)
48403             return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
48404                                N0.getOperand(1));
48405           if (N0.getOperand(1) == N1)
48406             return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
48407                                N0.getOperand(0));
48408         }
48409         return SDValue();
48410       };
48411       if (SDValue AndN = MatchOrCmpEq(LHS, RHS))
48412         return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
48413       if (SDValue AndN = MatchOrCmpEq(RHS, LHS))
48414         return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
48415 
48416       // cmpeq(and(X,Y),Y) --> cmpeq(and(~X,Y),0)
48417       // cmpne(and(X,Y),Y) --> cmpne(and(~X,Y),0)
48418       auto MatchAndCmpEq = [&](SDValue N0, SDValue N1) {
48419         if (N0.getOpcode() == ISD::AND && N0->hasOneUse()) {
48420           if (N0.getOperand(0) == N1)
48421             return DAG.getNode(ISD::AND, DL, OpVT, N1,
48422                                DAG.getNOT(DL, N0.getOperand(1), OpVT));
48423           if (N0.getOperand(1) == N1)
48424             return DAG.getNode(ISD::AND, DL, OpVT, N1,
48425                                DAG.getNOT(DL, N0.getOperand(0), OpVT));
48426         }
48427         return SDValue();
48428       };
48429       if (SDValue AndN = MatchAndCmpEq(LHS, RHS))
48430         return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
48431       if (SDValue AndN = MatchAndCmpEq(RHS, LHS))
48432         return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
48433 
48434       // cmpeq(trunc(x),0) --> cmpeq(x,0)
48435       // cmpne(trunc(x),0) --> cmpne(x,0)
48436       // iff x upper bits are zero.
48437       // TODO: Add support for RHS to be truncate as well?
48438       if (LHS.getOpcode() == ISD::TRUNCATE &&
48439           LHS.getOperand(0).getScalarValueSizeInBits() >= 32 &&
48440           isNullConstant(RHS) && !DCI.isBeforeLegalize()) {
48441         EVT SrcVT = LHS.getOperand(0).getValueType();
48442         APInt UpperBits = APInt::getBitsSetFrom(SrcVT.getScalarSizeInBits(),
48443                                                 OpVT.getScalarSizeInBits());
48444         const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48445         if (DAG.MaskedValueIsZero(LHS.getOperand(0), UpperBits) &&
48446             TLI.isTypeLegal(LHS.getOperand(0).getValueType()))
48447           return DAG.getSetCC(DL, VT, LHS.getOperand(0),
48448                               DAG.getConstant(0, DL, SrcVT), CC);
48449       }
48450     }
48451   }
48452 
48453   if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
48454       (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
48455     // Using temporaries to avoid messing up operand ordering for later
48456     // transformations if this doesn't work.
48457     SDValue Op0 = LHS;
48458     SDValue Op1 = RHS;
48459     ISD::CondCode TmpCC = CC;
48460     // Put build_vector on the right.
48461     if (Op0.getOpcode() == ISD::BUILD_VECTOR) {
48462       std::swap(Op0, Op1);
48463       TmpCC = ISD::getSetCCSwappedOperands(TmpCC);
48464     }
48465 
48466     bool IsSEXT0 =
48467         (Op0.getOpcode() == ISD::SIGN_EXTEND) &&
48468         (Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
48469     bool IsVZero1 = ISD::isBuildVectorAllZeros(Op1.getNode());
48470 
48471     if (IsSEXT0 && IsVZero1) {
48472       assert(VT == Op0.getOperand(0).getValueType() &&
48473              "Unexpected operand type");
48474       if (TmpCC == ISD::SETGT)
48475         return DAG.getConstant(0, DL, VT);
48476       if (TmpCC == ISD::SETLE)
48477         return DAG.getConstant(1, DL, VT);
48478       if (TmpCC == ISD::SETEQ || TmpCC == ISD::SETGE)
48479         return DAG.getNOT(DL, Op0.getOperand(0), VT);
48480 
48481       assert((TmpCC == ISD::SETNE || TmpCC == ISD::SETLT) &&
48482              "Unexpected condition code!");
48483       return Op0.getOperand(0);
48484     }
48485   }
48486 
48487   // If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just
48488   // pre-promote its result type since vXi1 vectors don't get promoted
48489   // during type legalization.
48490   // NOTE: The element count check is to ignore operand types that need to
48491   // go through type promotion to a 128-bit vector.
48492   if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
48493       VT.getVectorElementType() == MVT::i1 &&
48494       (OpVT.getVectorElementType() == MVT::i8 ||
48495        OpVT.getVectorElementType() == MVT::i16)) {
48496     SDValue Setcc = DAG.getSetCC(DL, OpVT, LHS, RHS, CC);
48497     return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
48498   }
48499 
48500   // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
48501   // to avoid scalarization via legalization because v4i32 is not a legal type.
48502   if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
48503       LHS.getValueType() == MVT::v4f32)
48504     return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
48505 
48506   return SDValue();
48507 }
48508 
48509 static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
48510                              TargetLowering::DAGCombinerInfo &DCI,
48511                              const X86Subtarget &Subtarget) {
48512   SDValue Src = N->getOperand(0);
48513   MVT SrcVT = Src.getSimpleValueType();
48514   MVT VT = N->getSimpleValueType(0);
48515   unsigned NumBits = VT.getScalarSizeInBits();
48516   unsigned NumElts = SrcVT.getVectorNumElements();
48517 
48518   // Perform constant folding.
48519   if (ISD::isBuildVectorOfConstantSDNodes(Src.getNode())) {
48520     assert(VT == MVT::i32 && "Unexpected result type");
48521     APInt Imm(32, 0);
48522     for (unsigned Idx = 0, e = Src.getNumOperands(); Idx < e; ++Idx) {
48523       if (!Src.getOperand(Idx).isUndef() &&
48524           Src.getConstantOperandAPInt(Idx).isNegative())
48525         Imm.setBit(Idx);
48526     }
48527     return DAG.getConstant(Imm, SDLoc(N), VT);
48528   }
48529 
48530   // Look through int->fp bitcasts that don't change the element width.
48531   unsigned EltWidth = SrcVT.getScalarSizeInBits();
48532   if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::BITCAST &&
48533       Src.getOperand(0).getScalarValueSizeInBits() == EltWidth)
48534     return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0));
48535 
48536   // Fold movmsk(not(x)) -> not(movmsk(x)) to improve folding of movmsk results
48537   // with scalar comparisons.
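  // Since NOT flips every element's sign bit, movmsk(not(x)) is simply
  // movmsk(x) XOR'ed with a mask that has the low NumElts bits set.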
48538   if (SDValue NotSrc = IsNOT(Src, DAG)) {
48539     SDLoc DL(N);
48540     APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
48541     NotSrc = DAG.getBitcast(SrcVT, NotSrc);
48542     return DAG.getNode(ISD::XOR, DL, VT,
48543                        DAG.getNode(X86ISD::MOVMSK, DL, VT, NotSrc),
48544                        DAG.getConstant(NotMask, DL, VT));
48545   }
48546 
48547   // Fold movmsk(icmp_sgt(x,-1)) -> not(movmsk(x)) to improve folding of movmsk
48548   // results with scalar comparisons.
48549   if (Src.getOpcode() == X86ISD::PCMPGT &&
48550       ISD::isBuildVectorAllOnes(Src.getOperand(1).getNode())) {
48551     SDLoc DL(N);
48552     APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
48553     return DAG.getNode(ISD::XOR, DL, VT,
48554                        DAG.getNode(X86ISD::MOVMSK, DL, VT, Src.getOperand(0)),
48555                        DAG.getConstant(NotMask, DL, VT));
48556   }
48557 
48558   // Simplify the inputs.
48559   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48560   APInt DemandedMask(APInt::getAllOnesValue(NumBits));
48561   if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
48562     return SDValue(N, 0);
48563 
48564   return SDValue();
48565 }
48566 
48567 static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG,
48568                                        TargetLowering::DAGCombinerInfo &DCI) {
48569   // With vector masks we only demand the upper bit of the mask.
48570   SDValue Mask = cast<X86MaskedGatherScatterSDNode>(N)->getMask();
48571   if (Mask.getScalarValueSizeInBits() != 1) {
48572     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48573     APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
48574     if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
48575       if (N->getOpcode() != ISD::DELETED_NODE)
48576         DCI.AddToWorklist(N);
48577       return SDValue(N, 0);
48578     }
48579   }
48580 
48581   return SDValue();
48582 }
48583 
48584 static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS,
48585                                     SDValue Index, SDValue Base, SDValue Scale,
48586                                     SelectionDAG &DAG) {
48587   SDLoc DL(GorS);
48588 
48589   if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
48590     SDValue Ops[] = { Gather->getChain(), Gather->getPassThru(),
48591                       Gather->getMask(), Base, Index, Scale } ;
48592     return DAG.getMaskedGather(Gather->getVTList(),
48593                                Gather->getMemoryVT(), DL, Ops,
48594                                Gather->getMemOperand(),
48595                                Gather->getIndexType(),
48596                                Gather->getExtensionType());
48597   }
48598   auto *Scatter = cast<MaskedScatterSDNode>(GorS);
48599   SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(),
48600                     Scatter->getMask(), Base, Index, Scale };
48601   return DAG.getMaskedScatter(Scatter->getVTList(),
48602                               Scatter->getMemoryVT(), DL,
48603                               Ops, Scatter->getMemOperand(),
48604                               Scatter->getIndexType(),
48605                               Scatter->isTruncatingStore());
48606 }
48607 
48608 static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
48609                                     TargetLowering::DAGCombinerInfo &DCI) {
48610   SDLoc DL(N);
48611   auto *GorS = cast<MaskedGatherScatterSDNode>(N);
48612   SDValue Index = GorS->getIndex();
48613   SDValue Base = GorS->getBasePtr();
48614   SDValue Scale = GorS->getScale();
48615 
48616   if (DCI.isBeforeLegalize()) {
48617     unsigned IndexWidth = Index.getScalarValueSizeInBits();
48618 
48619     // Shrink constant indices if they are larger than 32-bits.
48620     // Only do this before legalize types since v2i64 could become v2i32.
48621     // FIXME: We could check that the type is legal if we're after legalize
48622     // types, but then we would need to construct test cases where that happens.
48623     // FIXME: We could support more than just constant vectors, but we need to be
48624     // careful with costing. A truncate that can be optimized out would be fine.
48625     // Otherwise we might only want to create a truncate if it avoids a split.
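    // For example, a v2i64 build_vector of small constants can be truncated to
    // v2i32, which lets the gather/scatter use a narrower 32-bit index operand.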
48626     if (auto *BV = dyn_cast<BuildVectorSDNode>(Index)) {
48627       if (BV->isConstant() && IndexWidth > 32 &&
48628           DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
48629         EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
48630         Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
48631         return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
48632       }
48633     }
48634 
48635     // Shrink any sign/zero extends from a 32-bit-or-smaller source to a result
48636     // wider than 32 bits if there are sufficient sign bits. Only do this before
48637     // legalize types to avoid creating illegal types in truncate.
48638     if ((Index.getOpcode() == ISD::SIGN_EXTEND ||
48639          Index.getOpcode() == ISD::ZERO_EXTEND) &&
48640         IndexWidth > 32 &&
48641         Index.getOperand(0).getScalarValueSizeInBits() <= 32 &&
48642         DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
48643       EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
48644       Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
48645       return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
48646     }
48647   }
48648 
48649   if (DCI.isBeforeLegalizeOps()) {
48650     unsigned IndexWidth = Index.getScalarValueSizeInBits();
48651 
48652     // Make sure the index is either i32 or i64
48653     if (IndexWidth != 32 && IndexWidth != 64) {
48654       MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32;
48655       EVT IndexVT = Index.getValueType().changeVectorElementType(EltVT);
48656       Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
48657       return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
48658     }
48659   }
48660 
48661   // With vector masks we only demand the upper bit of the mask.
48662   SDValue Mask = GorS->getMask();
48663   if (Mask.getScalarValueSizeInBits() != 1) {
48664     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48665     APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
48666     if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
48667       if (N->getOpcode() != ISD::DELETED_NODE)
48668         DCI.AddToWorklist(N);
48669       return SDValue(N, 0);
48670     }
48671   }
48672 
48673   return SDValue();
48674 }
48675 
48676 // Optimize  RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
48677 static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
48678                                const X86Subtarget &Subtarget) {
48679   SDLoc DL(N);
48680   X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
48681   SDValue EFLAGS = N->getOperand(1);
48682 
48683   // Try to simplify the EFLAGS and condition code operands.
48684   if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
48685     return getSETCC(CC, Flags, DL, DAG);
48686 
48687   return SDValue();
48688 }
48689 
48690 /// Optimize branch condition evaluation.
48691 static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
48692                              const X86Subtarget &Subtarget) {
48693   SDLoc DL(N);
48694   SDValue EFLAGS = N->getOperand(3);
48695   X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
48696 
48697   // Try to simplify the EFLAGS and condition code operands.
48698   // Make sure to not keep references to operands, as combineSetCCEFLAGS can
48699   // RAUW them under us.
48700   if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
48701     SDValue Cond = DAG.getTargetConstant(CC, DL, MVT::i8);
48702     return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
48703                        N->getOperand(1), Cond, Flags);
48704   }
48705 
48706   return SDValue();
48707 }
48708 
48709 // TODO: Could we move this to DAGCombine?
48710 static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
48711                                                   SelectionDAG &DAG) {
48712   // Take advantage of vector comparisons (etc.) producing 0 or -1 in each lane
48713   // to optimize away operation when it's from a constant.
48714   //
48715   // The general transformation is:
48716   //    UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
48717   //       AND(VECTOR_CMP(x,y), constant2)
48718   //    constant2 = UNARYOP(constant)
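  //
  // For example, (v4f32 sint_to_fp (and (vector_cmp a, b), <i32 1,1,1,1>))
  // becomes (v4f32 bitcast (and (vector_cmp a, b), <i32 0x3f800000 x 4>)),
  // selecting between 0.0f and 1.0f without any runtime int-to-fp conversion.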
48719 
48720   // Early exit if this isn't a vector operation, the operand of the
48721   // unary operation isn't a bitwise AND, or if the sizes of the operations
48722   // aren't the same.
48723   EVT VT = N->getValueType(0);
48724   bool IsStrict = N->isStrictFPOpcode();
48725   unsigned NumEltBits = VT.getScalarSizeInBits();
48726   SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
48727   if (!VT.isVector() || Op0.getOpcode() != ISD::AND ||
48728       DAG.ComputeNumSignBits(Op0.getOperand(0)) != NumEltBits ||
48729       VT.getSizeInBits() != Op0.getValueSizeInBits())
48730     return SDValue();
48731 
48732   // Now check that the other operand of the AND is a constant. We could
48733   // make the transformation for non-constant splats as well, but it's unclear
48734   // that would be a benefit as it would not eliminate any operations, just
48735   // perform one more step in scalar code before moving to the vector unit.
48736   if (auto *BV = dyn_cast<BuildVectorSDNode>(Op0.getOperand(1))) {
48737     // Bail out if the vector isn't a constant.
48738     if (!BV->isConstant())
48739       return SDValue();
48740 
48741     // Everything checks out. Build up the new and improved node.
48742     SDLoc DL(N);
48743     EVT IntVT = BV->getValueType(0);
48744     // Create a new constant of the appropriate type for the transformed
48745     // DAG.
48746     SDValue SourceConst;
48747     if (IsStrict)
48748       SourceConst = DAG.getNode(N->getOpcode(), DL, {VT, MVT::Other},
48749                                 {N->getOperand(0), SDValue(BV, 0)});
48750     else
48751       SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
48752     // The AND node needs bitcasts to/from an integer vector type around it.
48753     SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
48754     SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0),
48755                                  MaskConst);
48756     SDValue Res = DAG.getBitcast(VT, NewAnd);
48757     if (IsStrict)
48758       return DAG.getMergeValues({Res, SourceConst.getValue(1)}, DL);
48759     return Res;
48760   }
48761 
48762   return SDValue();
48763 }
48764 
48765 /// If we are converting a value to floating-point, try to replace scalar
48766 /// truncate of an extracted vector element with a bitcast. This tries to keep
48767 /// the sequence on XMM registers rather than moving between vector and GPRs.
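/// For example, (sint_to_fp (trunc i32 (extractelt v2i64 %v, 0))) becomes
/// (sint_to_fp (extractelt (bitcast v4i32 %v), 0)), so the value can stay in
/// the XMM register file.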
48768 static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG) {
48769   // TODO: This is currently only used by combineSIntToFP, but it is generalized
48770   //       to allow being called by any similar cast opcode.
48771   // TODO: Consider merging this into lowering: vectorizeExtractedCast().
48772   SDValue Trunc = N->getOperand(0);
48773   if (!Trunc.hasOneUse() || Trunc.getOpcode() != ISD::TRUNCATE)
48774     return SDValue();
48775 
48776   SDValue ExtElt = Trunc.getOperand(0);
48777   if (!ExtElt.hasOneUse() || ExtElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
48778       !isNullConstant(ExtElt.getOperand(1)))
48779     return SDValue();
48780 
48781   EVT TruncVT = Trunc.getValueType();
48782   EVT SrcVT = ExtElt.getValueType();
48783   unsigned DestWidth = TruncVT.getSizeInBits();
48784   unsigned SrcWidth = SrcVT.getSizeInBits();
48785   if (SrcWidth % DestWidth != 0)
48786     return SDValue();
48787 
48788   // inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0)
48789   EVT SrcVecVT = ExtElt.getOperand(0).getValueType();
48790   unsigned VecWidth = SrcVecVT.getSizeInBits();
48791   unsigned NumElts = VecWidth / DestWidth;
48792   EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, NumElts);
48793   SDValue BitcastVec = DAG.getBitcast(BitcastVT, ExtElt.getOperand(0));
48794   SDLoc DL(N);
48795   SDValue NewExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TruncVT,
48796                                   BitcastVec, ExtElt.getOperand(1));
48797   return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtElt);
48798 }
48799 
48800 static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
48801                                const X86Subtarget &Subtarget) {
48802   bool IsStrict = N->isStrictFPOpcode();
48803   SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
48804   EVT VT = N->getValueType(0);
48805   EVT InVT = Op0.getValueType();
48806 
48807   // UINT_TO_FP(vXi1) -> SINT_TO_FP(ZEXT(vXi1 to vXi32))
48808   // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
48809   // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
48810   if (InVT.isVector() && InVT.getScalarSizeInBits() < 32) {
48811     SDLoc dl(N);
48812     EVT DstVT = InVT.changeVectorElementType(MVT::i32);
48813     SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
48814 
48815     // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
48816     if (IsStrict)
48817       return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
48818                          {N->getOperand(0), P});
48819     return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
48820   }
48821 
48822   // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
48823   // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
48824   // the optimization here.
48825   if (DAG.SignBitIsZero(Op0)) {
48826     if (IsStrict)
48827       return DAG.getNode(ISD::STRICT_SINT_TO_FP, SDLoc(N), {VT, MVT::Other},
48828                          {N->getOperand(0), Op0});
48829     return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
48830   }
48831 
48832   return SDValue();
48833 }
48834 
48835 static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
48836                                TargetLowering::DAGCombinerInfo &DCI,
48837                                const X86Subtarget &Subtarget) {
48838   // First try to optimize away the conversion entirely when it's
48839   // conditionally from a constant. Vectors only.
48840   bool IsStrict = N->isStrictFPOpcode();
48841   if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
48842     return Res;
48843 
48844   // Now move on to more general possibilities.
48845   SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
48846   EVT VT = N->getValueType(0);
48847   EVT InVT = Op0.getValueType();
48848 
48849   // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
48850   // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
48851   // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
48852   if (InVT.isVector() && InVT.getScalarSizeInBits() < 32) {
48853     SDLoc dl(N);
48854     EVT DstVT = InVT.changeVectorElementType(MVT::i32);
48855     SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
48856     if (IsStrict)
48857       return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
48858                          {N->getOperand(0), P});
48859     return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
48860   }
48861 
48862   // Without AVX512DQ we only support i64 to float scalar conversion. For both
48863   // vectors and scalars, see if we know that the upper bits are all the sign
48864   // bit, in which case we can truncate the input to i32 and convert from that.
48865   if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
48866     unsigned BitWidth = InVT.getScalarSizeInBits();
48867     unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
48868     if (NumSignBits >= (BitWidth - 31)) {
48869       EVT TruncVT = MVT::i32;
48870       if (InVT.isVector())
48871         TruncVT = InVT.changeVectorElementType(TruncVT);
48872       SDLoc dl(N);
48873       if (DCI.isBeforeLegalize() || TruncVT != MVT::v2i32) {
48874         SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
48875         if (IsStrict)
48876           return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
48877                              {N->getOperand(0), Trunc});
48878         return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
48879       }
48880       // If we're after legalize and the type is v2i32 we need to shuffle and
48881       // use CVTSI2P.
48882       assert(InVT == MVT::v2i64 && "Unexpected VT!");
48883       SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0);
48884       SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast,
48885                                           { 0, 2, -1, -1 });
48886       if (IsStrict)
48887         return DAG.getNode(X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
48888                            {N->getOperand(0), Shuf});
48889       return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Shuf);
48890     }
48891   }
48892 
48893   // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
48894   // a 32-bit target where SSE doesn't support i64->FP operations.
48895   if (!Subtarget.useSoftFloat() && Subtarget.hasX87() &&
48896       Op0.getOpcode() == ISD::LOAD) {
48897     LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
48898 
48899     // This transformation is not supported if the result type is f16 or f128.
48900     if (VT == MVT::f16 || VT == MVT::f128)
48901       return SDValue();
48902 
48903     // If we have AVX512DQ we can use packed conversion instructions unless
48904     // the VT is f80.
48905     if (Subtarget.hasDQI() && VT != MVT::f80)
48906       return SDValue();
48907 
48908     if (Ld->isSimple() && !VT.isVector() && ISD::isNormalLoad(Op0.getNode()) &&
48909         Op0.hasOneUse() && !Subtarget.is64Bit() && InVT == MVT::i64) {
48910       std::pair<SDValue, SDValue> Tmp =
48911           Subtarget.getTargetLowering()->BuildFILD(
48912               VT, InVT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(),
48913               Ld->getPointerInfo(), Ld->getOriginalAlign(), DAG);
48914       DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Tmp.second);
48915       return Tmp.first;
48916     }
48917   }
48918 
48919   if (IsStrict)
48920     return SDValue();
48921 
48922   if (SDValue V = combineToFPTruncExtElt(N, DAG))
48923     return V;
48924 
48925   return SDValue();
48926 }
48927 
48928 static bool needCarryOrOverflowFlag(SDValue Flags) {
48929   assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
48930 
48931   for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
48932          UI != UE; ++UI) {
48933     SDNode *User = *UI;
48934 
48935     X86::CondCode CC;
48936     switch (User->getOpcode()) {
48937     default:
48938       // Be conservative.
48939       return true;
48940     case X86ISD::SETCC:
48941     case X86ISD::SETCC_CARRY:
48942       CC = (X86::CondCode)User->getConstantOperandVal(0);
48943       break;
48944     case X86ISD::BRCOND:
48945       CC = (X86::CondCode)User->getConstantOperandVal(2);
48946       break;
48947     case X86ISD::CMOV:
48948       CC = (X86::CondCode)User->getConstantOperandVal(2);
48949       break;
48950     }
48951 
48952     switch (CC) {
48953     default: break;
48954     case X86::COND_A: case X86::COND_AE:
48955     case X86::COND_B: case X86::COND_BE:
48956     case X86::COND_O: case X86::COND_NO:
48957     case X86::COND_G: case X86::COND_GE:
48958     case X86::COND_L: case X86::COND_LE:
48959       return true;
48960     }
48961   }
48962 
48963   return false;
48964 }
48965 
48966 static bool onlyZeroFlagUsed(SDValue Flags) {
48967   assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
48968 
48969   for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
48970          UI != UE; ++UI) {
48971     SDNode *User = *UI;
48972 
48973     unsigned CCOpNo;
48974     switch (User->getOpcode()) {
48975     default:
48976       // Be conservative.
48977       return false;
48978     case X86ISD::SETCC:       CCOpNo = 0; break;
48979     case X86ISD::SETCC_CARRY: CCOpNo = 0; break;
48980     case X86ISD::BRCOND:      CCOpNo = 2; break;
48981     case X86ISD::CMOV:        CCOpNo = 2; break;
48982     }
48983 
48984     X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
48985     if (CC != X86::COND_E && CC != X86::COND_NE)
48986       return false;
48987   }
48988 
48989   return true;
48990 }
48991 
48992 static SDValue combineCMP(SDNode *N, SelectionDAG &DAG) {
48993   // Only handle test patterns.
48994   if (!isNullConstant(N->getOperand(1)))
48995     return SDValue();
48996 
48997   // If we have a CMP of a truncated binop, see if we can make a smaller binop
48998   // and use its flags directly.
48999   // TODO: Maybe we should try promoting compares that only use the zero flag
49000   // first if we can prove the upper bits with computeKnownBits?
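        // For example (illustrative), provided the truncate and the binop have no
        // other uses, (X86cmp (i8 (trunc (i32 (xor A, B)))), 0) can be replaced by
        // the flag result of a narrowed i8 X86ISD::XOR of the truncated operands,
        // removing both the wide XOR and the explicit compare.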
49001   SDLoc dl(N);
49002   SDValue Op = N->getOperand(0);
49003   EVT VT = Op.getValueType();
49004 
49005   // If we have a constant logical shift that's only used in a comparison
49006   // against zero, turn it into an equivalent AND. This allows turning it into
49007   // a TEST instruction later.
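        // For example (illustrative), assuming only the zero flag is consumed:
        //   (X86cmp (srl (i32 X), 16), 0) --> (X86cmp (and X, 0xFFFF0000), 0)
        // which isel can later match as a TEST against the immediate mask.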
49008   if ((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) &&
49009       Op.hasOneUse() && isa<ConstantSDNode>(Op.getOperand(1)) &&
49010       onlyZeroFlagUsed(SDValue(N, 0))) {
49011     unsigned BitWidth = VT.getSizeInBits();
49012     const APInt &ShAmt = Op.getConstantOperandAPInt(1);
49013     if (ShAmt.ult(BitWidth)) { // Avoid undefined shifts.
49014       unsigned MaskBits = BitWidth - ShAmt.getZExtValue();
49015       APInt Mask = Op.getOpcode() == ISD::SRL
49016                        ? APInt::getHighBitsSet(BitWidth, MaskBits)
49017                        : APInt::getLowBitsSet(BitWidth, MaskBits);
49018       if (Mask.isSignedIntN(32)) {
49019         Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
49020                          DAG.getConstant(Mask, dl, VT));
49021         return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
49022                            DAG.getConstant(0, dl, VT));
49023       }
49024     }
49025   }
49026 
49027   // Look for a truncate.
49028   if (Op.getOpcode() != ISD::TRUNCATE)
49029     return SDValue();
49030 
49031   SDValue Trunc = Op;
49032   Op = Op.getOperand(0);
49033 
49034   // See if we can compare with zero against the truncation source,
49035   // which should help using the Z flag from many ops. Only do this when the
49036   // truncated op is i32 to prevent partial-reg compares of promoted ops.
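        // For example (illustrative), if only the zero flag is used and the upper
        // 24 bits of the i32 source are known zero:
        //   (X86cmp (i8 (trunc (i32 X))), 0) --> (X86cmp (i32 X), 0)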
49037   EVT OpVT = Op.getValueType();
49038   APInt UpperBits =
49039       APInt::getBitsSetFrom(OpVT.getSizeInBits(), VT.getSizeInBits());
49040   if (OpVT == MVT::i32 && DAG.MaskedValueIsZero(Op, UpperBits) &&
49041       onlyZeroFlagUsed(SDValue(N, 0))) {
49042     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
49043                        DAG.getConstant(0, dl, OpVT));
49044   }
49045 
49046   // After this the truncate and arithmetic op must have a single use.
49047   if (!Trunc.hasOneUse() || !Op.hasOneUse())
49048     return SDValue();
49049 
49050   unsigned NewOpc;
49051   switch (Op.getOpcode()) {
49052   default: return SDValue();
49053   case ISD::AND:
49054   // Skip AND with a constant. We have special handling for AND with an
49055   // immediate during isel to generate TEST instructions.
49056     if (isa<ConstantSDNode>(Op.getOperand(1)))
49057       return SDValue();
49058     NewOpc = X86ISD::AND;
49059     break;
49060   case ISD::OR:  NewOpc = X86ISD::OR;  break;
49061   case ISD::XOR: NewOpc = X86ISD::XOR; break;
49062   case ISD::ADD:
49063     // If the carry or overflow flag is used, we can't truncate.
49064     if (needCarryOrOverflowFlag(SDValue(N, 0)))
49065       return SDValue();
49066     NewOpc = X86ISD::ADD;
49067     break;
49068   case ISD::SUB:
49069     // If the carry or overflow flag is used, we can't truncate.
49070     if (needCarryOrOverflowFlag(SDValue(N, 0)))
49071       return SDValue();
49072     NewOpc = X86ISD::SUB;
49073     break;
49074   }
49075 
49076   // We found an op we can narrow. Truncate its inputs.
49077   SDValue Op0 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(0));
49078   SDValue Op1 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(1));
49079 
49080   // Use an X86-specific opcode to avoid DAG combine messing with it.
49081   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
49082   Op = DAG.getNode(NewOpc, dl, VTs, Op0, Op1);
49083 
49084   // For AND, keep a CMP so that we can match the test pattern.
49085   if (NewOpc == X86ISD::AND)
49086     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
49087                        DAG.getConstant(0, dl, VT));
49088 
49089   // Return the flags.
49090   return Op.getValue(1);
49091 }
49092 
49093 static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG,
49094                                 TargetLowering::DAGCombinerInfo &DCI) {
49095   assert((X86ISD::ADD == N->getOpcode() || X86ISD::SUB == N->getOpcode()) &&
49096          "Expected X86ISD::ADD or X86ISD::SUB");
49097 
49098   SDLoc DL(N);
49099   SDValue LHS = N->getOperand(0);
49100   SDValue RHS = N->getOperand(1);
49101   MVT VT = LHS.getSimpleValueType();
49102   unsigned GenericOpc = X86ISD::ADD == N->getOpcode() ? ISD::ADD : ISD::SUB;
49103 
49104   // If we don't use the flag result, simplify back to a generic ADD/SUB.
49105   if (!N->hasAnyUseOfValue(1)) {
49106     SDValue Res = DAG.getNode(GenericOpc, DL, VT, LHS, RHS);
49107     return DAG.getMergeValues({Res, DAG.getConstant(0, DL, MVT::i32)}, DL);
49108   }
49109 
49110   // Fold any similar generic ADD/SUB opcodes to reuse this node.
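        // For example, if the DAG also contains a generic (add LHS, RHS) with the
        // same operands as this X86ISD::ADD, the generic node is replaced by this
        // node's value result so the arithmetic is only computed once; a commuted
        // generic SUB can be matched as well by negating this node's result.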
49111   auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) {
49112     SDValue Ops[] = {N0, N1};
49113     SDVTList VTs = DAG.getVTList(N->getValueType(0));
49114     if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) {
49115       SDValue Op(N, 0);
49116       if (Negate)
49117         Op = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Op);
49118       DCI.CombineTo(GenericAddSub, Op);
49119     }
49120   };
49121   MatchGeneric(LHS, RHS, false);
49122   MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());
49123 
49124   return SDValue();
49125 }
49126 
49127 static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
49128   if (SDValue Flags = combineCarryThroughADD(N->getOperand(2), DAG)) {
49129     MVT VT = N->getSimpleValueType(0);
49130     SDVTList VTs = DAG.getVTList(VT, MVT::i32);
49131     return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs,
49132                        N->getOperand(0), N->getOperand(1),
49133                        Flags);
49134   }
49135 
49136   // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry)
49137   // iff the flag result is dead.
49138   SDValue Op0 = N->getOperand(0);
49139   SDValue Op1 = N->getOperand(1);
49140   if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op1) &&
49141       !N->hasAnyUseOfValue(1))
49142     return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), Op0.getOperand(0),
49143                        Op0.getOperand(1), N->getOperand(2));
49144 
49145   return SDValue();
49146 }
49147 
49148 // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
49149 static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
49150                           TargetLowering::DAGCombinerInfo &DCI) {
49151   // If the LHS and RHS of the ADC node are zero, then it can't overflow and
49152   // the result is either zero or one (depending on the input carry bit).
49153   // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
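        // For example (illustrative), (X86ISD::ADC 0, 0, EFLAGS) produces exactly
        // the incoming carry bit, so it can be rewritten as
        //   (and (X86setcc_carry COND_B, EFLAGS), 1)
        // which typically materializes as "sbb %reg, %reg; and $1, %reg".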
49154   if (X86::isZeroNode(N->getOperand(0)) &&
49155       X86::isZeroNode(N->getOperand(1)) &&
49156       // We don't have a good way to replace an EFLAGS use, so only do this
49157       // when the flag result is currently dead.
49158       SDValue(N, 1).use_empty()) {
49159     SDLoc DL(N);
49160     EVT VT = N->getValueType(0);
49161     SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
49162     SDValue Res1 =
49163         DAG.getNode(ISD::AND, DL, VT,
49164                     DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
49165                                 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
49166                                 N->getOperand(2)),
49167                     DAG.getConstant(1, DL, VT));
49168     return DCI.CombineTo(N, Res1, CarryOut);
49169   }
49170 
49171   if (SDValue Flags = combineCarryThroughADD(N->getOperand(2), DAG)) {
49172     MVT VT = N->getSimpleValueType(0);
49173     SDVTList VTs = DAG.getVTList(VT, MVT::i32);
49174     return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs,
49175                        N->getOperand(0), N->getOperand(1),
49176                        Flags);
49177   }
49178 
49179   return SDValue();
49180 }
49181 
49182 /// If this is an add or subtract where one operand is produced by a cmp+setcc,
49183 /// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
49184 /// with CMP+{ADC, SBB}.
49185 static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
49186   bool IsSub = N->getOpcode() == ISD::SUB;
49187   SDValue X = N->getOperand(0);
49188   SDValue Y = N->getOperand(1);
49189 
49190   // If this is an add, canonicalize a zext operand to the RHS.
49191   // TODO: Incomplete? What if both sides are zexts?
49192   if (!IsSub && X.getOpcode() == ISD::ZERO_EXTEND &&
49193       Y.getOpcode() != ISD::ZERO_EXTEND)
49194     std::swap(X, Y);
49195 
49196   // Look through a one-use zext.
49197   bool PeekedThroughZext = false;
49198   if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) {
49199     Y = Y.getOperand(0);
49200     PeekedThroughZext = true;
49201   }
49202 
49203   // If this is an add, canonicalize a setcc operand to the RHS.
49204   // TODO: Incomplete? What if both sides are setcc?
49205   // TODO: Should we allow peeking through a zext of the other operand?
49206   if (!IsSub && !PeekedThroughZext && X.getOpcode() == X86ISD::SETCC &&
49207       Y.getOpcode() != X86ISD::SETCC)
49208     std::swap(X, Y);
49209 
49210   if (Y.getOpcode() != X86ISD::SETCC || !Y.hasOneUse())
49211     return SDValue();
49212 
49213   SDLoc DL(N);
49214   EVT VT = N->getValueType(0);
49215   X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0);
49216 
49217   // If X is -1 or 0, then we have an opportunity to avoid constants required in
49218   // the general case below.
49219   auto *ConstantX = dyn_cast<ConstantSDNode>(X);
49220   if (ConstantX) {
49221     if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnesValue()) ||
49222         (IsSub && CC == X86::COND_B && ConstantX->isNullValue())) {
49223       // This is a complicated way to get -1 or 0 from the carry flag:
49224       // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
49225       //  0 - SETB  -->  0 -  (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
49226       return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
49227                          DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
49228                          Y.getOperand(1));
49229     }
49230 
49231     if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnesValue()) ||
49232         (IsSub && CC == X86::COND_A && ConstantX->isNullValue())) {
49233       SDValue EFLAGS = Y->getOperand(1);
49234       if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
49235           EFLAGS.getValueType().isInteger() &&
49236           !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
49237         // Swap the operands of a SUB, and we have the same pattern as above.
49238         // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
49239         //  0 - SETA  (SUB A, B) -->  0 - SETB  (SUB B, A) --> SUB + SBB
49240         SDValue NewSub = DAG.getNode(
49241             X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
49242             EFLAGS.getOperand(1), EFLAGS.getOperand(0));
49243         SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
49244         return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
49245                            DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
49246                            NewEFLAGS);
49247       }
49248     }
49249   }
49250 
49251   if (CC == X86::COND_B) {
49252     // X + SETB Z --> adc X, 0
49253     // X - SETB Z --> sbb X, 0
49254     return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
49255                        DAG.getVTList(VT, MVT::i32), X,
49256                        DAG.getConstant(0, DL, VT), Y.getOperand(1));
49257   }
49258 
49259   if (CC == X86::COND_A) {
49260     SDValue EFLAGS = Y.getOperand(1);
49261     // Try to convert COND_A into COND_B in an attempt to facilitate
49262     // materializing "setb reg".
49263     //
49264     // Do not flip "e > c", where "c" is a constant, because the CMP
49265     // instruction cannot take an immediate as its first operand.
49266     //
49267     if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
49268         EFLAGS.getValueType().isInteger() &&
49269         !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
49270       SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
49271                                    EFLAGS.getNode()->getVTList(),
49272                                    EFLAGS.getOperand(1), EFLAGS.getOperand(0));
49273       SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
49274       return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
49275                          DAG.getVTList(VT, MVT::i32), X,
49276                          DAG.getConstant(0, DL, VT), NewEFLAGS);
49277     }
49278   }
49279 
49280   if (CC == X86::COND_AE) {
49281     // X + SETAE --> sbb X, -1
49282     // X - SETAE --> adc X, -1
49283     return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
49284                        DAG.getVTList(VT, MVT::i32), X,
49285                        DAG.getConstant(-1, DL, VT), Y.getOperand(1));
49286   }
49287 
49288   if (CC == X86::COND_BE) {
49289     // X + SETBE --> sbb X, -1
49290     // X - SETBE --> adc X, -1
49291     SDValue EFLAGS = Y.getOperand(1);
49292     // Try to convert COND_BE into COND_AE in an attempt to facilitate
49293     // materializing "setae reg".
49294     //
49295     // Do not flip "e <= c", where "c" is a constant, because the CMP
49296     // instruction cannot take an immediate as its first operand.
49297     //
49298     if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
49299         EFLAGS.getValueType().isInteger() &&
49300         !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
49301       SDValue NewSub = DAG.getNode(
49302           X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
49303           EFLAGS.getOperand(1), EFLAGS.getOperand(0));
49304       SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
49305       return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
49306                          DAG.getVTList(VT, MVT::i32), X,
49307                          DAG.getConstant(-1, DL, VT), NewEFLAGS);
49308     }
49309   }
49310 
49311   if (CC != X86::COND_E && CC != X86::COND_NE)
49312     return SDValue();
49313 
49314   SDValue Cmp = Y.getOperand(1);
49315   if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
49316       !X86::isZeroNode(Cmp.getOperand(1)) ||
49317       !Cmp.getOperand(0).getValueType().isInteger())
49318     return SDValue();
49319 
49320   SDValue Z = Cmp.getOperand(0);
49321   EVT ZVT = Z.getValueType();
49322 
49323   // If X is -1 or 0, then we have an opportunity to avoid constants required in
49324   // the general case below.
49325   if (ConstantX) {
49326     // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
49327     // fake operands:
49328     //  0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
49329     // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
49330     if ((IsSub && CC == X86::COND_NE && ConstantX->isNullValue()) ||
49331         (!IsSub && CC == X86::COND_E && ConstantX->isAllOnesValue())) {
49332       SDValue Zero = DAG.getConstant(0, DL, ZVT);
49333       SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
49334       SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
49335       return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
49336                          DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
49337                          SDValue(Neg.getNode(), 1));
49338     }
49339 
49340     // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
49341     // with fake operands:
49342     //  0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
49343     // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
49344     if ((IsSub && CC == X86::COND_E && ConstantX->isNullValue()) ||
49345         (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnesValue())) {
49346       SDValue One = DAG.getConstant(1, DL, ZVT);
49347       SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
49348       SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
49349       return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
49350                          DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
49351                          Cmp1.getValue(1));
49352     }
49353   }
49354 
49355   // (cmp Z, 1) sets the carry flag if Z is 0.
49356   SDValue One = DAG.getConstant(1, DL, ZVT);
49357   SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
49358   SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
49359 
49360   // Add the flags type for ADC/SBB nodes.
49361   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
49362 
49363   // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
49364   // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
49365   if (CC == X86::COND_NE)
49366     return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
49367                        DAG.getConstant(-1ULL, DL, VT), Cmp1.getValue(1));
49368 
49369   // X - (Z == 0) --> sub X, (zext(sete  Z, 0)) --> sbb X, 0, (cmp Z, 1)
49370   // X + (Z == 0) --> add X, (zext(sete  Z, 0)) --> adc X, 0, (cmp Z, 1)
49371   return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
49372                      DAG.getConstant(0, DL, VT), Cmp1.getValue(1));
49373 }
49374 
49375 static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
49376                             const SDLoc &DL, EVT VT,
49377                             const X86Subtarget &Subtarget) {
49378   // Example of pattern we try to detect:
49379   // t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
49380   //(add (build_vector (extract_elt t, 0),
49381   //                   (extract_elt t, 2),
49382   //                   (extract_elt t, 4),
49383   //                   (extract_elt t, 6)),
49384   //     (build_vector (extract_elt t, 1),
49385   //                   (extract_elt t, 3),
49386   //                   (extract_elt t, 5),
49387   //                   (extract_elt t, 7)))
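        // In other words, each i32 result element is the sum of a pair of adjacent
        // sext(i16) * sext(i16) products, which is exactly what PMADDWD computes
        // once the inputs are narrowed back to vXi16.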
49388 
49389   if (!Subtarget.hasSSE2())
49390     return SDValue();
49391 
49392   if (Op0.getOpcode() != ISD::BUILD_VECTOR ||
49393       Op1.getOpcode() != ISD::BUILD_VECTOR)
49394     return SDValue();
49395 
49396   if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
49397       VT.getVectorNumElements() < 4 ||
49398       !isPowerOf2_32(VT.getVectorNumElements()))
49399     return SDValue();
49400 
49401   // Check if one of Op0,Op1 is of the form:
49402   // (build_vector (extract_elt Mul, 0),
49403   //               (extract_elt Mul, 2),
49404   //               (extract_elt Mul, 4),
49405   //                   ...
49406   // the other is of the form:
49407   // (build_vector (extract_elt Mul, 1),
49408   //               (extract_elt Mul, 3),
49409   //               (extract_elt Mul, 5),
49410   //                   ...
49411   // and identify Mul.
49412   SDValue Mul;
49413   for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
49414     SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
49415             Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
49416     // TODO: Be more tolerant to undefs.
49417     if (Op0L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
49418         Op1L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
49419         Op0H.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
49420         Op1H.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
49421       return SDValue();
49422     auto *Const0L = dyn_cast<ConstantSDNode>(Op0L->getOperand(1));
49423     auto *Const1L = dyn_cast<ConstantSDNode>(Op1L->getOperand(1));
49424     auto *Const0H = dyn_cast<ConstantSDNode>(Op0H->getOperand(1));
49425     auto *Const1H = dyn_cast<ConstantSDNode>(Op1H->getOperand(1));
49426     if (!Const0L || !Const1L || !Const0H || !Const1H)
49427       return SDValue();
49428     unsigned Idx0L = Const0L->getZExtValue(), Idx1L = Const1L->getZExtValue(),
49429              Idx0H = Const0H->getZExtValue(), Idx1H = Const1H->getZExtValue();
49430     // Commutativity of mul allows factors of a product to reorder.
49431     if (Idx0L > Idx1L)
49432       std::swap(Idx0L, Idx1L);
49433     if (Idx0H > Idx1H)
49434       std::swap(Idx0H, Idx1H);
49435     // Commutativity of add allows pairs of factors to reorder.
49436     if (Idx0L > Idx0H) {
49437       std::swap(Idx0L, Idx0H);
49438       std::swap(Idx1L, Idx1H);
49439     }
49440     if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 ||
49441         Idx1H != 2 * i + 3)
49442       return SDValue();
49443     if (!Mul) {
49444       // First time an extract_elt's source vector is visited. It must be a MUL
49445       // with twice as many vector elements as the BUILD_VECTOR.
49446       // Both extracts must be from the same MUL.
49447       Mul = Op0L->getOperand(0);
49448       if (Mul->getOpcode() != ISD::MUL ||
49449           Mul.getValueType().getVectorNumElements() != 2 * e)
49450         return SDValue();
49451     }
49452     // Check that the extract is from the same MUL previously seen.
49453     if (Mul != Op0L->getOperand(0) || Mul != Op1L->getOperand(0) ||
49454         Mul != Op0H->getOperand(0) || Mul != Op1H->getOperand(0))
49455       return SDValue();
49456   }
49457 
49458   // Check if the Mul source can be safely shrunk.
49459   ShrinkMode Mode;
49460   if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) ||
49461       Mode == ShrinkMode::MULU16)
49462     return SDValue();
49463 
49464   EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
49465                                  VT.getVectorNumElements() * 2);
49466   SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(0));
49467   SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(1));
49468 
49469   auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
49470                          ArrayRef<SDValue> Ops) {
49471     EVT InVT = Ops[0].getValueType();
49472     assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
49473     EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
49474                                  InVT.getVectorNumElements() / 2);
49475     return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
49476   };
49477   return SplitOpsAndApply(DAG, Subtarget, DL, VT, { N0, N1 }, PMADDBuilder);
49478 }
49479 
49480 // Attempt to turn this pattern into PMADDWD.
49481 // (add (mul (sext (build_vector)), (sext (build_vector))),
49482 //      (mul (sext (build_vector)), (sext (build_vector)))
49483 static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
49484                               const SDLoc &DL, EVT VT,
49485                               const X86Subtarget &Subtarget) {
49486   if (!Subtarget.hasSSE2())
49487     return SDValue();
49488 
49489   if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
49490     return SDValue();
49491 
49492   if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
49493       VT.getVectorNumElements() < 4 ||
49494       !isPowerOf2_32(VT.getVectorNumElements()))
49495     return SDValue();
49496 
49497   SDValue N00 = N0.getOperand(0);
49498   SDValue N01 = N0.getOperand(1);
49499   SDValue N10 = N1.getOperand(0);
49500   SDValue N11 = N1.getOperand(1);
49501 
49502   // All inputs need to be sign extends.
49503   // TODO: Support ZERO_EXTEND from known positive?
49504   if (N00.getOpcode() != ISD::SIGN_EXTEND ||
49505       N01.getOpcode() != ISD::SIGN_EXTEND ||
49506       N10.getOpcode() != ISD::SIGN_EXTEND ||
49507       N11.getOpcode() != ISD::SIGN_EXTEND)
49508     return SDValue();
49509 
49510   // Peek through the extends.
49511   N00 = N00.getOperand(0);
49512   N01 = N01.getOperand(0);
49513   N10 = N10.getOperand(0);
49514   N11 = N11.getOperand(0);
49515 
49516   // Must be extending from vXi16.
49517   EVT InVT = N00.getValueType();
49518   if (InVT.getVectorElementType() != MVT::i16 || N01.getValueType() != InVT ||
49519       N10.getValueType() != InVT || N11.getValueType() != InVT)
49520     return SDValue();
49521 
49522   // All inputs should be build_vectors.
49523   if (N00.getOpcode() != ISD::BUILD_VECTOR ||
49524       N01.getOpcode() != ISD::BUILD_VECTOR ||
49525       N10.getOpcode() != ISD::BUILD_VECTOR ||
49526       N11.getOpcode() != ISD::BUILD_VECTOR)
49527     return SDValue();
49528 
49529   // For each result element, we need an odd element from one vector
49530   // multiplied by the odd element of the other vector, plus the even element
49531   // from one of the same vectors multiplied by the even element from the
49532   // other vector. So we need to make sure that for each element i, this
49533   // operation is performed:
49534   //  A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
49535   SDValue In0, In1;
49536   for (unsigned i = 0; i != N00.getNumOperands(); ++i) {
49537     SDValue N00Elt = N00.getOperand(i);
49538     SDValue N01Elt = N01.getOperand(i);
49539     SDValue N10Elt = N10.getOperand(i);
49540     SDValue N11Elt = N11.getOperand(i);
49541     // TODO: Be more tolerant to undefs.
49542     if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
49543         N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
49544         N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
49545         N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
49546       return SDValue();
49547     auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
49548     auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
49549     auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
49550     auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
49551     if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
49552       return SDValue();
49553     unsigned IdxN00 = ConstN00Elt->getZExtValue();
49554     unsigned IdxN01 = ConstN01Elt->getZExtValue();
49555     unsigned IdxN10 = ConstN10Elt->getZExtValue();
49556     unsigned IdxN11 = ConstN11Elt->getZExtValue();
49557     // Add is commutative so indices can be reordered.
49558     if (IdxN00 > IdxN10) {
49559       std::swap(IdxN00, IdxN10);
49560       std::swap(IdxN01, IdxN11);
49561     }
49562     // N0 indices must be the even elements; N1 indices the next odd elements.
49563     if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
49564         IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
49565       return SDValue();
49566     SDValue N00In = N00Elt.getOperand(0);
49567     SDValue N01In = N01Elt.getOperand(0);
49568     SDValue N10In = N10Elt.getOperand(0);
49569     SDValue N11In = N11Elt.getOperand(0);
49570 
49571     // First time we find an input capture it.
49572     if (!In0) {
49573       In0 = N00In;
49574       In1 = N01In;
49575 
49576       // The input vectors must be at least as wide as the output.
49577       // If they are larger than the output, we extract a subvector below.
49578       if (In0.getValueSizeInBits() < VT.getSizeInBits() ||
49579           In1.getValueSizeInBits() < VT.getSizeInBits())
49580         return SDValue();
49581     }
49582     // Mul is commutative so the input vectors can be in any order.
49583     // Canonicalize to make the compares easier.
49584     if (In0 != N00In)
49585       std::swap(N00In, N01In);
49586     if (In0 != N10In)
49587       std::swap(N10In, N11In);
49588     if (In0 != N00In || In1 != N01In || In0 != N10In || In1 != N11In)
49589       return SDValue();
49590   }
49591 
49592   auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
49593                          ArrayRef<SDValue> Ops) {
49594     EVT OpVT = Ops[0].getValueType();
49595     assert(OpVT.getScalarType() == MVT::i16 &&
49596            "Unexpected scalar element type");
49597     assert(OpVT == Ops[1].getValueType() && "Operands' types mismatch");
49598     EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
49599                                  OpVT.getVectorNumElements() / 2);
49600     return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
49601   };
49602 
49603   // If the output is narrower than an input, extract the low part of the input
49604   // vector.
49605   EVT OutVT16 = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
49606                                VT.getVectorNumElements() * 2);
49607   if (OutVT16.bitsLT(In0.getValueType())) {
49608     In0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In0,
49609                       DAG.getIntPtrConstant(0, DL));
49610   }
49611   if (OutVT16.bitsLT(In1.getValueType())) {
49612     In1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In1,
49613                       DAG.getIntPtrConstant(0, DL));
49614   }
49615   return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
49616                           PMADDBuilder);
49617 }
49618 
49619 static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
49620                           TargetLowering::DAGCombinerInfo &DCI,
49621                           const X86Subtarget &Subtarget) {
49622   EVT VT = N->getValueType(0);
49623   SDValue Op0 = N->getOperand(0);
49624   SDValue Op1 = N->getOperand(1);
49625 
49626   if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, SDLoc(N), VT, Subtarget))
49627     return MAdd;
49628   if (SDValue MAdd = matchPMADDWD_2(DAG, Op0, Op1, SDLoc(N), VT, Subtarget))
49629     return MAdd;
49630 
49631   // Try to synthesize horizontal adds from adds of shuffles.
49632   if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
49633     return V;
49634 
49635   // If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into
49636   // (sub Y, (sext (vXi1 X))).
49637   // FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y) in
49638   // generic DAG combine without a legal type check, but adding this there
49639   // caused regressions.
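        // For example (illustrative), with a legal vXi1 mask M:
        //   (add (zext M), Y) --> (sub Y, (sext M))
        // since zext(M) is elementwise 0/1 and sext(M) is the matching 0/-1.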
49640   if (VT.isVector()) {
49641     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49642     if (Op0.getOpcode() == ISD::ZERO_EXTEND &&
49643         Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
49644         TLI.isTypeLegal(Op0.getOperand(0).getValueType())) {
49645       SDLoc DL(N);
49646       SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op0.getOperand(0));
49647       return DAG.getNode(ISD::SUB, DL, VT, Op1, SExt);
49648     }
49649 
49650     if (Op1.getOpcode() == ISD::ZERO_EXTEND &&
49651         Op1.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
49652         TLI.isTypeLegal(Op1.getOperand(0).getValueType())) {
49653       SDLoc DL(N);
49654       SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op1.getOperand(0));
49655       return DAG.getNode(ISD::SUB, DL, VT, Op0, SExt);
49656     }
49657   }
49658 
49659   return combineAddOrSubToADCOrSBB(N, DAG);
49660 }
49661 
49662 static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
49663                           TargetLowering::DAGCombinerInfo &DCI,
49664                           const X86Subtarget &Subtarget) {
49665   SDValue Op0 = N->getOperand(0);
49666   SDValue Op1 = N->getOperand(1);
49667 
49668   // TODO: Add NoOpaque handling to isConstantIntBuildVectorOrConstantInt.
49669   auto IsNonOpaqueConstant = [&](SDValue Op) {
49670     if (SDNode *C = DAG.isConstantIntBuildVectorOrConstantInt(Op)) {
49671       if (auto *Cst = dyn_cast<ConstantSDNode>(C))
49672         return !Cst->isOpaque();
49673       return true;
49674     }
49675     return false;
49676   };
49677 
49678   // X86 can't encode an immediate LHS of a sub. See if we can push the
49679   // negation into a preceding instruction. If the RHS of the sub is a XOR with
49680   // one use and a constant, invert the immediate, saving one register.
49681   // sub(C1, xor(X, C2)) -> add(xor(X, ~C2), C1+1)
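        // For example (illustrative): sub(5, xor(X, 1)) -> add(xor(X, ~1), 6),
        // since C1 - (X ^ C2) == (X ^ ~C2) + (C1 + 1) in two's complement.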
49682   if (Op1.getOpcode() == ISD::XOR && IsNonOpaqueConstant(Op0) &&
49683       IsNonOpaqueConstant(Op1.getOperand(1)) && Op1->hasOneUse()) {
49684     SDLoc DL(N);
49685     EVT VT = Op0.getValueType();
49686     SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT, Op1.getOperand(0),
49687                                  DAG.getNOT(SDLoc(Op1), Op1.getOperand(1), VT));
49688     SDValue NewAdd =
49689         DAG.getNode(ISD::ADD, DL, VT, Op0, DAG.getConstant(1, DL, VT));
49690     return DAG.getNode(ISD::ADD, DL, VT, NewXor, NewAdd);
49691   }
49692 
49693   // Try to synthesize horizontal subs from subs of shuffles.
49694   if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
49695     return V;
49696 
49697   return combineAddOrSubToADCOrSBB(N, DAG);
49698 }
49699 
49700 static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
49701                                     const X86Subtarget &Subtarget) {
49702   MVT VT = N->getSimpleValueType(0);
49703   SDLoc DL(N);
49704 
49705   if (N->getOperand(0) == N->getOperand(1)) {
49706     if (N->getOpcode() == X86ISD::PCMPEQ)
49707       return DAG.getConstant(-1, DL, VT);
49708     if (N->getOpcode() == X86ISD::PCMPGT)
49709       return DAG.getConstant(0, DL, VT);
49710   }
49711 
49712   return SDValue();
49713 }
49714 
49715 /// Helper that combines an array of subvector ops as if they were the operands
49716 /// of a ISD::CONCAT_VECTORS node, but may have come from another source (e.g.
49717 /// ISD::INSERT_SUBVECTOR). The ops are assumed to be of the same type.
49718 static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
49719                                       ArrayRef<SDValue> Ops, SelectionDAG &DAG,
49720                                       TargetLowering::DAGCombinerInfo &DCI,
49721                                       const X86Subtarget &Subtarget) {
49722   assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");
49723   unsigned EltSizeInBits = VT.getScalarSizeInBits();
49724 
49725   if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
49726     return DAG.getUNDEF(VT);
49727 
49728   if (llvm::all_of(Ops, [](SDValue Op) {
49729         return ISD::isBuildVectorAllZeros(Op.getNode());
49730       }))
49731     return getZeroVector(VT, Subtarget, DAG, DL);
49732 
49733   SDValue Op0 = Ops[0];
49734   bool IsSplat = llvm::all_of(Ops, [&Op0](SDValue Op) { return Op == Op0; });
49735 
49736   // Repeated subvectors.
49737   if (IsSplat &&
49738       (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
49739     // If this broadcast is inserted into both halves, use a larger broadcast.
49740     if (Op0.getOpcode() == X86ISD::VBROADCAST)
49741       return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));
49742 
49743     // If this scalar/subvector broadcast_load is inserted into both halves, use
49744     // a larger broadcast_load. Update other uses to use an extracted subvector.
49745     if (Op0.getOpcode() == X86ISD::VBROADCAST_LOAD ||
49746         Op0.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
49747       auto *MemIntr = cast<MemIntrinsicSDNode>(Op0);
49748       SDVTList Tys = DAG.getVTList(VT, MVT::Other);
49749       SDValue Ops[] = {MemIntr->getChain(), MemIntr->getBasePtr()};
49750       SDValue BcastLd = DAG.getMemIntrinsicNode(Op0.getOpcode(), DL, Tys, Ops,
49751                                                 MemIntr->getMemoryVT(),
49752                                                 MemIntr->getMemOperand());
49753       DAG.ReplaceAllUsesOfValueWith(
49754           Op0, extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits()));
49755       DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
49756       return BcastLd;
49757     }
49758 
49759     // If this is a simple subvector load repeated across multiple lanes, then
49760     // broadcast the load. Update other uses to use an extracted subvector.
49761     if (auto *Ld = dyn_cast<LoadSDNode>(Op0)) {
49762       if (Ld->isSimple() && !Ld->isNonTemporal() &&
49763           Ld->getExtensionType() == ISD::NON_EXTLOAD) {
49764         SDVTList Tys = DAG.getVTList(VT, MVT::Other);
49765         SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
49766         SDValue BcastLd =
49767             DAG.getMemIntrinsicNode(X86ISD::SUBV_BROADCAST_LOAD, DL, Tys, Ops,
49768                                     Ld->getMemoryVT(), Ld->getMemOperand());
49769         DAG.ReplaceAllUsesOfValueWith(
49770             Op0,
49771             extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits()));
49772         DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), BcastLd.getValue(1));
49773         return BcastLd;
49774       }
49775     }
49776 
49777     // concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
49778     if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
49779         (Subtarget.hasAVX2() || MayFoldLoad(Op0.getOperand(0))))
49780       return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
49781                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
49782                                      Op0.getOperand(0),
49783                                      DAG.getIntPtrConstant(0, DL)));
49784 
49785     // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
49786     if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
49787         (Subtarget.hasAVX2() ||
49788          (EltSizeInBits >= 32 && MayFoldLoad(Op0.getOperand(0)))) &&
49789         Op0.getOperand(0).getValueType() == VT.getScalarType())
49790       return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
49791 
49792     // concat_vectors(extract_subvector(broadcast(x)),
49793     //                extract_subvector(broadcast(x))) -> broadcast(x)
49794     if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
49795         Op0.getOperand(0).getValueType() == VT) {
49796       if (Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST ||
49797           Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST_LOAD)
49798         return Op0.getOperand(0);
49799     }
49800   }
49801 
49802   // concat(extract_subvector(v0,c0), extract_subvector(v1,c1)) -> vperm2x128.
49803   // Only handle concatenation of the subvectors' high halves, which is what
49804   // vperm2x128 does best.
49804   // TODO: This should go in combineX86ShufflesRecursively eventually.
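        // For example (illustrative), with 256-bit sources:
        //   concat(extract_subvector(v0, NumElts0/2), extract_subvector(v1, NumElts1/2))
        //   --> (vperm2x128 v0, v1, 0x31)
        // where immediate 0x31 selects the high 128-bit lane of each source.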
49805   if (VT.is256BitVector() && Ops.size() == 2) {
49806     SDValue Src0 = peekThroughBitcasts(Ops[0]);
49807     SDValue Src1 = peekThroughBitcasts(Ops[1]);
49808     if (Src0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
49809         Src1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
49810       EVT SrcVT0 = Src0.getOperand(0).getValueType();
49811       EVT SrcVT1 = Src1.getOperand(0).getValueType();
49812       unsigned NumSrcElts0 = SrcVT0.getVectorNumElements();
49813       unsigned NumSrcElts1 = SrcVT1.getVectorNumElements();
49814       if (SrcVT0.is256BitVector() && SrcVT1.is256BitVector() &&
49815           Src0.getConstantOperandAPInt(1) == (NumSrcElts0 / 2) &&
49816           Src1.getConstantOperandAPInt(1) == (NumSrcElts1 / 2)) {
49817         return DAG.getNode(X86ISD::VPERM2X128, DL, VT,
49818                            DAG.getBitcast(VT, Src0.getOperand(0)),
49819                            DAG.getBitcast(VT, Src1.getOperand(0)),
49820                            DAG.getTargetConstant(0x31, DL, MVT::i8));
49821       }
49822     }
49823   }
49824 
49825   // Repeated opcode.
49826   // TODO - combineX86ShufflesRecursively should handle shuffle concatenation
49827   // but it currently struggles with different vector widths.
49828   if (llvm::all_of(Ops, [Op0](SDValue Op) {
49829         return Op.getOpcode() == Op0.getOpcode();
49830       })) {
49831     auto ConcatSubOperand = [&](MVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
49832       SmallVector<SDValue> Subs;
49833       for (SDValue SubOp : SubOps)
49834         Subs.push_back(SubOp.getOperand(I));
49835       return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
49836     };
49837 
49838     unsigned NumOps = Ops.size();
49839     switch (Op0.getOpcode()) {
49840     case X86ISD::SHUFP: {
49841       // Add SHUFPD support if/when necessary.
49842       if (!IsSplat && VT.getScalarType() == MVT::f32 &&
49843           llvm::all_of(Ops, [Op0](SDValue Op) {
49844             return Op.getOperand(2) == Op0.getOperand(2);
49845           })) {
49846         return DAG.getNode(Op0.getOpcode(), DL, VT,
49847                            ConcatSubOperand(VT, Ops, 0),
49848                            ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
49849       }
49850       break;
49851     }
49852     case X86ISD::PSHUFHW:
49853     case X86ISD::PSHUFLW:
49854     case X86ISD::PSHUFD:
49855       if (!IsSplat && NumOps == 2 && VT.is256BitVector() &&
49856           Subtarget.hasInt256() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
49857         return DAG.getNode(Op0.getOpcode(), DL, VT,
49858                            ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
49859       }
49860       LLVM_FALLTHROUGH;
49861     case X86ISD::VPERMILPI:
49862       // TODO - add support for vXf64/vXi64 shuffles.
49863       if (!IsSplat && NumOps == 2 && (VT == MVT::v8f32 || VT == MVT::v8i32) &&
49864           Subtarget.hasAVX() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
49865         SDValue Res = DAG.getBitcast(MVT::v8f32, ConcatSubOperand(VT, Ops, 0));
49866         Res = DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, Res,
49867                           Op0.getOperand(1));
49868         return DAG.getBitcast(VT, Res);
49869       }
49870       break;
49871     case X86ISD::VPERMV3:
49872       if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
49873         MVT OpVT = Op0.getSimpleValueType();
49874         int NumSrcElts = OpVT.getVectorNumElements();
49875         SmallVector<int, 64> ConcatMask;
49876         for (unsigned i = 0; i != NumOps; ++i) {
49877           SmallVector<int, 64> SubMask;
49878           SmallVector<SDValue, 2> SubOps;
49879           if (!getTargetShuffleMask(Ops[i].getNode(), OpVT, false, SubOps,
49880                                     SubMask))
49881             break;
49882           for (int M : SubMask) {
49883             if (0 <= M) {
49884               M += M < NumSrcElts ? 0 : NumSrcElts;
49885               M += i * NumSrcElts;
49886             }
49887             ConcatMask.push_back(M);
49888           }
49889         }
49890         if (ConcatMask.size() == (NumOps * NumSrcElts)) {
49891           SDValue Src0 = concatSubVectors(Ops[0].getOperand(0),
49892                                           Ops[1].getOperand(0), DAG, DL);
49893           SDValue Src1 = concatSubVectors(Ops[0].getOperand(2),
49894                                           Ops[1].getOperand(2), DAG, DL);
49895           MVT IntMaskSVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
49896           MVT IntMaskVT = MVT::getVectorVT(IntMaskSVT, NumOps * NumSrcElts);
49897           SDValue Mask = getConstVector(ConcatMask, IntMaskVT, DAG, DL, true);
49898           return DAG.getNode(X86ISD::VPERMV3, DL, VT, Src0, Mask, Src1);
49899         }
49900       }
49901       break;
49902     case X86ISD::VSHLI:
49903     case X86ISD::VSRLI:
49904       // Special case: an AVX1 v4i64 SHL/SRL by 32 bits can lower as a shuffle.
49905       // TODO: Move this to LowerScalarImmediateShift?
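            // For example (illustrative), a v4i64 shift left by 32 turns each
            // 64-bit element (hi:lo) into (lo:0); viewed as v8i32 this is just the
            // even i32 lanes interleaved with zero, which the shuffles below
            // produce (and similarly for the logical right shift).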
49906       if (VT == MVT::v4i64 && !Subtarget.hasInt256() &&
49907           llvm::all_of(Ops, [](SDValue Op) {
49908             return Op.getConstantOperandAPInt(1) == 32;
49909           })) {
49910         SDValue Res = DAG.getBitcast(MVT::v8i32, ConcatSubOperand(VT, Ops, 0));
49911         SDValue Zero = getZeroVector(MVT::v8i32, Subtarget, DAG, DL);
49912         if (Op0.getOpcode() == X86ISD::VSHLI) {
49913           Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
49914                                      {8, 0, 8, 2, 8, 4, 8, 6});
49915         } else {
49916           Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
49917                                      {1, 8, 3, 8, 5, 8, 7, 8});
49918         }
49919         return DAG.getBitcast(VT, Res);
49920       }
49921       LLVM_FALLTHROUGH;
49922     case X86ISD::VSRAI:
49923       if (((VT.is256BitVector() && Subtarget.hasInt256()) ||
49924            (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
49925             (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
49926           llvm::all_of(Ops, [Op0](SDValue Op) {
49927             return Op0.getOperand(1) == Op.getOperand(1);
49928           })) {
49929         return DAG.getNode(Op0.getOpcode(), DL, VT,
49930                            ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
49931       }
49932       break;
49933     case X86ISD::VPERMI:
49934     case X86ISD::VROTLI:
49935     case X86ISD::VROTRI:
49936       if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
49937           llvm::all_of(Ops, [Op0](SDValue Op) {
49938             return Op0.getOperand(1) == Op.getOperand(1);
49939           })) {
49940         return DAG.getNode(Op0.getOpcode(), DL, VT,
49941                            ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
49942       }
49943       break;
49944     case ISD::AND:
49945     case ISD::OR:
49946     case ISD::XOR:
49947     case X86ISD::ANDNP:
49948       // TODO: Add 256-bit support.
49949       if (!IsSplat && VT.is512BitVector()) {
49950         MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
49951         SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
49952                                  NumOps * SrcVT.getVectorNumElements());
49953         return DAG.getNode(Op0.getOpcode(), DL, VT,
49954                            ConcatSubOperand(SrcVT, Ops, 0),
49955                            ConcatSubOperand(SrcVT, Ops, 1));
49956       }
49957       break;
49958     case X86ISD::HADD:
49959     case X86ISD::HSUB:
49960     case X86ISD::FHADD:
49961     case X86ISD::FHSUB:
49962     case X86ISD::PACKSS:
49963     case X86ISD::PACKUS:
49964       if (!IsSplat && VT.is256BitVector() &&
49965           (VT.isFloatingPoint() || Subtarget.hasInt256())) {
49966         MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
49967         SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
49968                                  NumOps * SrcVT.getVectorNumElements());
49969         return DAG.getNode(Op0.getOpcode(), DL, VT,
49970                            ConcatSubOperand(SrcVT, Ops, 0),
49971                            ConcatSubOperand(SrcVT, Ops, 1));
49972       }
49973       break;
49974     case X86ISD::PALIGNR:
49975       if (!IsSplat &&
49976           ((VT.is256BitVector() && Subtarget.hasInt256()) ||
49977            (VT.is512BitVector() && Subtarget.useBWIRegs())) &&
49978           llvm::all_of(Ops, [Op0](SDValue Op) {
49979             return Op0.getOperand(2) == Op.getOperand(2);
49980           })) {
49981         return DAG.getNode(Op0.getOpcode(), DL, VT,
49982                            ConcatSubOperand(VT, Ops, 0),
49983                            ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
49984       }
49985       break;
49986     }
49987   }
49988 
49989   // Fold subvector loads into one.
49990   // If needed, look through bitcasts to get to the load.
49991   if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) {
49992     bool Fast;
49993     const X86TargetLowering *TLI = Subtarget.getTargetLowering();
49994     if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
49995                                 *FirstLd->getMemOperand(), &Fast) &&
49996         Fast) {
49997       if (SDValue Ld =
49998               EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
49999         return Ld;
50000     }
50001   }
50002 
50003   return SDValue();
50004 }
50005 
50006 static SDValue combineConcatVectors(SDNode *N, SelectionDAG &DAG,
50007                                     TargetLowering::DAGCombinerInfo &DCI,
50008                                     const X86Subtarget &Subtarget) {
50009   EVT VT = N->getValueType(0);
50010   EVT SrcVT = N->getOperand(0).getValueType();
50011   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50012 
50013   // Don't do anything for i1 vectors.
50014   if (VT.getVectorElementType() == MVT::i1)
50015     return SDValue();
50016 
50017   if (Subtarget.hasAVX() && TLI.isTypeLegal(VT) && TLI.isTypeLegal(SrcVT)) {
50018     SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());
50019     if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops, DAG,
50020                                            DCI, Subtarget))
50021       return R;
50022   }
50023 
50024   return SDValue();
50025 }
50026 
50027 static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
50028                                       TargetLowering::DAGCombinerInfo &DCI,
50029                                       const X86Subtarget &Subtarget) {
50030   if (DCI.isBeforeLegalizeOps())
50031     return SDValue();
50032 
50033   MVT OpVT = N->getSimpleValueType(0);
50034 
50035   bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;
50036 
50037   SDLoc dl(N);
50038   SDValue Vec = N->getOperand(0);
50039   SDValue SubVec = N->getOperand(1);
50040 
50041   uint64_t IdxVal = N->getConstantOperandVal(2);
50042   MVT SubVecVT = SubVec.getSimpleValueType();
50043 
50044   if (Vec.isUndef() && SubVec.isUndef())
50045     return DAG.getUNDEF(OpVT);
50046 
50047   // Inserting undefs/zeros into zeros/undefs is a zero vector.
50048   if ((Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())) &&
50049       (SubVec.isUndef() || ISD::isBuildVectorAllZeros(SubVec.getNode())))
50050     return getZeroVector(OpVT, Subtarget, DAG, dl);
50051 
50052   if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
50053     // If we're inserting into a zero vector and then into a larger zero vector,
50054     // just insert into the larger zero vector directly.
50055     if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
50056         ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
50057       uint64_t Idx2Val = SubVec.getConstantOperandVal(2);
50058       return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
50059                          getZeroVector(OpVT, Subtarget, DAG, dl),
50060                          SubVec.getOperand(1),
50061                          DAG.getIntPtrConstant(IdxVal + Idx2Val, dl));
50062     }
50063 
50064     // If we're inserting into a zero vector and our input was extracted from an
50065     // insert into a zero vector of the same type, and the extraction was at
50066     // least as large as the original insertion, just insert the original
50067     // subvector into a zero vector.
50068     if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
50069         isNullConstant(SubVec.getOperand(1)) &&
50070         SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) {
50071       SDValue Ins = SubVec.getOperand(0);
50072       if (isNullConstant(Ins.getOperand(2)) &&
50073           ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
50074           Ins.getOperand(1).getValueSizeInBits().getFixedSize() <=
50075               SubVecVT.getFixedSizeInBits())
50076         return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
50077                            getZeroVector(OpVT, Subtarget, DAG, dl),
50078                            Ins.getOperand(1), N->getOperand(2));
50079     }
50080   }
50081 
50082   // Stop here if this is an i1 vector.
50083   if (IsI1Vector)
50084     return SDValue();
50085 
50086   // If this is an insert of an extract, combine to a shuffle. Don't do this
50087   // if the insert or extract can be represented with a subregister operation.
50088   if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
50089       SubVec.getOperand(0).getSimpleValueType() == OpVT &&
50090       (IdxVal != 0 ||
50091        !(Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())))) {
50092     int ExtIdxVal = SubVec.getConstantOperandVal(1);
50093     if (ExtIdxVal != 0) {
50094       int VecNumElts = OpVT.getVectorNumElements();
50095       int SubVecNumElts = SubVecVT.getVectorNumElements();
50096       SmallVector<int, 64> Mask(VecNumElts);
50097       // First create an identity shuffle mask.
50098       for (int i = 0; i != VecNumElts; ++i)
50099         Mask[i] = i;
50100       // Now insert the extracted portion.
50101       for (int i = 0; i != SubVecNumElts; ++i)
50102         Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
50103 
50104       return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
50105     }
50106   }
50107 
50108   // Match concat_vector style patterns.
50109   SmallVector<SDValue, 2> SubVectorOps;
50110   if (collectConcatOps(N, SubVectorOps)) {
50111     if (SDValue Fold =
50112             combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, DCI, Subtarget))
50113       return Fold;
50114 
50115     // If we're inserting all zeros into the upper half, change this to
50116     // a concat with zero. We will match this to a move
50117     // with implicit upper bit zeroing during isel.
50118     // We do this here because we don't want combineConcatVectorOps to
50119     // create INSERT_SUBVECTOR from CONCAT_VECTORS.
50120     if (SubVectorOps.size() == 2 &&
50121         ISD::isBuildVectorAllZeros(SubVectorOps[1].getNode()))
50122       return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
50123                          getZeroVector(OpVT, Subtarget, DAG, dl),
50124                          SubVectorOps[0], DAG.getIntPtrConstant(0, dl));
50125   }
50126 
50127   // If this is a broadcast insert into an upper undef, use a larger broadcast.
50128   if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST)
50129     return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0));
50130 
50131   // If this is a broadcast load inserted into an upper undef, use a larger
50132   // broadcast load.
50133   if (Vec.isUndef() && IdxVal != 0 && SubVec.hasOneUse() &&
50134       SubVec.getOpcode() == X86ISD::VBROADCAST_LOAD) {
50135     auto *MemIntr = cast<MemIntrinsicSDNode>(SubVec);
50136     SDVTList Tys = DAG.getVTList(OpVT, MVT::Other);
50137     SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() };
50138     SDValue BcastLd =
50139         DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
50140                                 MemIntr->getMemoryVT(),
50141                                 MemIntr->getMemOperand());
50142     DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
50143     return BcastLd;
50144   }
50145 
50146   return SDValue();
50147 }
50148 
50149 /// If we are extracting a subvector of a vector select and the select condition
50150 /// is composed of concatenated vectors, try to narrow the select width. This
50151 /// is a common pattern for AVX1 integer code because 256-bit selects may be
50152 /// legal, but there is almost no integer math/logic available for 256-bit.
50153 /// This function should only be called with legal types (otherwise, the calls
50154 /// to get simple value types will assert).
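/// For example (an illustrative sketch; the concrete types here are assumptions):
///   t0: v8i32 = vselect (v8i32 concat_vectors C0, C1), T, F
///   t1: v2i64 = extract_subvector (v4i64 bitcast t0), 2
/// becomes
///   s0: v4i32 = vselect C1, (extract_subvector T, 4), (extract_subvector F, 4)
///   t1: v2i64 = bitcast s0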
50155 static SDValue narrowExtractedVectorSelect(SDNode *Ext, SelectionDAG &DAG) {
50156   SDValue Sel = peekThroughBitcasts(Ext->getOperand(0));
50157   SmallVector<SDValue, 4> CatOps;
50158   if (Sel.getOpcode() != ISD::VSELECT ||
50159       !collectConcatOps(Sel.getOperand(0).getNode(), CatOps))
50160     return SDValue();
50161 
50162   // Note: We assume simple value types because this should only be called with
50163   //       legal operations/types.
50164   // TODO: This can be extended to handle extraction to 256-bits.
50165   MVT VT = Ext->getSimpleValueType(0);
50166   if (!VT.is128BitVector())
50167     return SDValue();
50168 
50169   MVT SelCondVT = Sel.getOperand(0).getSimpleValueType();
50170   if (!SelCondVT.is256BitVector() && !SelCondVT.is512BitVector())
50171     return SDValue();
50172 
50173   MVT WideVT = Ext->getOperand(0).getSimpleValueType();
50174   MVT SelVT = Sel.getSimpleValueType();
50175   assert((SelVT.is256BitVector() || SelVT.is512BitVector()) &&
50176          "Unexpected vector type with legal operations");
50177 
50178   unsigned SelElts = SelVT.getVectorNumElements();
50179   unsigned CastedElts = WideVT.getVectorNumElements();
50180   unsigned ExtIdx = Ext->getConstantOperandVal(1);
50181   if (SelElts % CastedElts == 0) {
50182     // The select has the same or more (narrower) elements than the extract
50183     // operand. The extraction index gets scaled by that factor.
50184     ExtIdx *= (SelElts / CastedElts);
50185   } else if (CastedElts % SelElts == 0) {
50186     // The select has fewer (wider) elements than the extract operand. Make
50187     // sure that the extraction index divides evenly.
50188     unsigned IndexDivisor = CastedElts / SelElts;
50189     if (ExtIdx % IndexDivisor != 0)
50190       return SDValue();
50191     ExtIdx /= IndexDivisor;
50192   } else {
50193     llvm_unreachable("Element counts of simple vector types are not divisible?");
50194   }
50195 
50196   unsigned NarrowingFactor = WideVT.getSizeInBits() / VT.getSizeInBits();
50197   unsigned NarrowElts = SelElts / NarrowingFactor;
50198   MVT NarrowSelVT = MVT::getVectorVT(SelVT.getVectorElementType(), NarrowElts);
50199   SDLoc DL(Ext);
50200   SDValue ExtCond = extract128BitVector(Sel.getOperand(0), ExtIdx, DAG, DL);
50201   SDValue ExtT = extract128BitVector(Sel.getOperand(1), ExtIdx, DAG, DL);
50202   SDValue ExtF = extract128BitVector(Sel.getOperand(2), ExtIdx, DAG, DL);
50203   SDValue NarrowSel = DAG.getSelect(DL, NarrowSelVT, ExtCond, ExtT, ExtF);
50204   return DAG.getBitcast(VT, NarrowSel);
50205 }
50206 
50207 static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
50208                                        TargetLowering::DAGCombinerInfo &DCI,
50209                                        const X86Subtarget &Subtarget) {
50210   // For AVX1 only, if we are extracting from a 256-bit and+not (which will
50211   // eventually get combined/lowered into ANDNP) with a concatenated operand,
50212   // split the 'and' into 128-bit ops to avoid the concatenate and extract.
50213   // We let generic combining take over from there to simplify the
50214   // insert/extract and 'not'.
50215   // This pattern emerges during AVX1 legalization. We handle it before lowering
50216   // to avoid complications like splitting constant vector loads.
50217 
50218   // Capture the original wide type in the likely case that we need to bitcast
50219   // back to this type.
50220   if (!N->getValueType(0).isSimple())
50221     return SDValue();
50222 
50223   MVT VT = N->getSimpleValueType(0);
50224   SDValue InVec = N->getOperand(0);
50225   unsigned IdxVal = N->getConstantOperandVal(1);
50226   SDValue InVecBC = peekThroughBitcasts(InVec);
50227   EVT InVecVT = InVec.getValueType();
50228   unsigned SizeInBits = VT.getSizeInBits();
50229   unsigned InSizeInBits = InVecVT.getSizeInBits();
50230   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50231 
50232   if (Subtarget.hasAVX() && !Subtarget.hasAVX2() &&
50233       TLI.isTypeLegal(InVecVT) &&
50234       InSizeInBits == 256 && InVecBC.getOpcode() == ISD::AND) {
50235     auto isConcatenatedNot = [](SDValue V) {
50236       V = peekThroughBitcasts(V);
50237       if (!isBitwiseNot(V))
50238         return false;
50239       SDValue NotOp = V->getOperand(0);
50240       return peekThroughBitcasts(NotOp).getOpcode() == ISD::CONCAT_VECTORS;
50241     };
50242     if (isConcatenatedNot(InVecBC.getOperand(0)) ||
50243         isConcatenatedNot(InVecBC.getOperand(1))) {
50244       // extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
50245       SDValue Concat = splitVectorIntBinary(InVecBC, DAG);
50246       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT,
50247                          DAG.getBitcast(InVecVT, Concat), N->getOperand(1));
50248     }
50249   }
50250 
50251   if (DCI.isBeforeLegalizeOps())
50252     return SDValue();
50253 
50254   if (SDValue V = narrowExtractedVectorSelect(N, DAG))
50255     return V;
50256 
50257   if (ISD::isBuildVectorAllZeros(InVec.getNode()))
50258     return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
50259 
50260   if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
50261     if (VT.getScalarType() == MVT::i1)
50262       return DAG.getConstant(1, SDLoc(N), VT);
50263     return getOnesVector(VT, DAG, SDLoc(N));
50264   }
50265 
50266   if (InVec.getOpcode() == ISD::BUILD_VECTOR)
50267     return DAG.getBuildVector(
50268         VT, SDLoc(N),
50269         InVec.getNode()->ops().slice(IdxVal, VT.getVectorNumElements()));
50270 
50271   // If we are extracting from an insert into a zero vector, replace with a
50272   // smaller insert into zero, as long as the extract covers at least the
50273   // originally inserted subvector. Don't do this for i1 vectors.
50274   if (VT.getVectorElementType() != MVT::i1 &&
50275       InVec.getOpcode() == ISD::INSERT_SUBVECTOR && IdxVal == 0 &&
50276       InVec.hasOneUse() && isNullConstant(InVec.getOperand(2)) &&
50277       ISD::isBuildVectorAllZeros(InVec.getOperand(0).getNode()) &&
50278       InVec.getOperand(1).getValueSizeInBits() <= SizeInBits) {
50279     SDLoc DL(N);
50280     return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
50281                        getZeroVector(VT, Subtarget, DAG, DL),
50282                        InVec.getOperand(1), InVec.getOperand(2));
50283   }
50284 
50285   // If we're extracting an upper subvector from a broadcast, just extract the
50286   // lowest subvector instead, which should allow SimplifyDemandedVectorElts
50287   // to do more simplifications.
50288   if (IdxVal != 0 && (InVec.getOpcode() == X86ISD::VBROADCAST ||
50289                       InVec.getOpcode() == X86ISD::VBROADCAST_LOAD ||
50290                       DAG.isSplatValue(InVec, /*AllowUndefs*/ false)))
50291     return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits);
50292 
50293   // If we're extracting a broadcasted subvector, just use the lowest subvector.
50294   if (IdxVal != 0 && InVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
50295       cast<MemIntrinsicSDNode>(InVec)->getMemoryVT() == VT)
50296     return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits);
50297 
50298   // Attempt to extract from the source of a shuffle vector.
50299   if ((InSizeInBits % SizeInBits) == 0 &&
50300       (IdxVal % VT.getVectorNumElements()) == 0) {
50301     SmallVector<int, 32> ShuffleMask;
50302     SmallVector<int, 32> ScaledMask;
50303     SmallVector<SDValue, 2> ShuffleInputs;
50304     unsigned NumSubVecs = InSizeInBits / SizeInBits;
50305     // Decode the shuffle mask and scale it so that it shuffles subvectors.
50306     if (getTargetShuffleInputs(InVecBC, ShuffleInputs, ShuffleMask, DAG) &&
50307         scaleShuffleElements(ShuffleMask, NumSubVecs, ScaledMask)) {
50308       unsigned SubVecIdx = IdxVal / VT.getVectorNumElements();
50309       if (ScaledMask[SubVecIdx] == SM_SentinelUndef)
50310         return DAG.getUNDEF(VT);
50311       if (ScaledMask[SubVecIdx] == SM_SentinelZero)
50312         return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
50313       SDValue Src = ShuffleInputs[ScaledMask[SubVecIdx] / NumSubVecs];
50314       if (Src.getValueSizeInBits() == InSizeInBits) {
50315         unsigned SrcSubVecIdx = ScaledMask[SubVecIdx] % NumSubVecs;
50316         unsigned SrcEltIdx = SrcSubVecIdx * VT.getVectorNumElements();
50317         return extractSubVector(DAG.getBitcast(InVecVT, Src), SrcEltIdx, DAG,
50318                                 SDLoc(N), SizeInBits);
50319       }
50320     }
50321   }
50322 
50323   // If we're extracting the lowest subvector and we're the only user,
50324   // we may be able to perform this with a smaller vector width.
50325   unsigned InOpcode = InVec.getOpcode();
50326   if (IdxVal == 0 && InVec.hasOneUse()) {
50327     if (VT == MVT::v2f64 && InVecVT == MVT::v4f64) {
50328       // v2f64 CVTDQ2PD(v4i32).
50329       if (InOpcode == ISD::SINT_TO_FP &&
50330           InVec.getOperand(0).getValueType() == MVT::v4i32) {
50331         return DAG.getNode(X86ISD::CVTSI2P, SDLoc(N), VT, InVec.getOperand(0));
50332       }
50333       // v2f64 CVTUDQ2PD(v4i32).
50334       if (InOpcode == ISD::UINT_TO_FP && Subtarget.hasVLX() &&
50335           InVec.getOperand(0).getValueType() == MVT::v4i32) {
50336         return DAG.getNode(X86ISD::CVTUI2P, SDLoc(N), VT, InVec.getOperand(0));
50337       }
50338       // v2f64 CVTPS2PD(v4f32).
50339       if (InOpcode == ISD::FP_EXTEND &&
50340           InVec.getOperand(0).getValueType() == MVT::v4f32) {
50341         return DAG.getNode(X86ISD::VFPEXT, SDLoc(N), VT, InVec.getOperand(0));
50342       }
50343     }
50344     if ((InOpcode == ISD::ANY_EXTEND ||
50345          InOpcode == ISD::ANY_EXTEND_VECTOR_INREG ||
50346          InOpcode == ISD::ZERO_EXTEND ||
50347          InOpcode == ISD::ZERO_EXTEND_VECTOR_INREG ||
50348          InOpcode == ISD::SIGN_EXTEND ||
50349          InOpcode == ISD::SIGN_EXTEND_VECTOR_INREG) &&
50350         (SizeInBits == 128 || SizeInBits == 256) &&
50351         InVec.getOperand(0).getValueSizeInBits() >= SizeInBits) {
50352       SDLoc DL(N);
50353       SDValue Ext = InVec.getOperand(0);
50354       if (Ext.getValueSizeInBits() > SizeInBits)
50355         Ext = extractSubVector(Ext, 0, DAG, DL, SizeInBits);
50356       unsigned ExtOp = getOpcode_EXTEND_VECTOR_INREG(InOpcode);
50357       return DAG.getNode(ExtOp, DL, VT, Ext);
50358     }
50359     if (InOpcode == ISD::VSELECT &&
50360         InVec.getOperand(0).getValueType().is256BitVector() &&
50361         InVec.getOperand(1).getValueType().is256BitVector() &&
50362         InVec.getOperand(2).getValueType().is256BitVector()) {
50363       SDLoc DL(N);
50364       SDValue Ext0 = extractSubVector(InVec.getOperand(0), 0, DAG, DL, 128);
50365       SDValue Ext1 = extractSubVector(InVec.getOperand(1), 0, DAG, DL, 128);
50366       SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128);
50367       return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2);
50368     }
50369     if (InOpcode == ISD::TRUNCATE && Subtarget.hasVLX() &&
50370         (VT.is128BitVector() || VT.is256BitVector())) {
50371       SDLoc DL(N);
50372       SDValue InVecSrc = InVec.getOperand(0);
50373       unsigned Scale = InVecSrc.getValueSizeInBits() / InSizeInBits;
50374       SDValue Ext = extractSubVector(InVecSrc, 0, DAG, DL, Scale * SizeInBits);
50375       return DAG.getNode(InOpcode, DL, VT, Ext);
50376     }
50377   }
50378 
50379   // Always split vXi64 logical shifts where we're extracting the upper 32 bits,
50380   // as this is very likely to fold into a shuffle/truncation.
50381   if ((InOpcode == X86ISD::VSHLI || InOpcode == X86ISD::VSRLI) &&
50382       InVecVT.getScalarSizeInBits() == 64 &&
50383       InVec.getConstantOperandAPInt(1) == 32) {
50384     SDLoc DL(N);
50385     SDValue Ext =
50386         extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
50387     return DAG.getNode(InOpcode, DL, VT, Ext, InVec.getOperand(1));
50388   }
50389 
50390   return SDValue();
50391 }
50392 
50393 static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
50394   EVT VT = N->getValueType(0);
50395   SDValue Src = N->getOperand(0);
50396   SDLoc DL(N);
50397 
50398   // If this is a scalar to vector to v1i1 from an AND with 1, bypass the and.
50399   // This occurs frequently in our masked scalar intrinsic code and our
50400   // floating point select lowering with AVX512.
50401   // TODO: SimplifyDemandedBits instead?
50402   if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse())
50403     if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
50404       if (C->getAPIntValue().isOneValue())
50405         return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1,
50406                            Src.getOperand(0));
50407 
50408   // Combine scalar_to_vector of an extract_vector_elt into an extract_subvec.
50409   if (VT == MVT::v1i1 && Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
50410       Src.hasOneUse() && Src.getOperand(0).getValueType().isVector() &&
50411       Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1)
50412     if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
50413       if (C->isNullValue())
50414         return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src.getOperand(0),
50415                            Src.getOperand(1));
50416 
50417   // Reduce v2i64 to v4i32 if we don't need the upper bits.
50418   // TODO: Move to DAGCombine/SimplifyDemandedBits?
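  // Illustrative example (assuming a one-use i64 any_extend of an i32 value x):
  //   (v2i64 scalar_to_vector (i64 any_extend x))
  //     --> (v2i64 bitcast (v4i32 scalar_to_vector x))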
50419   if (VT == MVT::v2i64 || VT == MVT::v2f64) {
50420     auto IsAnyExt64 = [](SDValue Op) {
50421       if (Op.getValueType() != MVT::i64 || !Op.hasOneUse())
50422         return SDValue();
50423       if (Op.getOpcode() == ISD::ANY_EXTEND &&
50424           Op.getOperand(0).getScalarValueSizeInBits() <= 32)
50425         return Op.getOperand(0);
50426       if (auto *Ld = dyn_cast<LoadSDNode>(Op))
50427         if (Ld->getExtensionType() == ISD::EXTLOAD &&
50428             Ld->getMemoryVT().getScalarSizeInBits() <= 32)
50429           return Op;
50430       return SDValue();
50431     };
50432     if (SDValue ExtSrc = IsAnyExt64(peekThroughOneUseBitcasts(Src)))
50433       return DAG.getBitcast(
50434           VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
50435                           DAG.getAnyExtOrTrunc(ExtSrc, DL, MVT::i32)));
50436   }
50437 
50438   // Combine (v2i64 (scalar_to_vector (i64 (bitconvert (mmx))))) to MOVQ2DQ.
50439   if (VT == MVT::v2i64 && Src.getOpcode() == ISD::BITCAST &&
50440       Src.getOperand(0).getValueType() == MVT::x86mmx)
50441     return DAG.getNode(X86ISD::MOVQ2DQ, DL, VT, Src.getOperand(0));
50442 
50443   // See if we're broadcasting the scalar value, in which case just reuse that.
50444   // Ensure the same SDValue from the SDNode use is being used.
50445   for (SDNode *User : Src->uses())
50446     if (User->getOpcode() == X86ISD::VBROADCAST && Src == User->getOperand(0)) {
50447       unsigned SizeInBits = VT.getFixedSizeInBits();
50448       unsigned BroadcastSizeInBits = User->getValueSizeInBits(0).getFixedSize();
50449       if (BroadcastSizeInBits == SizeInBits)
50450         return SDValue(User, 0);
50451       if (BroadcastSizeInBits > SizeInBits)
50452         return extractSubVector(SDValue(User, 0), 0, DAG, DL, SizeInBits);
50453       // TODO: Handle BroadcastSizeInBits < SizeInBits when we have test coverage.
50454     }
50455 
50456   return SDValue();
50457 }
50458 
50459 // Simplify PMULDQ and PMULUDQ operations.
50460 static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
50461                              TargetLowering::DAGCombinerInfo &DCI,
50462                              const X86Subtarget &Subtarget) {
50463   SDValue LHS = N->getOperand(0);
50464   SDValue RHS = N->getOperand(1);
50465 
50466   // Canonicalize constant to RHS.
50467   if (DAG.isConstantIntBuildVectorOrConstantInt(LHS) &&
50468       !DAG.isConstantIntBuildVectorOrConstantInt(RHS))
50469     return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS);
50470 
50471   // Multiply by zero.
50472   // Don't return RHS as it may contain UNDEFs.
50473   if (ISD::isBuildVectorAllZeros(RHS.getNode()))
50474     return DAG.getConstant(0, SDLoc(N), N->getValueType(0));
50475 
50476   // PMULDQ/PMULUDQ only use the lower 32 bits of each vector element.
50477   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50478   if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnesValue(64), DCI))
50479     return SDValue(N, 0);
50480 
50481   // If the input is an extend_invec and the SimplifyDemandedBits call didn't
50482   // convert it to any_extend_invec, due to the LegalOperations check, do the
50483   // conversion directly to a vector shuffle manually. This exposes combine
50484   // opportunities missed by combineEXTEND_VECTOR_INREG not calling
50485   // combineX86ShufflesRecursively on SSE4.1 targets.
50486   // FIXME: This is basically a hack around several other issues related to
50487   // ANY_EXTEND_VECTOR_INREG.
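  // Illustrative example for the LHS (a v4i32 source X assumed):
  //   (pmuludq (v2i64 zero_extend_vector_inreg X), RHS)
  //     --> (pmuludq (v2i64 bitcast (vector_shuffle X, X, <0,-1,1,-1>)), RHS)
  // The upper 32 bits of each i64 lane become undef, which is fine because
  // PMULDQ/PMULUDQ only read the low 32 bits anyway.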
50488   if (N->getValueType(0) == MVT::v2i64 && LHS.hasOneUse() &&
50489       (LHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
50490        LHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
50491       LHS.getOperand(0).getValueType() == MVT::v4i32) {
50492     SDLoc dl(N);
50493     LHS = DAG.getVectorShuffle(MVT::v4i32, dl, LHS.getOperand(0),
50494                                LHS.getOperand(0), { 0, -1, 1, -1 });
50495     LHS = DAG.getBitcast(MVT::v2i64, LHS);
50496     return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
50497   }
50498   if (N->getValueType(0) == MVT::v2i64 && RHS.hasOneUse() &&
50499       (RHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
50500        RHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
50501       RHS.getOperand(0).getValueType() == MVT::v4i32) {
50502     SDLoc dl(N);
50503     RHS = DAG.getVectorShuffle(MVT::v4i32, dl, RHS.getOperand(0),
50504                                RHS.getOperand(0), { 0, -1, 1, -1 });
50505     RHS = DAG.getBitcast(MVT::v2i64, RHS);
50506     return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
50507   }
50508 
50509   return SDValue();
50510 }
50511 
50512 static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG,
50513                                           TargetLowering::DAGCombinerInfo &DCI,
50514                                           const X86Subtarget &Subtarget) {
50515   EVT VT = N->getValueType(0);
50516   SDValue In = N->getOperand(0);
50517   unsigned Opcode = N->getOpcode();
50518   unsigned InOpcode = In.getOpcode();
50519   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50520 
50521   // Try to merge vector loads and extend_inreg to an extload.
50522   if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) &&
50523       In.hasOneUse()) {
50524     auto *Ld = cast<LoadSDNode>(In);
50525     if (Ld->isSimple()) {
50526       MVT SVT = In.getSimpleValueType().getVectorElementType();
50527       ISD::LoadExtType Ext = Opcode == ISD::SIGN_EXTEND_VECTOR_INREG
50528                                  ? ISD::SEXTLOAD
50529                                  : ISD::ZEXTLOAD;
50530       EVT MemVT = VT.changeVectorElementType(SVT);
50531       if (TLI.isLoadExtLegal(Ext, VT, MemVT)) {
50532         SDValue Load =
50533             DAG.getExtLoad(Ext, SDLoc(N), VT, Ld->getChain(), Ld->getBasePtr(),
50534                            Ld->getPointerInfo(), MemVT, Ld->getOriginalAlign(),
50535                            Ld->getMemOperand()->getFlags());
50536         DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
50537         return Load;
50538       }
50539     }
50540   }
50541 
50542   // Fold EXTEND_VECTOR_INREG(EXTEND_VECTOR_INREG(X)) -> EXTEND_VECTOR_INREG(X).
50543   if (Opcode == InOpcode)
50544     return DAG.getNode(Opcode, SDLoc(N), VT, In.getOperand(0));
50545 
50546   // Fold EXTEND_VECTOR_INREG(EXTRACT_SUBVECTOR(EXTEND(X),0))
50547   // -> EXTEND_VECTOR_INREG(X).
50548   // TODO: Handle non-zero subvector indices.
50549   if (InOpcode == ISD::EXTRACT_SUBVECTOR && In.getConstantOperandVal(1) == 0 &&
50550       In.getOperand(0).getOpcode() == getOpcode_EXTEND(Opcode) &&
50551       In.getOperand(0).getOperand(0).getValueSizeInBits() ==
50552           In.getValueSizeInBits())
50553     return DAG.getNode(Opcode, SDLoc(N), VT, In.getOperand(0).getOperand(0));
50554 
50555   // Attempt to combine as a shuffle.
50556   // TODO: General ZERO_EXTEND_VECTOR_INREG support.
50557   if (Opcode == ISD::ANY_EXTEND_VECTOR_INREG ||
50558       (Opcode == ISD::ZERO_EXTEND_VECTOR_INREG && Subtarget.hasSSE41())) {
50559     SDValue Op(N, 0);
50560     if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType()))
50561       if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50562         return Res;
50563   }
50564 
50565   return SDValue();
50566 }
50567 
50568 static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
50569                              TargetLowering::DAGCombinerInfo &DCI) {
50570   EVT VT = N->getValueType(0);
50571 
50572   if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
50573     return DAG.getConstant(0, SDLoc(N), VT);
50574 
50575   APInt KnownUndef, KnownZero;
50576   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50577   APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
50578   if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
50579                                      KnownZero, DCI))
50580     return SDValue(N, 0);
50581 
50582   return SDValue();
50583 }
50584 
50585 // Optimize (fp16_to_fp (fp_to_fp16 X)) to VCVTPS2PH followed by VCVTPH2PS.
50586 // Done as a combine because lowering fp16_to_fp and fp_to_fp16 separately
50587 // produces extra instructions between the conversions (scalar and back).
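// A rough sketch of the replacement DAG built below (an f32 value X assumed):
//   (f32 fp16_to_fp (fp_to_fp16 X))
//     --> (extract_vector_elt
//            (v4f32 CVTPH2PS (v8i16 CVTPS2PH (v4f32 scalar_to_vector X), 4)), 0)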
50588 static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG,
50589                                  const X86Subtarget &Subtarget) {
50590   if (Subtarget.useSoftFloat() || !Subtarget.hasF16C())
50591     return SDValue();
50592 
50593   if (N->getOperand(0).getOpcode() != ISD::FP_TO_FP16)
50594     return SDValue();
50595 
50596   if (N->getValueType(0) != MVT::f32 ||
50597       N->getOperand(0).getOperand(0).getValueType() != MVT::f32)
50598     return SDValue();
50599 
50600   SDLoc dl(N);
50601   SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32,
50602                             N->getOperand(0).getOperand(0));
50603   Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
50604                     DAG.getTargetConstant(4, dl, MVT::i32));
50605   Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
50606   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
50607                      DAG.getIntPtrConstant(0, dl));
50608 }
50609 
50610 static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG,
50611                                 const X86Subtarget &Subtarget) {
50612   if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
50613     return SDValue();
50614 
50615   bool IsStrict = N->isStrictFPOpcode();
50616   EVT VT = N->getValueType(0);
50617   SDValue Src = N->getOperand(IsStrict ? 1 : 0);
50618   EVT SrcVT = Src.getValueType();
50619 
50620   if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::f16)
50621     return SDValue();
50622 
50623   if (VT.getVectorElementType() != MVT::f32 &&
50624       VT.getVectorElementType() != MVT::f64)
50625     return SDValue();
50626 
50627   unsigned NumElts = VT.getVectorNumElements();
50628   if (NumElts == 1 || !isPowerOf2_32(NumElts))
50629     return SDValue();
50630 
50631   SDLoc dl(N);
50632 
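  // Rough sketch of the non-strict path (a v4f16 -> v4f32 extend assumed):
  //   bitcast the source to v4i16, widen it to v8i16 with undef, emit
  //   (v4f32 CVTPH2PS ...), and fp_extend the result further if VT is vXf64.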
50633   // Convert the input to vXi16.
50634   EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
50635   Src = DAG.getBitcast(IntVT, Src);
50636 
50637   // Widen to at least 8 input elements.
50638   if (NumElts < 8) {
50639     unsigned NumConcats = 8 / NumElts;
50640     SDValue Fill = NumElts == 4 ? DAG.getUNDEF(IntVT)
50641                                 : DAG.getConstant(0, dl, IntVT);
50642     SmallVector<SDValue, 4> Ops(NumConcats, Fill);
50643     Ops[0] = Src;
50644     Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, Ops);
50645   }
50646 
50647   // Destination is vXf32 with at least 4 elements.
50648   EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32,
50649                                std::max(4U, NumElts));
50650   SDValue Cvt, Chain;
50651   if (IsStrict) {
50652     Cvt = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {CvtVT, MVT::Other},
50653                       {N->getOperand(0), Src});
50654     Chain = Cvt.getValue(1);
50655   } else {
50656     Cvt = DAG.getNode(X86ISD::CVTPH2PS, dl, CvtVT, Src);
50657   }
50658 
50659   if (NumElts < 4) {
50660     assert(NumElts == 2 && "Unexpected size");
50661     Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Cvt,
50662                       DAG.getIntPtrConstant(0, dl));
50663   }
50664 
50665   if (IsStrict) {
50666     // Extend to the original VT if necessary.
50667     if (Cvt.getValueType() != VT) {
50668       Cvt = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {VT, MVT::Other},
50669                         {Chain, Cvt});
50670       Chain = Cvt.getValue(1);
50671     }
50672     return DAG.getMergeValues({Cvt, Chain}, dl);
50673   }
50674 
50675   // Extend to the original VT if necessary.
50676   return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt);
50677 }
50678 
50679 // Try to find a larger VBROADCAST_LOAD/SUBV_BROADCAST_LOAD that we can extract
50680 // from. Limit this to cases where the loads have the same input chain and the
50681 // output chains are unused. This avoids any memory ordering issues.
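// Illustrative example (both loads assumed to read the same scalar from Ptr):
//   t0: v4i32,ch = X86ISD::VBROADCAST_LOAD Chain, Ptr   <- this node, chain unused
//   t1: v8i32,ch = X86ISD::VBROADCAST_LOAD Chain, Ptr   <- wider user of Ptr
//   --> replace t0 with (v4i32 extract_subvector t1, 0)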
50682 static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG,
50683                                      TargetLowering::DAGCombinerInfo &DCI) {
50684   assert((N->getOpcode() == X86ISD::VBROADCAST_LOAD ||
50685           N->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) &&
50686          "Unknown broadcast load type");
50687 
50688   // Only do this if the chain result is unused.
50689   if (N->hasAnyUseOfValue(1))
50690     return SDValue();
50691 
50692   auto *MemIntrin = cast<MemIntrinsicSDNode>(N);
50693 
50694   SDValue Ptr = MemIntrin->getBasePtr();
50695   SDValue Chain = MemIntrin->getChain();
50696   EVT VT = N->getSimpleValueType(0);
50697   EVT MemVT = MemIntrin->getMemoryVT();
50698 
50699   // Look at other users of our base pointer and try to find a wider broadcast.
50700   // The input chain and the size of the memory VT must match.
50701   for (SDNode *User : Ptr->uses())
50702     if (User != N && User->getOpcode() == N->getOpcode() &&
50703         cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
50704         cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
50705         cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
50706             MemVT.getSizeInBits() &&
50707         !User->hasAnyUseOfValue(1) &&
50708         User->getValueSizeInBits(0).getFixedSize() > VT.getFixedSizeInBits()) {
50709       SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
50710                                          VT.getSizeInBits());
50711       Extract = DAG.getBitcast(VT, Extract);
50712       return DCI.CombineTo(N, Extract, SDValue(User, 1));
50713     }
50714 
50715   return SDValue();
50716 }
50717 
50718 static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG,
50719                                const X86Subtarget &Subtarget) {
50720   if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
50721     return SDValue();
50722 
50723   EVT VT = N->getValueType(0);
50724   SDValue Src = N->getOperand(0);
50725   EVT SrcVT = Src.getValueType();
50726 
50727   if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 ||
50728       SrcVT.getVectorElementType() != MVT::f32)
50729     return SDValue();
50730 
50731   unsigned NumElts = VT.getVectorNumElements();
50732   if (NumElts == 1 || !isPowerOf2_32(NumElts))
50733     return SDValue();
50734 
50735   SDLoc dl(N);
50736 
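  // Rough sketch of the transform (a v2f32 -> v2f16 round assumed):
  //   widen the source to v4f32 with zeroes, emit (v8i16 CVTPS2PH src, 4),
  //   then extract the low v2i16 and bitcast it to v2f16.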
50737   // Widen to at least 4 input elements.
50738   if (NumElts < 4)
50739     Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
50740                       DAG.getConstantFP(0.0, dl, SrcVT));
50741 
50742   // Destination is vXi16 with at least 8 elements.
50743   EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
50744                                std::max(8U, NumElts));
50745   SDValue Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src,
50746                             DAG.getTargetConstant(4, dl, MVT::i32));
50747 
50748   // Extract down to real number of elements.
50749   if (NumElts < 8) {
50750     EVT IntVT = VT.changeVectorElementTypeToInteger();
50751     Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, IntVT, Cvt,
50752                       DAG.getIntPtrConstant(0, dl));
50753   }
50754 
50755   return DAG.getBitcast(VT, Cvt);
50756 }
50757 
50758 static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) {
50759   SDValue Src = N->getOperand(0);
50760 
50761   // Turn MOVDQ2Q+simple_load into an mmx load.
50762   if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
50763     LoadSDNode *LN = cast<LoadSDNode>(Src.getNode());
50764 
50765     if (LN->isSimple()) {
50766       SDValue NewLd = DAG.getLoad(MVT::x86mmx, SDLoc(N), LN->getChain(),
50767                                   LN->getBasePtr(),
50768                                   LN->getPointerInfo(),
50769                                   LN->getOriginalAlign(),
50770                                   LN->getMemOperand()->getFlags());
50771       DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), NewLd.getValue(1));
50772       return NewLd;
50773     }
50774   }
50775 
50776   return SDValue();
50777 }
50778 
50779 static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG,
50780                            TargetLowering::DAGCombinerInfo &DCI) {
50781   unsigned NumBits = N->getSimpleValueType(0).getSizeInBits();
50782   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50783   if (TLI.SimplifyDemandedBits(SDValue(N, 0),
50784                                APInt::getAllOnesValue(NumBits), DCI))
50785     return SDValue(N, 0);
50786 
50787   return SDValue();
50788 }
50789 
50790 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
50791                                              DAGCombinerInfo &DCI) const {
50792   SelectionDAG &DAG = DCI.DAG;
50793   switch (N->getOpcode()) {
50794   default: break;
50795   case ISD::SCALAR_TO_VECTOR:
50796     return combineScalarToVector(N, DAG);
50797   case ISD::EXTRACT_VECTOR_ELT:
50798   case X86ISD::PEXTRW:
50799   case X86ISD::PEXTRB:
50800     return combineExtractVectorElt(N, DAG, DCI, Subtarget);
50801   case ISD::CONCAT_VECTORS:
50802     return combineConcatVectors(N, DAG, DCI, Subtarget);
50803   case ISD::INSERT_SUBVECTOR:
50804     return combineInsertSubvector(N, DAG, DCI, Subtarget);
50805   case ISD::EXTRACT_SUBVECTOR:
50806     return combineExtractSubvector(N, DAG, DCI, Subtarget);
50807   case ISD::VSELECT:
50808   case ISD::SELECT:
50809   case X86ISD::BLENDV:      return combineSelect(N, DAG, DCI, Subtarget);
50810   case ISD::BITCAST:        return combineBitcast(N, DAG, DCI, Subtarget);
50811   case X86ISD::CMOV:        return combineCMov(N, DAG, DCI, Subtarget);
50812   case X86ISD::CMP:         return combineCMP(N, DAG);
50813   case ISD::ADD:            return combineAdd(N, DAG, DCI, Subtarget);
50814   case ISD::SUB:            return combineSub(N, DAG, DCI, Subtarget);
50815   case X86ISD::ADD:
50816   case X86ISD::SUB:         return combineX86AddSub(N, DAG, DCI);
50817   case X86ISD::SBB:         return combineSBB(N, DAG);
50818   case X86ISD::ADC:         return combineADC(N, DAG, DCI);
50819   case ISD::MUL:            return combineMul(N, DAG, DCI, Subtarget);
50820   case ISD::SHL:            return combineShiftLeft(N, DAG);
50821   case ISD::SRA:            return combineShiftRightArithmetic(N, DAG, Subtarget);
50822   case ISD::SRL:            return combineShiftRightLogical(N, DAG, DCI, Subtarget);
50823   case ISD::AND:            return combineAnd(N, DAG, DCI, Subtarget);
50824   case ISD::OR:             return combineOr(N, DAG, DCI, Subtarget);
50825   case ISD::XOR:            return combineXor(N, DAG, DCI, Subtarget);
50826   case X86ISD::BEXTR:
50827   case X86ISD::BEXTRI:      return combineBEXTR(N, DAG, DCI, Subtarget);
50828   case ISD::LOAD:           return combineLoad(N, DAG, DCI, Subtarget);
50829   case ISD::MLOAD:          return combineMaskedLoad(N, DAG, DCI, Subtarget);
50830   case ISD::STORE:          return combineStore(N, DAG, DCI, Subtarget);
50831   case ISD::MSTORE:         return combineMaskedStore(N, DAG, DCI, Subtarget);
50832   case X86ISD::VEXTRACT_STORE:
50833     return combineVEXTRACT_STORE(N, DAG, DCI, Subtarget);
50834   case ISD::SINT_TO_FP:
50835   case ISD::STRICT_SINT_TO_FP:
50836     return combineSIntToFP(N, DAG, DCI, Subtarget);
50837   case ISD::UINT_TO_FP:
50838   case ISD::STRICT_UINT_TO_FP:
50839     return combineUIntToFP(N, DAG, Subtarget);
50840   case ISD::FADD:
50841   case ISD::FSUB:           return combineFaddFsub(N, DAG, Subtarget);
50842   case ISD::FNEG:           return combineFneg(N, DAG, DCI, Subtarget);
50843   case ISD::TRUNCATE:       return combineTruncate(N, DAG, Subtarget);
50844   case X86ISD::VTRUNC:      return combineVTRUNC(N, DAG, DCI);
50845   case X86ISD::ANDNP:       return combineAndnp(N, DAG, DCI, Subtarget);
50846   case X86ISD::FAND:        return combineFAnd(N, DAG, Subtarget);
50847   case X86ISD::FANDN:       return combineFAndn(N, DAG, Subtarget);
50848   case X86ISD::FXOR:
50849   case X86ISD::FOR:         return combineFOr(N, DAG, DCI, Subtarget);
50850   case X86ISD::FMIN:
50851   case X86ISD::FMAX:        return combineFMinFMax(N, DAG);
50852   case ISD::FMINNUM:
50853   case ISD::FMAXNUM:        return combineFMinNumFMaxNum(N, DAG, Subtarget);
50854   case X86ISD::CVTSI2P:
50855   case X86ISD::CVTUI2P:     return combineX86INT_TO_FP(N, DAG, DCI);
50856   case X86ISD::CVTP2SI:
50857   case X86ISD::CVTP2UI:
50858   case X86ISD::STRICT_CVTTP2SI:
50859   case X86ISD::CVTTP2SI:
50860   case X86ISD::STRICT_CVTTP2UI:
50861   case X86ISD::CVTTP2UI:
50862                             return combineCVTP2I_CVTTP2I(N, DAG, DCI);
50863   case X86ISD::STRICT_CVTPH2PS:
50864   case X86ISD::CVTPH2PS:    return combineCVTPH2PS(N, DAG, DCI);
50865   case X86ISD::BT:          return combineBT(N, DAG, DCI);
50866   case ISD::ANY_EXTEND:
50867   case ISD::ZERO_EXTEND:    return combineZext(N, DAG, DCI, Subtarget);
50868   case ISD::SIGN_EXTEND:    return combineSext(N, DAG, DCI, Subtarget);
50869   case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
50870   case ISD::ANY_EXTEND_VECTOR_INREG:
50871   case ISD::SIGN_EXTEND_VECTOR_INREG:
50872   case ISD::ZERO_EXTEND_VECTOR_INREG:
50873     return combineEXTEND_VECTOR_INREG(N, DAG, DCI, Subtarget);
50874   case ISD::SETCC:          return combineSetCC(N, DAG, DCI, Subtarget);
50875   case X86ISD::SETCC:       return combineX86SetCC(N, DAG, Subtarget);
50876   case X86ISD::BRCOND:      return combineBrCond(N, DAG, Subtarget);
50877   case X86ISD::PACKSS:
50878   case X86ISD::PACKUS:      return combineVectorPack(N, DAG, DCI, Subtarget);
50879   case X86ISD::HADD:
50880   case X86ISD::HSUB:
50881   case X86ISD::FHADD:
50882   case X86ISD::FHSUB:       return combineVectorHADDSUB(N, DAG, DCI, Subtarget);
50883   case X86ISD::VSHL:
50884   case X86ISD::VSRA:
50885   case X86ISD::VSRL:
50886     return combineVectorShiftVar(N, DAG, DCI, Subtarget);
50887   case X86ISD::VSHLI:
50888   case X86ISD::VSRAI:
50889   case X86ISD::VSRLI:
50890     return combineVectorShiftImm(N, DAG, DCI, Subtarget);
50891   case ISD::INSERT_VECTOR_ELT:
50892   case X86ISD::PINSRB:
50893   case X86ISD::PINSRW:      return combineVectorInsert(N, DAG, DCI, Subtarget);
50894   case X86ISD::SHUFP:       // Handle all target specific shuffles
50895   case X86ISD::INSERTPS:
50896   case X86ISD::EXTRQI:
50897   case X86ISD::INSERTQI:
50898   case X86ISD::VALIGN:
50899   case X86ISD::PALIGNR:
50900   case X86ISD::VSHLDQ:
50901   case X86ISD::VSRLDQ:
50902   case X86ISD::BLENDI:
50903   case X86ISD::UNPCKH:
50904   case X86ISD::UNPCKL:
50905   case X86ISD::MOVHLPS:
50906   case X86ISD::MOVLHPS:
50907   case X86ISD::PSHUFB:
50908   case X86ISD::PSHUFD:
50909   case X86ISD::PSHUFHW:
50910   case X86ISD::PSHUFLW:
50911   case X86ISD::MOVSHDUP:
50912   case X86ISD::MOVSLDUP:
50913   case X86ISD::MOVDDUP:
50914   case X86ISD::MOVSS:
50915   case X86ISD::MOVSD:
50916   case X86ISD::VBROADCAST:
50917   case X86ISD::VPPERM:
50918   case X86ISD::VPERMI:
50919   case X86ISD::VPERMV:
50920   case X86ISD::VPERMV3:
50921   case X86ISD::VPERMIL2:
50922   case X86ISD::VPERMILPI:
50923   case X86ISD::VPERMILPV:
50924   case X86ISD::VPERM2X128:
50925   case X86ISD::SHUF128:
50926   case X86ISD::VZEXT_MOVL:
50927   case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
50928   case X86ISD::FMADD_RND:
50929   case X86ISD::FMSUB:
50930   case X86ISD::STRICT_FMSUB:
50931   case X86ISD::FMSUB_RND:
50932   case X86ISD::FNMADD:
50933   case X86ISD::STRICT_FNMADD:
50934   case X86ISD::FNMADD_RND:
50935   case X86ISD::FNMSUB:
50936   case X86ISD::STRICT_FNMSUB:
50937   case X86ISD::FNMSUB_RND:
50938   case ISD::FMA:
50939   case ISD::STRICT_FMA:     return combineFMA(N, DAG, DCI, Subtarget);
50940   case X86ISD::FMADDSUB_RND:
50941   case X86ISD::FMSUBADD_RND:
50942   case X86ISD::FMADDSUB:
50943   case X86ISD::FMSUBADD:    return combineFMADDSUB(N, DAG, DCI);
50944   case X86ISD::MOVMSK:      return combineMOVMSK(N, DAG, DCI, Subtarget);
50945   case X86ISD::MGATHER:
50946   case X86ISD::MSCATTER:    return combineX86GatherScatter(N, DAG, DCI);
50947   case ISD::MGATHER:
50948   case ISD::MSCATTER:       return combineGatherScatter(N, DAG, DCI);
50949   case X86ISD::PCMPEQ:
50950   case X86ISD::PCMPGT:      return combineVectorCompare(N, DAG, Subtarget);
50951   case X86ISD::PMULDQ:
50952   case X86ISD::PMULUDQ:     return combinePMULDQ(N, DAG, DCI, Subtarget);
50953   case X86ISD::KSHIFTL:
50954   case X86ISD::KSHIFTR:     return combineKSHIFT(N, DAG, DCI);
50955   case ISD::FP16_TO_FP:     return combineFP16_TO_FP(N, DAG, Subtarget);
50956   case ISD::STRICT_FP_EXTEND:
50957   case ISD::FP_EXTEND:      return combineFP_EXTEND(N, DAG, Subtarget);
50958   case ISD::FP_ROUND:       return combineFP_ROUND(N, DAG, Subtarget);
50959   case X86ISD::VBROADCAST_LOAD:
50960   case X86ISD::SUBV_BROADCAST_LOAD: return combineBROADCAST_LOAD(N, DAG, DCI);
50961   case X86ISD::MOVDQ2Q:     return combineMOVDQ2Q(N, DAG);
50962   case X86ISD::PDEP:        return combinePDEP(N, DAG, DCI);
50963   }
50964 
50965   return SDValue();
50966 }
50967 
50968 bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
50969   if (!isTypeLegal(VT))
50970     return false;
50971 
50972   // There are no vXi8 shifts.
50973   if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
50974     return false;
50975 
50976   // TODO: Almost no 8-bit ops are desirable because they have no actual
50977   //       size/speed advantages vs. 32-bit ops, but they do have a major
50978   //       potential disadvantage by causing partial register stalls.
50979   //
50980   // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and
50981   // we have specializations to turn 32-bit multiply/shl into LEA or other ops.
50982   // Also, see the comment in "IsDesirableToPromoteOp" - where we additionally
50983   // check for a constant operand to the multiply.
50984   if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8)
50985     return false;
50986 
50987   // i16 instruction encodings are longer and some i16 instructions are slow,
50988   // so those are not desirable.
50989   if (VT == MVT::i16) {
50990     switch (Opc) {
50991     default:
50992       break;
50993     case ISD::LOAD:
50994     case ISD::SIGN_EXTEND:
50995     case ISD::ZERO_EXTEND:
50996     case ISD::ANY_EXTEND:
50997     case ISD::SHL:
50998     case ISD::SRA:
50999     case ISD::SRL:
51000     case ISD::SUB:
51001     case ISD::ADD:
51002     case ISD::MUL:
51003     case ISD::AND:
51004     case ISD::OR:
51005     case ISD::XOR:
51006       return false;
51007     }
51008   }
51009 
51010   // Any legal type not explicitly accounted for above is desirable.
51011   return true;
51012 }
51013 
51014 SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc& dl,
51015                                                   SDValue Value, SDValue Addr,
51016                                                   SelectionDAG &DAG) const {
51017   const Module *M = DAG.getMachineFunction().getMMI().getModule();
51018   Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
51019   if (IsCFProtectionSupported) {
51020     // If control-flow branch protection is enabled, we need to add a
51021     // notrack prefix to the indirect branch. To do that we create an
51022     // NT_BRIND SDNode; upon ISEL, the pattern will convert it to a jmp
51023     // with the NoTrack prefix.
51024     return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, Value, Addr);
51025   }
51026 
51027   return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, DAG);
51028 }
51029 
51030 bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
51031   EVT VT = Op.getValueType();
51032   bool Is8BitMulByConstant = VT == MVT::i8 && Op.getOpcode() == ISD::MUL &&
51033                              isa<ConstantSDNode>(Op.getOperand(1));
51034 
51035   // i16 is legal, but undesirable since i16 instruction encodings are longer
51036   // and some i16 instructions are slow.
51037   // 8-bit multiply-by-constant can usually be expanded to something cheaper
51038   // using LEA and/or other ALU ops.
51039   if (VT != MVT::i16 && !Is8BitMulByConstant)
51040     return false;
51041 
51042   auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
51043     if (!Op.hasOneUse())
51044       return false;
51045     SDNode *User = *Op->use_begin();
51046     if (!ISD::isNormalStore(User))
51047       return false;
51048     auto *Ld = cast<LoadSDNode>(Load);
51049     auto *St = cast<StoreSDNode>(User);
51050     return Ld->getBasePtr() == St->getBasePtr();
51051   };
51052 
51053   auto IsFoldableAtomicRMW = [](SDValue Load, SDValue Op) {
51054     if (!Load.hasOneUse() || Load.getOpcode() != ISD::ATOMIC_LOAD)
51055       return false;
51056     if (!Op.hasOneUse())
51057       return false;
51058     SDNode *User = *Op->use_begin();
51059     if (User->getOpcode() != ISD::ATOMIC_STORE)
51060       return false;
51061     auto *Ld = cast<AtomicSDNode>(Load);
51062     auto *St = cast<AtomicSDNode>(User);
51063     return Ld->getBasePtr() == St->getBasePtr();
51064   };
51065 
51066   bool Commute = false;
51067   switch (Op.getOpcode()) {
51068   default: return false;
51069   case ISD::SIGN_EXTEND:
51070   case ISD::ZERO_EXTEND:
51071   case ISD::ANY_EXTEND:
51072     break;
51073   case ISD::SHL:
51074   case ISD::SRA:
51075   case ISD::SRL: {
51076     SDValue N0 = Op.getOperand(0);
51077     // Look out for (store (shl (load), x)).
51078     if (MayFoldLoad(N0) && IsFoldableRMW(N0, Op))
51079       return false;
51080     break;
51081   }
51082   case ISD::ADD:
51083   case ISD::MUL:
51084   case ISD::AND:
51085   case ISD::OR:
51086   case ISD::XOR:
51087     Commute = true;
51088     LLVM_FALLTHROUGH;
51089   case ISD::SUB: {
51090     SDValue N0 = Op.getOperand(0);
51091     SDValue N1 = Op.getOperand(1);
51092     // Avoid disabling potential load folding opportunities.
51093     if (MayFoldLoad(N1) &&
51094         (!Commute || !isa<ConstantSDNode>(N0) ||
51095          (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
51096       return false;
51097     if (MayFoldLoad(N0) &&
51098         ((Commute && !isa<ConstantSDNode>(N1)) ||
51099          (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
51100       return false;
51101     if (IsFoldableAtomicRMW(N0, Op) ||
51102         (Commute && IsFoldableAtomicRMW(N1, Op)))
51103       return false;
51104   }
51105   }
51106 
51107   PVT = MVT::i32;
51108   return true;
51109 }
51110 
51111 //===----------------------------------------------------------------------===//
51112 //                           X86 Inline Assembly Support
51113 //===----------------------------------------------------------------------===//
51114 
51115 // Helper to match a string separated by whitespace.
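// For example (illustrative):
//   matchAsm("bswap $0",  {"bswap", "$0"}) -> true
//   matchAsm("bswapl $0", {"bswap", "$0"}) -> false ("bswap" only matches a prefix)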
51116 static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
51117   S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
51118 
51119   for (StringRef Piece : Pieces) {
51120     if (!S.startswith(Piece)) // Check if the piece matches.
51121       return false;
51122 
51123     S = S.substr(Piece.size());
51124     StringRef::size_type Pos = S.find_first_not_of(" \t");
51125     if (Pos == 0) // We matched a prefix.
51126       return false;
51127 
51128     S = S.substr(Pos);
51129   }
51130 
51131   return S.empty();
51132 }
51133 
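// Returns true if the (sorted) clobber list consists only of flag registers,
// e.g. (illustrative) {"~{cc}", "~{dirflag}", "~{flags}", "~{fpsr}"}.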
51134 static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
51135 
51136   if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
51137     if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
51138         std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
51139         std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
51140 
51141       if (AsmPieces.size() == 3)
51142         return true;
51143       else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
51144         return true;
51145     }
51146   }
51147   return false;
51148 }
51149 
51150 bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
51151   InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand());
51152 
51153   const std::string &AsmStr = IA->getAsmString();
51154 
51155   IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
51156   if (!Ty || Ty->getBitWidth() % 16 != 0)
51157     return false;
51158 
51159   // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
51160   SmallVector<StringRef, 4> AsmPieces;
51161   SplitString(AsmStr, AsmPieces, ";\n");
51162 
51163   switch (AsmPieces.size()) {
51164   default: return false;
51165   case 1:
51166     // FIXME: this should verify that we are targeting a 486 or better.  If not,
51167     // we will turn this bswap into something that will be lowered to logical
51168     // ops instead of emitting the bswap asm.  For now, we don't support 486 or
51169     // lower so don't worry about this.
51170     // bswap $0
51171     if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
51172         matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
51173         matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
51174         matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
51175         matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
51176         matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
51177       // No need to check constraints, nothing other than the equivalent of
51178       // "=r,0" would be valid here.
51179       return IntrinsicLowering::LowerToByteSwap(CI);
51180     }
51181 
51182     // rorw $$8, ${0:w}  -->  llvm.bswap.i16
51183     if (CI->getType()->isIntegerTy(16) &&
51184         IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
51185         (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
51186          matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
51187       AsmPieces.clear();
51188       StringRef ConstraintsStr = IA->getConstraintString();
51189       SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
51190       array_pod_sort(AsmPieces.begin(), AsmPieces.end());
51191       if (clobbersFlagRegisters(AsmPieces))
51192         return IntrinsicLowering::LowerToByteSwap(CI);
51193     }
51194     break;
51195   case 3:
51196     if (CI->getType()->isIntegerTy(32) &&
51197         IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
51198         matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
51199         matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
51200         matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
51201       AsmPieces.clear();
51202       StringRef ConstraintsStr = IA->getConstraintString();
51203       SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
51204       array_pod_sort(AsmPieces.begin(), AsmPieces.end());
51205       if (clobbersFlagRegisters(AsmPieces))
51206         return IntrinsicLowering::LowerToByteSwap(CI);
51207     }
51208 
51209     if (CI->getType()->isIntegerTy(64)) {
51210       InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
51211       if (Constraints.size() >= 2 &&
51212           Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
51213           Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
51214         // bswap %eax / bswap %edx / xchgl %eax, %edx  -> llvm.bswap.i64
51215         if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
51216             matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
51217             matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
51218           return IntrinsicLowering::LowerToByteSwap(CI);
51219       }
51220     }
51221     break;
51222   }
51223   return false;
51224 }
51225 
51226 static X86::CondCode parseConstraintCode(llvm::StringRef Constraint) {
51227   X86::CondCode Cond = StringSwitch<X86::CondCode>(Constraint)
51228                            .Case("{@cca}", X86::COND_A)
51229                            .Case("{@ccae}", X86::COND_AE)
51230                            .Case("{@ccb}", X86::COND_B)
51231                            .Case("{@ccbe}", X86::COND_BE)
51232                            .Case("{@ccc}", X86::COND_B)
51233                            .Case("{@cce}", X86::COND_E)
51234                            .Case("{@ccz}", X86::COND_E)
51235                            .Case("{@ccg}", X86::COND_G)
51236                            .Case("{@ccge}", X86::COND_GE)
51237                            .Case("{@ccl}", X86::COND_L)
51238                            .Case("{@ccle}", X86::COND_LE)
51239                            .Case("{@ccna}", X86::COND_BE)
51240                            .Case("{@ccnae}", X86::COND_B)
51241                            .Case("{@ccnb}", X86::COND_AE)
51242                            .Case("{@ccnbe}", X86::COND_A)
51243                            .Case("{@ccnc}", X86::COND_AE)
51244                            .Case("{@ccne}", X86::COND_NE)
51245                            .Case("{@ccnz}", X86::COND_NE)
51246                            .Case("{@ccng}", X86::COND_LE)
51247                            .Case("{@ccnge}", X86::COND_L)
51248                            .Case("{@ccnl}", X86::COND_GE)
51249                            .Case("{@ccnle}", X86::COND_G)
51250                            .Case("{@ccno}", X86::COND_NO)
51251                            .Case("{@ccnp}", X86::COND_NP)
51252                            .Case("{@ccns}", X86::COND_NS)
51253                            .Case("{@cco}", X86::COND_O)
51254                            .Case("{@ccp}", X86::COND_P)
51255                            .Case("{@ccs}", X86::COND_S)
51256                            .Default(X86::COND_INVALID);
51257   return Cond;
51258 }
51259 
51260 /// Given a constraint letter, return the type of constraint for this target.
51261 X86TargetLowering::ConstraintType
51262 X86TargetLowering::getConstraintType(StringRef Constraint) const {
51263   if (Constraint.size() == 1) {
51264     switch (Constraint[0]) {
51265     case 'R':
51266     case 'q':
51267     case 'Q':
51268     case 'f':
51269     case 't':
51270     case 'u':
51271     case 'y':
51272     case 'x':
51273     case 'v':
51274     case 'l':
51275     case 'k': // AVX512 masking registers.
51276       return C_RegisterClass;
51277     case 'a':
51278     case 'b':
51279     case 'c':
51280     case 'd':
51281     case 'S':
51282     case 'D':
51283     case 'A':
51284       return C_Register;
51285     case 'I':
51286     case 'J':
51287     case 'K':
51288     case 'N':
51289     case 'G':
51290     case 'L':
51291     case 'M':
51292       return C_Immediate;
51293     case 'C':
51294     case 'e':
51295     case 'Z':
51296       return C_Other;
51297     default:
51298       break;
51299     }
51300   }
51301   else if (Constraint.size() == 2) {
51302     switch (Constraint[0]) {
51303     default:
51304       break;
51305     case 'Y':
51306       switch (Constraint[1]) {
51307       default:
51308         break;
51309       case 'z':
51310         return C_Register;
51311       case 'i':
51312       case 'm':
51313       case 'k':
51314       case 't':
51315       case '2':
51316         return C_RegisterClass;
51317       }
51318     }
51319   } else if (parseConstraintCode(Constraint) != X86::COND_INVALID)
51320     return C_Other;
51321   return TargetLowering::getConstraintType(Constraint);
51322 }
51323 
51324 /// Examine constraint type and operand type and determine a weight value.
51325 /// This object must already have been set up with the operand type
51326 /// and the current alternative constraint selected.
51327 TargetLowering::ConstraintWeight
51328   X86TargetLowering::getSingleConstraintMatchWeight(
51329     AsmOperandInfo &info, const char *constraint) const {
51330   ConstraintWeight weight = CW_Invalid;
51331   Value *CallOperandVal = info.CallOperandVal;
51332   // If we don't have a value, we can't do a match,
51333   // but allow it at the lowest weight.
51334   if (!CallOperandVal)
51335     return CW_Default;
51336   Type *type = CallOperandVal->getType();
51337   // Look at the constraint type.
51338   switch (*constraint) {
51339   default:
51340     weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
51341     LLVM_FALLTHROUGH;
51342   case 'R':
51343   case 'q':
51344   case 'Q':
51345   case 'a':
51346   case 'b':
51347   case 'c':
51348   case 'd':
51349   case 'S':
51350   case 'D':
51351   case 'A':
51352     if (CallOperandVal->getType()->isIntegerTy())
51353       weight = CW_SpecificReg;
51354     break;
51355   case 'f':
51356   case 't':
51357   case 'u':
51358     if (type->isFloatingPointTy())
51359       weight = CW_SpecificReg;
51360     break;
51361   case 'y':
51362     if (type->isX86_MMXTy() && Subtarget.hasMMX())
51363       weight = CW_SpecificReg;
51364     break;
51365   case 'Y':
51366     if (StringRef(constraint).size() != 2)
51367       break;
51368     switch (constraint[1]) {
51369       default:
51370         return CW_Invalid;
51371       // XMM0
51372       case 'z':
51373         if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
51374             ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()) ||
51375             ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()))
51376           return CW_SpecificReg;
51377         return CW_Invalid;
51378       // Conditional OpMask regs (AVX512)
51379       case 'k':
51380         if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
51381           return CW_Register;
51382         return CW_Invalid;
51383       // Any MMX reg
51384       case 'm':
51385         if (type->isX86_MMXTy() && Subtarget.hasMMX())
51386           return weight;
51387         return CW_Invalid;
51388       // Any SSE reg when ISA >= SSE2, same as 'x'
51389       case 'i':
51390       case 't':
51391       case '2':
51392         if (!Subtarget.hasSSE2())
51393           return CW_Invalid;
51394         break;
51395     }
51396     break;
51397   case 'v':
51398     if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
51399       weight = CW_Register;
51400     LLVM_FALLTHROUGH;
51401   case 'x':
51402     if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
51403         ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
51404       weight = CW_Register;
51405     break;
51406   case 'k':
51407     // Enable conditional vector operations using %k<#> registers.
51408     if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
51409       weight = CW_Register;
51410     break;
51411   case 'I':
51412     if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
51413       if (C->getZExtValue() <= 31)
51414         weight = CW_Constant;
51415     }
51416     break;
51417   case 'J':
51418     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
51419       if (C->getZExtValue() <= 63)
51420         weight = CW_Constant;
51421     }
51422     break;
51423   case 'K':
51424     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
51425       if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
51426         weight = CW_Constant;
51427     }
51428     break;
51429   case 'L':
51430     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
51431       if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
51432         weight = CW_Constant;
51433     }
51434     break;
51435   case 'M':
51436     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
51437       if (C->getZExtValue() <= 3)
51438         weight = CW_Constant;
51439     }
51440     break;
51441   case 'N':
51442     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
51443       if (C->getZExtValue() <= 0xff)
51444         weight = CW_Constant;
51445     }
51446     break;
51447   case 'G':
51448   case 'C':
51449     if (isa<ConstantFP>(CallOperandVal)) {
51450       weight = CW_Constant;
51451     }
51452     break;
51453   case 'e':
51454     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
51455       if ((C->getSExtValue() >= -0x80000000LL) &&
51456           (C->getSExtValue() <= 0x7fffffffLL))
51457         weight = CW_Constant;
51458     }
51459     break;
51460   case 'Z':
51461     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
51462       if (C->getZExtValue() <= 0xffffffff)
51463         weight = CW_Constant;
51464     }
51465     break;
51466   }
51467   return weight;
51468 }
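// Illustrative example for the weights above: a shift count written as
//   asm("shll %1, %0" : "+r"(x) : "I"(3));
// hits the 'I' case (constant <= 31) and reports CW_Constant, so the
// immediate alternative is preferred when several constraints could match.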
51469 
51470 /// Try to replace an X constraint, which matches anything, with another that
51471 /// has more specific requirements based on the type of the corresponding
51472 /// operand.
51473 const char *X86TargetLowering::
51474 LowerXConstraint(EVT ConstraintVT) const {
51475   // FP X constraints get lowered to SSE1/2 registers if available, otherwise
51476   // 'f' like normal targets.
51477   if (ConstraintVT.isFloatingPoint()) {
51478     if (Subtarget.hasSSE1())
51479       return "x";
51480   }
51481 
51482   return TargetLowering::LowerXConstraint(ConstraintVT);
51483 }
51484 
51485 // Lower @cc targets via setcc.
51486 SDValue X86TargetLowering::LowerAsmOutputForConstraint(
51487     SDValue &Chain, SDValue &Flag, const SDLoc &DL,
51488     const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
51489   X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
51490   if (Cond == X86::COND_INVALID)
51491     return SDValue();
51492   // Check that return type is valid.
51493   if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
51494       OpInfo.ConstraintVT.getSizeInBits() < 8)
51495     report_fatal_error("Flag output operand is of invalid type");
51496 
51497   // Get EFLAGS register. Only update chain when copyfrom is glued.
51498   if (Flag.getNode()) {
51499     Flag = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, Flag);
51500     Chain = Flag.getValue(1);
51501   } else
51502     Flag = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32);
51503   // Extract CC code.
51504   SDValue CC = getSETCC(Cond, Flag, DL, DAG);
51505   // Zero-extend the flag result to the constraint's type.
51506   SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
51507 
51508   return Result;
51509 }
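// Summary of the lowering above: e.g. an "=@ccs" output of i8 type becomes a
// CopyFromReg of EFLAGS, an X86 SETCC selecting COND_S (via getSETCC), and a
// ZERO_EXTEND to the i8 constraint type.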
51510 
51511 /// Lower the specified operand into the Ops vector.
51512 /// If it is invalid, don't add anything to Ops.
51513 void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
51514                                                      std::string &Constraint,
51515                                                      std::vector<SDValue>&Ops,
51516                                                      SelectionDAG &DAG) const {
51517   SDValue Result;
51518 
51519   // Only support length 1 constraints for now.
51520   if (Constraint.length() > 1) return;
51521 
51522   char ConstraintLetter = Constraint[0];
51523   switch (ConstraintLetter) {
51524   default: break;
51525   case 'I':
51526     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
51527       if (C->getZExtValue() <= 31) {
51528         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
51529                                        Op.getValueType());
51530         break;
51531       }
51532     }
51533     return;
51534   case 'J':
51535     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
51536       if (C->getZExtValue() <= 63) {
51537         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
51538                                        Op.getValueType());
51539         break;
51540       }
51541     }
51542     return;
51543   case 'K':
51544     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
51545       if (isInt<8>(C->getSExtValue())) {
51546         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
51547                                        Op.getValueType());
51548         break;
51549       }
51550     }
51551     return;
51552   case 'L':
51553     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
51554       if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
51555           (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
51556         Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
51557                                        Op.getValueType());
51558         break;
51559       }
51560     }
51561     return;
51562   case 'M':
51563     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
51564       if (C->getZExtValue() <= 3) {
51565         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
51566                                        Op.getValueType());
51567         break;
51568       }
51569     }
51570     return;
51571   case 'N':
51572     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
51573       if (C->getZExtValue() <= 255) {
51574         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
51575                                        Op.getValueType());
51576         break;
51577       }
51578     }
51579     return;
51580   case 'O':
51581     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
51582       if (C->getZExtValue() <= 127) {
51583         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
51584                                        Op.getValueType());
51585         break;
51586       }
51587     }
51588     return;
51589   case 'e': {
51590     // 32-bit signed value
51591     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
51592       if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
51593                                            C->getSExtValue())) {
51594         // Widen to 64 bits here to get it sign extended.
51595         Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
51596         break;
51597       }
51598     // FIXME gcc accepts some relocatable values here too, but only in certain
51599     // memory models; it's complicated.
51600     }
51601     return;
51602   }
51603   case 'Z': {
51604     // 32-bit unsigned value
51605     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
51606       if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
51607                                            C->getZExtValue())) {
51608         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
51609                                        Op.getValueType());
51610         break;
51611       }
51612     }
51613     // FIXME gcc accepts some relocatable values here too, but only in certain
51614     // memory models; it's complicated.
51615     return;
51616   }
51617   case 'i': {
51618     // Literal immediates are always ok.
51619     if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
51620       bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;
51621       BooleanContent BCont = getBooleanContents(MVT::i64);
51622       ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)
51623                                     : ISD::SIGN_EXTEND;
51624       int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()
51625                                                   : CST->getSExtValue();
51626       Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64);
51627       break;
51628     }
51629 
51630     // In any sort of PIC mode addresses need to be computed at runtime by
51631     // adding in a register or some sort of table lookup.  These can't
51632     // be used as immediates.
51633     if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC())
51634       return;
51635 
51636     // If we are in non-pic codegen mode, we allow the address of a global (with
51637     // an optional displacement) to be used with 'i'.
51638     if (auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
51639       // If we require an extra load to get this address, as in PIC mode, we
51640       // can't accept it.
51641       if (isGlobalStubReference(
51642               Subtarget.classifyGlobalReference(GA->getGlobal())))
51643         return;
51644     break;
51645   }
51646   }
51647 
51648   if (Result.getNode()) {
51649     Ops.push_back(Result);
51650     return;
51651   }
51652   return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
51653 }
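// Illustrative example: the classic port-I/O idiom
//   asm volatile("outb %0, %1" : : "a"(val), "N"(0x64));
// reaches the 'N' case above; 0x64 <= 255, so the operand is emitted as a
// target constant instead of being forced into a register (names and port
// number are only for illustration).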
51654 
51655 /// Check if \p RC is a general purpose register class.
51656 /// I.e., GR* or one of their variant.
51657 static bool isGRClass(const TargetRegisterClass &RC) {
51658   return RC.hasSuperClassEq(&X86::GR8RegClass) ||
51659          RC.hasSuperClassEq(&X86::GR16RegClass) ||
51660          RC.hasSuperClassEq(&X86::GR32RegClass) ||
51661          RC.hasSuperClassEq(&X86::GR64RegClass) ||
51662          RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
51663 }
51664 
51665 /// Check if \p RC is a vector register class.
51666 /// I.e., FR* / VR* or one of their variant.
51667 static bool isFRClass(const TargetRegisterClass &RC) {
51668   return RC.hasSuperClassEq(&X86::FR32XRegClass) ||
51669          RC.hasSuperClassEq(&X86::FR64XRegClass) ||
51670          RC.hasSuperClassEq(&X86::VR128XRegClass) ||
51671          RC.hasSuperClassEq(&X86::VR256XRegClass) ||
51672          RC.hasSuperClassEq(&X86::VR512RegClass);
51673 }
51674 
51675 /// Check if \p RC is a mask register class.
51676 /// I.e., VK* or one of their variant.
51677 static bool isVKClass(const TargetRegisterClass &RC) {
51678   return RC.hasSuperClassEq(&X86::VK1RegClass) ||
51679          RC.hasSuperClassEq(&X86::VK2RegClass) ||
51680          RC.hasSuperClassEq(&X86::VK4RegClass) ||
51681          RC.hasSuperClassEq(&X86::VK8RegClass) ||
51682          RC.hasSuperClassEq(&X86::VK16RegClass) ||
51683          RC.hasSuperClassEq(&X86::VK32RegClass) ||
51684          RC.hasSuperClassEq(&X86::VK64RegClass);
51685 }
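// The three predicates above (isGRClass/isFRClass/isVKClass) are used by
// getRegForInlineAsmConstraint() below to recognize which family a matched
// register belongs to so it can be remapped to a class of the requested size.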
51686 
51687 std::pair<unsigned, const TargetRegisterClass *>
51688 X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
51689                                                 StringRef Constraint,
51690                                                 MVT VT) const {
51691   // First, see if this is a constraint that directly corresponds to an LLVM
51692   // register class.
51693   if (Constraint.size() == 1) {
51694     // GCC Constraint Letters
51695     switch (Constraint[0]) {
51696     default: break;
51697     // 'A' means [ER]AX + [ER]DX.
51698     case 'A':
51699       if (Subtarget.is64Bit())
51700         return std::make_pair(X86::RAX, &X86::GR64_ADRegClass);
51701       assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
51702              "Expecting 64, 32 or 16 bit subtarget");
51703       return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
51704 
51705       // TODO: Slight differences here in allocation order and leaving
51706       // RIP in the class. Do they matter any more here than they do
51707       // in the normal allocation?
51708     case 'k':
51709       if (Subtarget.hasAVX512()) {
51710         if (VT == MVT::i1)
51711           return std::make_pair(0U, &X86::VK1RegClass);
51712         if (VT == MVT::i8)
51713           return std::make_pair(0U, &X86::VK8RegClass);
51714         if (VT == MVT::i16)
51715           return std::make_pair(0U, &X86::VK16RegClass);
51716       }
51717       if (Subtarget.hasBWI()) {
51718         if (VT == MVT::i32)
51719           return std::make_pair(0U, &X86::VK32RegClass);
51720         if (VT == MVT::i64)
51721           return std::make_pair(0U, &X86::VK64RegClass);
51722       }
51723       break;
51724     case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
51725       if (Subtarget.is64Bit()) {
51726         if (VT == MVT::i8 || VT == MVT::i1)
51727           return std::make_pair(0U, &X86::GR8RegClass);
51728         if (VT == MVT::i16)
51729           return std::make_pair(0U, &X86::GR16RegClass);
51730         if (VT == MVT::i32 || VT == MVT::f32)
51731           return std::make_pair(0U, &X86::GR32RegClass);
51732         if (VT != MVT::f80)
51733           return std::make_pair(0U, &X86::GR64RegClass);
51734         break;
51735       }
51736       LLVM_FALLTHROUGH;
51737       // 32-bit fallthrough
51738     case 'Q':   // Q_REGS
51739       if (VT == MVT::i8 || VT == MVT::i1)
51740         return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
51741       if (VT == MVT::i16)
51742         return std::make_pair(0U, &X86::GR16_ABCDRegClass);
51743       if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
51744         return std::make_pair(0U, &X86::GR32_ABCDRegClass);
51745       if (VT != MVT::f80)
51746         return std::make_pair(0U, &X86::GR64_ABCDRegClass);
51747       break;
51748     case 'r':   // GENERAL_REGS
51749     case 'l':   // INDEX_REGS
51750       if (VT == MVT::i8 || VT == MVT::i1)
51751         return std::make_pair(0U, &X86::GR8RegClass);
51752       if (VT == MVT::i16)
51753         return std::make_pair(0U, &X86::GR16RegClass);
51754       if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
51755         return std::make_pair(0U, &X86::GR32RegClass);
51756       if (VT != MVT::f80)
51757         return std::make_pair(0U, &X86::GR64RegClass);
51758       break;
51759     case 'R':   // LEGACY_REGS
51760       if (VT == MVT::i8 || VT == MVT::i1)
51761         return std::make_pair(0U, &X86::GR8_NOREXRegClass);
51762       if (VT == MVT::i16)
51763         return std::make_pair(0U, &X86::GR16_NOREXRegClass);
51764       if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
51765         return std::make_pair(0U, &X86::GR32_NOREXRegClass);
51766       if (VT != MVT::f80)
51767         return std::make_pair(0U, &X86::GR64_NOREXRegClass);
51768       break;
51769     case 'f':  // FP Stack registers.
51770       // If SSE is enabled for this VT, use f80 to ensure the isel moves the
51771       // value to the correct fpstack register class.
51772       if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
51773         return std::make_pair(0U, &X86::RFP32RegClass);
51774       if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
51775         return std::make_pair(0U, &X86::RFP64RegClass);
51776       if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80)
51777         return std::make_pair(0U, &X86::RFP80RegClass);
51778       break;
51779     case 'y':   // MMX_REGS if MMX allowed.
51780       if (!Subtarget.hasMMX()) break;
51781       return std::make_pair(0U, &X86::VR64RegClass);
51782     case 'v':
51783     case 'x':   // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
51784       if (!Subtarget.hasSSE1()) break;
51785       bool VConstraint = (Constraint[0] == 'v');
51786 
51787       switch (VT.SimpleTy) {
51788       default: break;
51789       // Scalar SSE types.
51790       case MVT::f32:
51791       case MVT::i32:
51792         if (VConstraint && Subtarget.hasVLX())
51793           return std::make_pair(0U, &X86::FR32XRegClass);
51794         return std::make_pair(0U, &X86::FR32RegClass);
51795       case MVT::f64:
51796       case MVT::i64:
51797         if (VConstraint && Subtarget.hasVLX())
51798           return std::make_pair(0U, &X86::FR64XRegClass);
51799         return std::make_pair(0U, &X86::FR64RegClass);
51800       case MVT::i128:
51801         if (Subtarget.is64Bit()) {
51802           if (VConstraint && Subtarget.hasVLX())
51803             return std::make_pair(0U, &X86::VR128XRegClass);
51804           return std::make_pair(0U, &X86::VR128RegClass);
51805         }
51806         break;
51807       // Vector types and fp128.
51808       case MVT::f128:
51809       case MVT::v16i8:
51810       case MVT::v8i16:
51811       case MVT::v4i32:
51812       case MVT::v2i64:
51813       case MVT::v4f32:
51814       case MVT::v2f64:
51815         if (VConstraint && Subtarget.hasVLX())
51816           return std::make_pair(0U, &X86::VR128XRegClass);
51817         return std::make_pair(0U, &X86::VR128RegClass);
51818       // AVX types.
51819       case MVT::v32i8:
51820       case MVT::v16i16:
51821       case MVT::v8i32:
51822       case MVT::v4i64:
51823       case MVT::v8f32:
51824       case MVT::v4f64:
51825         if (VConstraint && Subtarget.hasVLX())
51826           return std::make_pair(0U, &X86::VR256XRegClass);
51827         if (Subtarget.hasAVX())
51828           return std::make_pair(0U, &X86::VR256RegClass);
51829         break;
51830       case MVT::v64i8:
51831       case MVT::v32i16:
51832       case MVT::v8f64:
51833       case MVT::v16f32:
51834       case MVT::v16i32:
51835       case MVT::v8i64:
51836         if (!Subtarget.hasAVX512()) break;
51837         if (VConstraint)
51838           return std::make_pair(0U, &X86::VR512RegClass);
51839         return std::make_pair(0U, &X86::VR512_0_15RegClass);
51840       }
51841       break;
51842     }
51843   } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
51844     switch (Constraint[1]) {
51845     default:
51846       break;
51847     case 'i':
51848     case 't':
51849     case '2':
51850       return getRegForInlineAsmConstraint(TRI, "x", VT);
51851     case 'm':
51852       if (!Subtarget.hasMMX()) break;
51853       return std::make_pair(0U, &X86::VR64RegClass);
51854     case 'z':
51855       if (!Subtarget.hasSSE1()) break;
51856       switch (VT.SimpleTy) {
51857       default: break;
51858       // Scalar SSE types.
51859       case MVT::f32:
51860       case MVT::i32:
51861         return std::make_pair(X86::XMM0, &X86::FR32RegClass);
51862       case MVT::f64:
51863       case MVT::i64:
51864         return std::make_pair(X86::XMM0, &X86::FR64RegClass);
51865       case MVT::f128:
51866       case MVT::v16i8:
51867       case MVT::v8i16:
51868       case MVT::v4i32:
51869       case MVT::v2i64:
51870       case MVT::v4f32:
51871       case MVT::v2f64:
51872         return std::make_pair(X86::XMM0, &X86::VR128RegClass);
51873       // AVX types.
51874       case MVT::v32i8:
51875       case MVT::v16i16:
51876       case MVT::v8i32:
51877       case MVT::v4i64:
51878       case MVT::v8f32:
51879       case MVT::v4f64:
51880         if (Subtarget.hasAVX())
51881           return std::make_pair(X86::YMM0, &X86::VR256RegClass);
51882         break;
51883       case MVT::v64i8:
51884       case MVT::v32i16:
51885       case MVT::v8f64:
51886       case MVT::v16f32:
51887       case MVT::v16i32:
51888       case MVT::v8i64:
51889         if (Subtarget.hasAVX512())
51890           return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
51891         break;
51892       }
51893       break;
51894     case 'k':
51895       // This register class doesn't allocate k0 for masked vector operations.
51896       if (Subtarget.hasAVX512()) {
51897         if (VT == MVT::i1)
51898           return std::make_pair(0U, &X86::VK1WMRegClass);
51899         if (VT == MVT::i8)
51900           return std::make_pair(0U, &X86::VK8WMRegClass);
51901         if (VT == MVT::i16)
51902           return std::make_pair(0U, &X86::VK16WMRegClass);
51903       }
51904       if (Subtarget.hasBWI()) {
51905         if (VT == MVT::i32)
51906           return std::make_pair(0U, &X86::VK32WMRegClass);
51907         if (VT == MVT::i64)
51908           return std::make_pair(0U, &X86::VK64WMRegClass);
51909       }
51910       break;
51911     }
51912   }
51913 
51914   if (parseConstraintCode(Constraint) != X86::COND_INVALID)
51915     return std::make_pair(0U, &X86::GR32RegClass);
51916 
51917   // Use the default implementation in TargetLowering to convert the register
51918   // constraint into a member of a register class.
51919   std::pair<Register, const TargetRegisterClass*> Res;
51920   Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
51921 
51922   // Not found as a standard register?
51923   if (!Res.second) {
51924     // Only match x87 registers if the VT is one SelectionDAGBuilder can convert
51925     // to/from f80.
51926     if (VT == MVT::Other || VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80) {
51927       // Map st(0) .. st(7) to FP0 .. FP7.
51928       if (Constraint.size() == 7 && Constraint[0] == '{' &&
51929           tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' &&
51930           Constraint[3] == '(' &&
51931           (Constraint[4] >= '0' && Constraint[4] <= '7') &&
51932           Constraint[5] == ')' && Constraint[6] == '}') {
51933         // st(7) is not allocatable and thus not a member of RFP80. Return
51934         // singleton class in cases where we have a reference to it.
51935         if (Constraint[4] == '7')
51936           return std::make_pair(X86::FP7, &X86::RFP80_7RegClass);
51937         return std::make_pair(X86::FP0 + Constraint[4] - '0',
51938                               &X86::RFP80RegClass);
51939       }
51940 
51941       // GCC allows "st(0)" to be called just plain "st".
51942       if (StringRef("{st}").equals_lower(Constraint))
51943         return std::make_pair(X86::FP0, &X86::RFP80RegClass);
51944     }
51945 
51946     // flags -> EFLAGS
51947     if (StringRef("{flags}").equals_lower(Constraint))
51948       return std::make_pair(X86::EFLAGS, &X86::CCRRegClass);
51949 
51950     // dirflag -> DF
51951     // Only allow for clobber.
51952     if (StringRef("{dirflag}").equals_lower(Constraint) && VT == MVT::Other)
51953       return std::make_pair(X86::DF, &X86::DFCCRRegClass);
51954 
51955     // fpsr -> FPSW
51956     if (StringRef("{fpsr}").equals_lower(Constraint))
51957       return std::make_pair(X86::FPSW, &X86::FPCCRRegClass);
51958 
51959     return Res;
51960   }
51961 
51962   // Make sure it isn't a register that requires 64-bit mode.
51963   if (!Subtarget.is64Bit() &&
51964       (isFRClass(*Res.second) || isGRClass(*Res.second)) &&
51965       TRI->getEncodingValue(Res.first) >= 8) {
51966     // Register requires REX prefix, but we're in 32-bit mode.
51967     return std::make_pair(0, nullptr);
51968   }
51969 
51970   // Make sure it isn't a register that requires AVX512.
51971   if (!Subtarget.hasAVX512() && isFRClass(*Res.second) &&
51972       TRI->getEncodingValue(Res.first) & 0x10) {
51973     // Register requires EVEX prefix.
51974     return std::make_pair(0, nullptr);
51975   }
51976 
51977   // Otherwise, check to see if this is a register class of the wrong value
51978   // type.  For example, we want to map "{ax},i32" -> {eax}, we don't want it to
51979   // turn into {ax},{dx}.
51980   // MVT::Other is used to specify clobber names.
51981   if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
51982     return Res;   // Correct type already, nothing to do.
51983 
51984   // Get a matching integer of the correct size. i.e. "ax" with MVT::32 should
51985   // return "eax". This should even work for things like getting 64bit integer
51986   // registers when given an f64 type.
51987   const TargetRegisterClass *Class = Res.second;
51988   // The generic code will match the first register class that contains the
51989   // given register. Thus, based on the ordering of the tablegened file,
51990   // the "plain" GR classes might not come first.
51991   // Therefore, use a helper method.
51992   if (isGRClass(*Class)) {
51993     unsigned Size = VT.getSizeInBits();
51994     if (Size == 1) Size = 8;
51995     Register DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
51996     if (DestReg > 0) {
51997       bool is64Bit = Subtarget.is64Bit();
51998       const TargetRegisterClass *RC =
51999           Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
52000         : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
52001         : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
52002         : Size == 64 ? (is64Bit ? &X86::GR64RegClass : nullptr)
52003         : nullptr;
52004       if (Size == 64 && !is64Bit) {
52005         // Model GCC's behavior here and select a fixed pair of 32-bit
52006         // registers.
52007         switch (DestReg) {
52008         case X86::RAX:
52009           return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
52010         case X86::RDX:
52011           return std::make_pair(X86::EDX, &X86::GR32_DCRegClass);
52012         case X86::RCX:
52013           return std::make_pair(X86::ECX, &X86::GR32_CBRegClass);
52014         case X86::RBX:
52015           return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass);
52016         case X86::RSI:
52017           return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass);
52018         case X86::RDI:
52019           return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass);
52020         case X86::RBP:
52021           return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass);
52022         default:
52023           return std::make_pair(0, nullptr);
52024         }
52025       }
52026       if (RC && RC->contains(DestReg))
52027         return std::make_pair(DestReg, RC);
52028       return Res;
52029     }
52030     // No register found/type mismatch.
52031     return std::make_pair(0, nullptr);
52032   } else if (isFRClass(*Class)) {
52033     // Handle references to XMM physical registers that got mapped into the
52034     // wrong class.  This can happen with constraints like {xmm0} where the
52035     // target independent register mapper will just pick the first match it can
52036     // find, ignoring the required type.
52037 
52038     // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
52039     if (VT == MVT::f32 || VT == MVT::i32)
52040       Res.second = &X86::FR32XRegClass;
52041     else if (VT == MVT::f64 || VT == MVT::i64)
52042       Res.second = &X86::FR64XRegClass;
52043     else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT))
52044       Res.second = &X86::VR128XRegClass;
52045     else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT))
52046       Res.second = &X86::VR256XRegClass;
52047     else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
52048       Res.second = &X86::VR512RegClass;
52049     else {
52050       // Type mismatch and not a clobber: Return an error;
52051       Res.first = 0;
52052       Res.second = nullptr;
52053     }
52054   } else if (isVKClass(*Class)) {
52055     if (VT == MVT::i1)
52056       Res.second = &X86::VK1RegClass;
52057     else if (VT == MVT::i8)
52058       Res.second = &X86::VK8RegClass;
52059     else if (VT == MVT::i16)
52060       Res.second = &X86::VK16RegClass;
52061     else if (VT == MVT::i32)
52062       Res.second = &X86::VK32RegClass;
52063     else if (VT == MVT::i64)
52064       Res.second = &X86::VK64RegClass;
52065     else {
52066       // Type mismatch and not a clobber: Return an error;
52067       Res.first = 0;
52068       Res.second = nullptr;
52069     }
52070   }
52071 
52072   return Res;
52073 }
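// Illustrative example of the remapping above: an LLVM IR inline-asm operand
// constrained as "{ax}" with an i32 value first resolves to AX via the generic
// matcher; the isGRClass() path then rewrites it to EAX in GR32, i.e. the
// "{ax},i32 -> {eax}" case mentioned in the comment above.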
52074 
52075 InstructionCost X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
52076                                                         const AddrMode &AM,
52077                                                         Type *Ty,
52078                                                         unsigned AS) const {
52079   // Scaling factors are not free at all.
52080   // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
52081   // will take 2 allocations in the out of order engine instead of 1
52082   // for plain addressing mode, i.e. inst (reg1).
52083   // E.g.,
52084   // vaddps (%rsi,%rdx), %ymm0, %ymm1
52085   // Requires two allocations (one for the load, one for the computation)
52086   // whereas:
52087   // vaddps (%rsi), %ymm0, %ymm1
52088   // Requires just 1 allocation, i.e., freeing allocations for other operations
52089   // and having less micro operations to execute.
52090   //
52091   // For some X86 architectures, this is even worse because for instance for
52092   // stores, the complex addressing mode forces the instruction to use the
52093   // "load" ports instead of the dedicated "store" port.
52094   // E.g., on Haswell:
52095   // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
52096   // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
52097   if (isLegalAddressingMode(DL, AM, Ty, AS))
52098     // Scale represents reg2 * scale, thus account for 1
52099     // as soon as we use a second register.
52100     return AM.Scale != 0;
52101   return -1;
52102 }
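// In other words, for a legal addressing mode: "inst (reg1)" reports a scaling
// cost of 0, "inst (reg1, reg2, scale)" reports 1, and an illegal mode
// reports -1.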
52103 
52104 bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
52105   // Integer division on x86 is expensive. However, when aggressively optimizing
52106   // for code size, we prefer to use a div instruction, as it is usually smaller
52107   // than the alternative sequence.
52108   // The exception to this is vector division. Since x86 doesn't have vector
52109   // integer division, leaving the division as-is is a loss even in terms of
52110   // size, because it will have to be scalarized, while the alternative code
52111   // sequence can be performed in vector form.
52112   bool OptSize = Attr.hasFnAttribute(Attribute::MinSize);
52113   return OptSize && !VT.isVector();
52114 }
52115 
52116 void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
52117   if (!Subtarget.is64Bit())
52118     return;
52119 
52120   // Update IsSplitCSR in X86MachineFunctionInfo.
52121   X86MachineFunctionInfo *AFI =
52122       Entry->getParent()->getInfo<X86MachineFunctionInfo>();
52123   AFI->setIsSplitCSR(true);
52124 }
52125 
52126 void X86TargetLowering::insertCopiesSplitCSR(
52127     MachineBasicBlock *Entry,
52128     const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
52129   const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
52130   const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
52131   if (!IStart)
52132     return;
52133 
52134   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
52135   MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
52136   MachineBasicBlock::iterator MBBI = Entry->begin();
52137   for (const MCPhysReg *I = IStart; *I; ++I) {
52138     const TargetRegisterClass *RC = nullptr;
52139     if (X86::GR64RegClass.contains(*I))
52140       RC = &X86::GR64RegClass;
52141     else
52142       llvm_unreachable("Unexpected register class in CSRsViaCopy!");
52143 
52144     Register NewVR = MRI->createVirtualRegister(RC);
52145     // Create copy from CSR to a virtual register.
52146     // FIXME: this currently does not emit CFI pseudo-instructions, it works
52147     // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
52148     // nounwind. If we want to generalize this later, we may need to emit
52149     // CFI pseudo-instructions.
52150     assert(
52151         Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) &&
52152         "Function should be nounwind in insertCopiesSplitCSR!");
52153     Entry->addLiveIn(*I);
52154     BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
52155         .addReg(*I);
52156 
52157     // Insert the copy-back instructions right before the terminator.
52158     for (auto *Exit : Exits)
52159       BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
52160               TII->get(TargetOpcode::COPY), *I)
52161           .addReg(NewVR);
52162   }
52163 }
52164 
52165 bool X86TargetLowering::supportSwiftError() const {
52166   return Subtarget.is64Bit();
52167 }
52168 
52169 /// Returns true if stack probing through a function call is requested.
52170 bool X86TargetLowering::hasStackProbeSymbol(MachineFunction &MF) const {
52171   return !getStackProbeSymbolName(MF).empty();
52172 }
52173 
52174 /// Returns true if stack probing through inline assembly is requested.
52175 bool X86TargetLowering::hasInlineStackProbe(MachineFunction &MF) const {
52176 
52177   // No inline stack probe for Windows, they have their own mechanism.
52178   if (Subtarget.isOSWindows() ||
52179       MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
52180     return false;
52181 
52182   // If the function specifically requests inline stack probes, emit them.
52183   if (MF.getFunction().hasFnAttribute("probe-stack"))
52184     return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
52185            "inline-asm";
52186 
52187   return false;
52188 }
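// Illustrative IR: a function carrying
//   attributes #0 = { "probe-stack"="inline-asm" }
// receives inline stack probes here (outside of Windows), while the
// "no-stack-arg-probe" attribute suppresses them.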
52189 
52190 /// Returns the name of the symbol used to emit stack probes or the empty
52191 /// string if not applicable.
52192 StringRef
52193 X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {
52194   // Inline stack probes disable the stack probe call.
52195   if (hasInlineStackProbe(MF))
52196     return "";
52197 
52198   // If the function specifically requests stack probes, emit them.
52199   if (MF.getFunction().hasFnAttribute("probe-stack"))
52200     return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
52201 
52202   // Generally, if we aren't on Windows, the platform ABI does not include
52203   // support for stack probes, so don't emit them.
52204   if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO() ||
52205       MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
52206     return "";
52207 
52208   // We need a stack probe to conform to the Windows ABI. Choose the right
52209   // symbol.
52210   if (Subtarget.is64Bit())
52211     return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
52212   return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
52213 }
52214 
52215 unsigned
52216 X86TargetLowering::getStackProbeSize(MachineFunction &MF) const {
52217   // The default stack probe size is 4096 if the function has no stackprobesize
52218   // attribute.
52219   unsigned StackProbeSize = 4096;
52220   const Function &Fn = MF.getFunction();
52221   if (Fn.hasFnAttribute("stack-probe-size"))
52222     Fn.getFnAttribute("stack-probe-size")
52223         .getValueAsString()
52224         .getAsInteger(0, StackProbeSize);
52225   return StackProbeSize;
52226 }
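// For example, "stack-probe-size"="8192" on the function raises the probe
// interval to 8192 bytes; without the attribute the 4096-byte default above
// is returned.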
52227 
52228 Align X86TargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
52229   if (ML->isInnermost() &&
52230       ExperimentalPrefInnermostLoopAlignment.getNumOccurrences())
52231     return Align(1ULL << ExperimentalPrefInnermostLoopAlignment);
52232   return TargetLowering::getPrefLoopAlignment();
52233 }
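// For example, -x86-experimental-pref-innermost-loop-alignment=5 aligns
// innermost loop headers to 32 bytes (Align(1 << 5)); other loops fall back
// to TargetLowering::getPrefLoopAlignment().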
52234