1 //===-- X86ISelLowering.h - X86 DAG Lowering Interface ----------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines the interfaces that X86 uses to lower LLVM code into a
10 // selection DAG.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #ifndef LLVM_LIB_TARGET_X86_X86ISELLOWERING_H
15 #define LLVM_LIB_TARGET_X86_X86ISELLOWERING_H
16 
17 #include "llvm/CodeGen/MachineFunction.h"
18 #include "llvm/CodeGen/TargetLowering.h"
19 
20 namespace llvm {
21   class X86Subtarget;
22   class X86TargetMachine;
23 
24   namespace X86ISD {
25     // X86-specific DAG node opcodes, numbered after the target-independent ones.
26   enum NodeType : unsigned {
27     // Start the numbering where the builtin ops leave off.
28     FIRST_NUMBER = ISD::BUILTIN_OP_END,
29 
30     /// Bit scan forward.
31     BSF,
32     /// Bit scan reverse.
33     BSR,
34 
35     /// X86 funnel/double shift i16 instructions. These correspond to
36     /// X86::SHLDW and X86::SHRDW instructions which have different amt
37     /// modulo rules to generic funnel shifts.
38     /// NOTE: The operand order matches ISD::FSHL/FSHR not SHLD/SHRD.
39     FSHL,
40     FSHR,
41 
42     /// Bitwise logical AND of floating point values. This corresponds
43     /// to X86::ANDPS or X86::ANDPD.
44     FAND,
45 
46     /// Bitwise logical OR of floating point values. This corresponds
47     /// to X86::ORPS or X86::ORPD.
48     FOR,
49 
50     /// Bitwise logical XOR of floating point values. This corresponds
51     /// to X86::XORPS or X86::XORPD.
52     FXOR,
53 
54     /// Bitwise logical ANDNOT of floating point values. This
55     /// corresponds to X86::ANDNPS or X86::ANDNPD.
56     FANDN,
57 
58     /// These operations represent an abstract X86 call
59     /// instruction, which includes a bunch of information.  In particular the
60     /// operands of these nodes are:
61     ///
62     ///     #0 - The incoming token chain
63     ///     #1 - The callee
64     ///     #2 - The number of arg bytes the caller pushes on the stack.
65     ///     #3 - The number of arg bytes the callee pops off the stack.
66     ///     #4 - The value to pass in AL/AX/EAX (optional)
67     ///     #5 - The value to pass in DL/DX/EDX (optional)
68     ///
69     /// The result values of these nodes are:
70     ///
71     ///     #0 - The outgoing token chain
72     ///     #1 - The first register result value (optional)
73     ///     #2 - The second register result value (optional)
74     ///
75     CALL,
76 
77     /// Same as call except it adds the NoTrack prefix.
78     NT_CALL,
79 
80     // Pseudo for an ObjC call that gets emitted together with a special
81     // marker instruction.
82     CALL_RVMARKER,
83 
84     /// X86 compare and logical compare instructions.
85     CMP,
86     FCMP,
87     COMI,
88     UCOMI,
89 
90     /// X86 bit-test instructions.
91     BT,
92 
93     /// X86 SetCC. Operand 0 is condition code, and operand 1 is the EFLAGS
94     /// operand, usually produced by a CMP instruction.
95     SETCC,
96 
97     /// X86 Select
98     SELECTS,
99 
100     // Same as SETCC except it's materialized with a sbb and the value is all
101     // ones or all zeros.
102     SETCC_CARRY, // R = carry_bit ? ~0 : 0
103 
104     /// X86 FP SETCC, implemented with CMP{cc}SS/CMP{cc}SD.
105     /// Operands are two FP values to compare; result is a mask of
106     /// 0s or 1s.  Generally DTRT for C/C++ with NaNs.
107     FSETCC,
108 
109     /// X86 FP SETCC, similar to above, but with output as an i1 mask and
110     /// a version with SAE.
111     FSETCCM,
112     FSETCCM_SAE,
113 
114     /// X86 conditional moves. Operand 0 and operand 1 are the two values
115     /// to select from. Operand 2 is the condition code, and operand 3 is the
116     /// flag operand produced by a CMP or TEST instruction.
117     CMOV,
118 
119     /// X86 conditional branches. Operand 0 is the chain operand, operand 1
120     /// is the block to branch if condition is true, operand 2 is the
121     /// condition code, and operand 3 is the flag operand produced by a CMP
122     /// or TEST instruction.
123     BRCOND,
124 
125     /// BRIND node with NoTrack prefix. Operand 0 is the chain operand and
126     /// operand 1 is the target address.
127     NT_BRIND,
128 
129     /// Return with a flag operand. Operand 0 is the chain operand, operand
130     /// 1 is the number of bytes of stack to pop.
131     RET_FLAG,
132 
133     /// Return from interrupt. Operand 0 is the number of bytes to pop.
134     IRET,
135 
136     /// Repeat fill, corresponds to X86::REP_STOSx.
137     REP_STOS,
138 
139     /// Repeat move, corresponds to X86::REP_MOVSx.
140     REP_MOVS,
141 
142     /// On Darwin, this node represents the result of the popl
143     /// at function entry, used for PIC code.
144     GlobalBaseReg,
145 
146     /// A wrapper node for TargetConstantPool, TargetJumpTable,
147     /// TargetExternalSymbol, TargetGlobalAddress, TargetGlobalTLSAddress,
148     /// MCSymbol and TargetBlockAddress.
149     Wrapper,
150 
151     /// Special wrapper used under X86-64 PIC mode for RIP
152     /// relative displacements.
153     WrapperRIP,
154 
155     /// Copies a 64-bit value from an MMX vector to the low word
156     /// of an XMM vector, with the high word zero filled.
157     MOVQ2DQ,
158 
159     /// Copies a 64-bit value from the low word of an XMM vector
160     /// to an MMX vector.
161     MOVDQ2Q,
162 
163     /// Copies a 32-bit value from the low word of a MMX
164     /// vector to a GPR.
165     MMX_MOVD2W,
166 
167     /// Copies a GPR into the low 32-bit word of a MMX vector
168     /// and zero out the high word.
169     MMX_MOVW2D,
170 
171     /// Extract an 8-bit value from a vector and zero extend it to
172     /// i32, corresponds to X86::PEXTRB.
173     PEXTRB,
174 
175     /// Extract a 16-bit value from a vector and zero extend it to
176     /// i32, corresponds to X86::PEXTRW.
177     PEXTRW,
178 
179     /// Insert any element of a 4 x float vector into any element
180     /// of a destination 4 x float vector.
181     INSERTPS,
182 
183     /// Insert the lower 8-bits of a 32-bit value to a vector,
184     /// corresponds to X86::PINSRB.
185     PINSRB,
186 
187     /// Insert the lower 16-bits of a 32-bit value to a vector,
188     /// corresponds to X86::PINSRW.
189     PINSRW,
190 
191     /// Shuffle 16 8-bit values within a vector.
192     PSHUFB,
193 
194     /// Compute Sum of Absolute Differences.
195     PSADBW,
196     /// Compute Double Block Packed Sum-Absolute-Differences
197     DBPSADBW,
198 
199     /// Bitwise Logical AND NOT of Packed FP values.
200     ANDNP,
201 
202     /// Blend where the selector is an immediate.
203     BLENDI,
204 
205     /// Dynamic (non-constant condition) vector blend where only the sign bits
206     /// of the condition elements are used. This is used to enforce that the
207     /// condition mask is not valid for generic VSELECT optimizations. This
208     /// is also used to implement the intrinsics.
209     /// Operands are in VSELECT order: MASK, TRUE, FALSE
210     BLENDV,
211 
212     /// Combined add and sub on an FP vector.
213     ADDSUB,
214 
215     // FP vector ops with rounding mode.
216     FADD_RND,
217     FADDS,
218     FADDS_RND,
219     FSUB_RND,
220     FSUBS,
221     FSUBS_RND,
222     FMUL_RND,
223     FMULS,
224     FMULS_RND,
225     FDIV_RND,
226     FDIVS,
227     FDIVS_RND,
228     FMAX_SAE,
229     FMAXS_SAE,
230     FMIN_SAE,
231     FMINS_SAE,
232     FSQRT_RND,
233     FSQRTS,
234     FSQRTS_RND,
235 
236     // FP vector get exponent.
237     FGETEXP,
238     FGETEXP_SAE,
239     FGETEXPS,
240     FGETEXPS_SAE,
241     // Extract Normalized Mantissas.
242     VGETMANT,
243     VGETMANT_SAE,
244     VGETMANTS,
245     VGETMANTS_SAE,
246     // FP Scale.
247     SCALEF,
248     SCALEF_RND,
249     SCALEFS,
250     SCALEFS_RND,
251 
252     // Unsigned Integer average.
253     AVG,
254 
255     /// Integer horizontal add/sub.
256     HADD,
257     HSUB,
258 
259     /// Floating point horizontal add/sub.
260     FHADD,
261     FHSUB,
262 
263     // Detect Conflicts Within a Vector
264     CONFLICT,
265 
266     /// Floating point max and min.
267     FMAX,
268     FMIN,
269 
270     /// Commutative FMIN and FMAX.
271     FMAXC,
272     FMINC,
273 
274     /// Scalar intrinsic floating point max and min.
275     FMAXS,
276     FMINS,
277 
278     /// Floating point reciprocal-sqrt and reciprocal approximation.
279     /// Note that these typically require refinement
280     /// in order to obtain suitable precision.
281     FRSQRT,
282     FRCP,
283 
284     // AVX-512 reciprocal approximations with a little more precision.
285     RSQRT14,
286     RSQRT14S,
287     RCP14,
288     RCP14S,
289 
290     // Thread Local Storage.
291     TLSADDR,
292 
293     // Thread Local Storage. A call to get the start address
294     // of the TLS block for the current module.
295     TLSBASEADDR,
296 
297     // Thread Local Storage.  When calling to an OS provided
298     // thunk at the address from an earlier relocation.
299     TLSCALL,
300 
301     // Exception Handling helpers.
302     EH_RETURN,
303 
304     // SjLj exception handling setjmp.
305     EH_SJLJ_SETJMP,
306 
307     // SjLj exception handling longjmp.
308     EH_SJLJ_LONGJMP,
309 
310     // SjLj exception handling dispatch.
311     EH_SJLJ_SETUP_DISPATCH,
312 
313     /// Tail call return. See X86TargetLowering::LowerCall for
314     /// the list of operands.
315     TC_RETURN,
316 
317     // Vector move to low scalar and zero higher vector elements.
318     VZEXT_MOVL,
319 
320     // Vector integer truncate.
321     VTRUNC,
322     // Vector integer truncate with unsigned/signed saturation.
323     VTRUNCUS,
324     VTRUNCS,
325 
326     // Masked version of the above. Used when less than a 128-bit result is
327     // produced since the mask only applies to the lower elements and can't
328     // be represented by a select.
329     // SRC, PASSTHRU, MASK
330     VMTRUNC,
331     VMTRUNCUS,
332     VMTRUNCS,
333 
334     // Vector FP extend.
335     VFPEXT,
336     VFPEXT_SAE,
337     VFPEXTS,
338     VFPEXTS_SAE,
339 
340     // Vector FP round.
341     VFPROUND,
342     VFPROUND_RND,
343     VFPROUNDS,
344     VFPROUNDS_RND,
345 
346     // Masked version of above. Used for v2f64->v4f32.
347     // SRC, PASSTHRU, MASK
348     VMFPROUND,
349 
350     // 128-bit vector logical left / right shift
351     VSHLDQ,
352     VSRLDQ,
353 
354     // Vector shift elements
355     VSHL,
356     VSRL,
357     VSRA,
358 
359     // Vector variable shift
360     VSHLV,
361     VSRLV,
362     VSRAV,
363 
364     // Vector shift elements by immediate
365     VSHLI,
366     VSRLI,
367     VSRAI,
368 
369     // Shifts of mask registers.
370     KSHIFTL,
371     KSHIFTR,
372 
373     // Bit rotate by immediate
374     VROTLI,
375     VROTRI,
376 
377     // Vector packed double/float comparison.
378     CMPP,
379 
380     // Vector integer comparisons.
381     PCMPEQ,
382     PCMPGT,
383 
384     // v8i16 Horizontal minimum and position.
385     PHMINPOS,
386 
387     MULTISHIFT,
388 
389     /// Vector comparison generating mask bits for fp and
390     /// integer signed and unsigned data types.
391     CMPM,
392     // Vector mask comparison generating mask bits for FP values.
393     CMPMM,
394     // Vector mask comparison with SAE for FP values.
395     CMPMM_SAE,
396 
397     // Arithmetic operations with FLAGS results.
398     ADD,
399     SUB,
400     ADC,
401     SBB,
402     SMUL,
403     UMUL,
404     OR,
405     XOR,
406     AND,
407 
408     // Bit field extract.
409     BEXTR,
410     BEXTRI,
411 
412     // Zero High Bits Starting with Specified Bit Position.
413     BZHI,
414 
415     // Parallel extract and deposit.
416     PDEP,
417     PEXT,
418 
419     // X86-specific multiply by immediate.
420     MUL_IMM,
421 
422     // Vector sign bit extraction.
423     MOVMSK,
424 
425     // Vector bitwise comparisons.
426     PTEST,
427 
428     // Vector packed fp sign bitwise comparisons.
429     TESTP,
430 
431     // OR/AND test for masks.
432     KORTEST,
433     KTEST,
434 
435     // ADD for masks.
436     KADD,
437 
438     // Several flavors of instructions with vector shuffle behaviors.
439     // Saturated signed/unsigned packing.
440     PACKSS,
441     PACKUS,
442     // Intra-lane alignr.
443     PALIGNR,
444     // AVX512 inter-lane alignr.
445     VALIGN,
446     PSHUFD,
447     PSHUFHW,
448     PSHUFLW,
449     SHUFP,
450     // VBMI2 Concat & Shift.
451     VSHLD,
452     VSHRD,
453     VSHLDV,
454     VSHRDV,
455     // Shuffle Packed Values at 128-bit granularity.
456     SHUF128,
457     MOVDDUP,
458     MOVSHDUP,
459     MOVSLDUP,
460     MOVLHPS,
461     MOVHLPS,
462     MOVSD,
463     MOVSS,
464     MOVSH,
465     UNPCKL,
466     UNPCKH,
467     VPERMILPV,
468     VPERMILPI,
469     VPERMI,
470     VPERM2X128,
471 
472     // Variable Permute (VPERM).
473     // Res = VPERMV MaskV, V0
474     VPERMV,
475 
476     // 3-op Variable Permute (VPERMT2).
477     // Res = VPERMV3 V0, MaskV, V1
478     VPERMV3,
479 
480     // Bitwise ternary logic.
481     VPTERNLOG,
482     // Fix Up Special Packed Float32/64 values.
483     VFIXUPIMM,
484     VFIXUPIMM_SAE,
485     VFIXUPIMMS,
486     VFIXUPIMMS_SAE,
487     // Range Restriction Calculation For Packed Pairs of Float32/64 values.
488     VRANGE,
489     VRANGE_SAE,
490     VRANGES,
491     VRANGES_SAE,
492     // Reduce - Perform Reduction Transformation on scalar/packed FP.
493     VREDUCE,
494     VREDUCE_SAE,
495     VREDUCES,
496     VREDUCES_SAE,
497     // RndScale - Round FP Values To Include A Given Number Of Fraction Bits.
498     // Also used by the legacy (V)ROUND intrinsics where we mask out the
499     // scaling part of the immediate.
500     VRNDSCALE,
501     VRNDSCALE_SAE,
502     VRNDSCALES,
503     VRNDSCALES_SAE,
504     // Tests types of FP values for packed types.
505     VFPCLASS,
506     // Tests types of FP values for scalar types.
507     VFPCLASSS,
508 
509     // Broadcast (splat) scalar or element 0 of a vector. If the operand is
510     // a vector, this node may change the vector length as part of the splat.
511     VBROADCAST,
512     // Broadcast mask to vector.
513     VBROADCASTM,
514 
515     /// SSE4A Extraction and Insertion.
516     EXTRQI,
517     INSERTQI,
518 
519     // XOP arithmetic/logical shifts.
520     VPSHA,
521     VPSHL,
522     // XOP signed/unsigned integer comparisons.
523     VPCOM,
524     VPCOMU,
525     // XOP packed permute bytes.
526     VPPERM,
527     // XOP two source permutation.
528     VPERMIL2,
529 
530     // Vector multiply packed unsigned doubleword integers.
531     PMULUDQ,
532     // Vector multiply packed signed doubleword integers.
533     PMULDQ,
534     // Vector Multiply Packed Unsigned Integers with Round and Scale.
535     MULHRS,
536 
537     // Multiply and Add Packed Integers.
538     VPMADDUBSW,
539     VPMADDWD,
540 
541     // AVX512IFMA multiply and add.
542     // NOTE: These are different than the instruction and perform
543     // op0 x op1 + op2.
544     VPMADD52L,
545     VPMADD52H,
546 
547     // VNNI
548     VPDPBUSD,
549     VPDPBUSDS,
550     VPDPWSSD,
551     VPDPWSSDS,
552 
553     // FMA nodes.
554     // We use the target independent ISD::FMA for the non-inverted case.
555     FNMADD,
556     FMSUB,
557     FNMSUB,
558     FMADDSUB,
559     FMSUBADD,
560 
561     // FMA with rounding mode.
562     FMADD_RND,
563     FNMADD_RND,
564     FMSUB_RND,
565     FNMSUB_RND,
566     FMADDSUB_RND,
567     FMSUBADD_RND,
568 
569     // AVX512-FP16 complex addition and multiplication.
570     VFMADDC,
571     VFMADDC_RND,
572     VFCMADDC,
573     VFCMADDC_RND,
574 
575     VFMULC,
576     VFMULC_RND,
577     VFCMULC,
578     VFCMULC_RND,
579 
580     VFMADDCSH,
581     VFMADDCSH_RND,
582     VFCMADDCSH,
583     VFCMADDCSH_RND,
584 
585     VFMULCSH,
586     VFMULCSH_RND,
587     VFCMULCSH,
588     VFCMULCSH_RND,
589 
590     // Compress and expand.
591     COMPRESS,
592     EXPAND,
593 
594     // Bits shuffle
595     VPSHUFBITQMB,
596 
597     // Convert Signed/Unsigned Integer to Floating-Point Value with rounding mode.
598     SINT_TO_FP_RND,
599     UINT_TO_FP_RND,
600     SCALAR_SINT_TO_FP,
601     SCALAR_UINT_TO_FP,
602     SCALAR_SINT_TO_FP_RND,
603     SCALAR_UINT_TO_FP_RND,
604 
605     // Vector float/double to signed/unsigned integer.
606     CVTP2SI,
607     CVTP2UI,
608     CVTP2SI_RND,
609     CVTP2UI_RND,
610     // Scalar float/double to signed/unsigned integer.
611     CVTS2SI,
612     CVTS2UI,
613     CVTS2SI_RND,
614     CVTS2UI_RND,
615 
616     // Vector float/double to signed/unsigned integer with truncation.
617     CVTTP2SI,
618     CVTTP2UI,
619     CVTTP2SI_SAE,
620     CVTTP2UI_SAE,
621     // Scalar float/double to signed/unsigned integer with truncation.
622     CVTTS2SI,
623     CVTTS2UI,
624     CVTTS2SI_SAE,
625     CVTTS2UI_SAE,
626 
627     // Vector signed/unsigned integer to float/double.
628     CVTSI2P,
629     CVTUI2P,
630 
631     // Masked versions of above. Used for v2f64->v4f32.
632     // SRC, PASSTHRU, MASK
633     MCVTP2SI,
634     MCVTP2UI,
635     MCVTTP2SI,
636     MCVTTP2UI,
637     MCVTSI2P,
638     MCVTUI2P,
639 
640     // Vector float to bfloat16.
641     // Convert TWO packed single data to one packed BF16 data
642     CVTNE2PS2BF16,
643     // Convert packed single data to packed BF16 data
644     CVTNEPS2BF16,
645     // Masked version of above.
646     // SRC, PASSTHRU, MASK
647     MCVTNEPS2BF16,
648 
649     // Dot product of BF16 pairs accumulated into
650     // packed single precision.
651     DPBF16PS,
652 
653     // A stack checking function call. On Windows it's _chkstk call.
654     DYN_ALLOCA,
655 
656     // For allocating variable amounts of stack space when using
657     // segmented stacks. Check if the current stacklet has enough space, and
658     // falls back to heap allocation if not.
659     SEG_ALLOCA,
660 
661     // For allocating stack space when using stack clash protector.
662     // Allocation is performed by block, and each block is probed.
663     PROBED_ALLOCA,
664 
665     // Memory barriers.
666     MEMBARRIER,
667     MFENCE,
668 
669     // Get a random integer and indicate whether it is valid in CF.
670     RDRAND,
671 
672     // Get a NIST SP800-90B & C compliant random integer and
673     // indicate whether it is valid in CF.
674     RDSEED,
675 
676     // Protection keys
677     // RDPKRU - Operand 0 is chain. Operand 1 is value for ECX.
678     // WRPKRU - Operand 0 is chain. Operand 1 is value for EDX. Operand 2 is
679     // value for ECX.
680     RDPKRU,
681     WRPKRU,
682 
683     // SSE42 string comparisons.
684     // These nodes produce 3 results, index, mask, and flags. X86ISelDAGToDAG
685     // will emit one or two instructions based on which results are used. If
686     // flags and index/mask are both used, this allows us to use a single
687     // instruction since we won't have to pick an opcode for flags. Instead we
688     // can rely on the DAG to CSE everything and decide at isel.
689     PCMPISTR,
690     PCMPESTR,
691 
692     // Test if in transactional execution.
693     XTEST,
694 
695     // ERI instructions.
696     RSQRT28,
697     RSQRT28_SAE,
698     RSQRT28S,
699     RSQRT28S_SAE,
700     RCP28,
701     RCP28_SAE,
702     RCP28S,
703     RCP28S_SAE,
704     EXP2,
705     EXP2_SAE,
706 
707     // Conversions between float and half-float.
708     CVTPS2PH,
709     CVTPH2PS,
710     CVTPH2PS_SAE,
711 
712     // Masked version of above.
713     // SRC, RND, PASSTHRU, MASK
714     MCVTPS2PH,
715 
716     // Galois Field Arithmetic Instructions
717     GF2P8AFFINEINVQB,
718     GF2P8AFFINEQB,
719     GF2P8MULB,
720 
721     // LWP insert record.
722     LWPINS,
723 
724     // User level wait
725     UMWAIT,
726     TPAUSE,
727 
728     // Enqueue Stores Instructions
729     ENQCMD,
730     ENQCMDS,
731 
732     // For avx512-vp2intersect
733     VP2INTERSECT,
734 
735     // User level interrupts - testui
736     TESTUI,
737 
738     /// X86 strict FP compare instructions.
739     STRICT_FCMP = ISD::FIRST_TARGET_STRICTFP_OPCODE,
740     STRICT_FCMPS,
741 
742     // Vector packed double/float comparison.
743     STRICT_CMPP,
744 
745     /// Vector comparison generating mask bits for fp and
746     /// integer signed and unsigned data types.
747     STRICT_CMPM,
748 
749     // Vector float/double to signed/unsigned integer with truncation.
750     STRICT_CVTTP2SI,
751     STRICT_CVTTP2UI,
752 
753     // Vector FP extend.
754     STRICT_VFPEXT,
755 
756     // Vector FP round.
757     STRICT_VFPROUND,
758 
759     // RndScale - Round FP Values To Include A Given Number Of Fraction Bits.
760     // Also used by the legacy (V)ROUND intrinsics where we mask out the
761     // scaling part of the immediate.
762     STRICT_VRNDSCALE,
763 
764     // Vector signed/unsigned integer to float/double.
765     STRICT_CVTSI2P,
766     STRICT_CVTUI2P,
767 
768     // Strict FMA nodes.
769     STRICT_FNMADD,
770     STRICT_FMSUB,
771     STRICT_FNMSUB,
772 
773     // Conversions between float and half-float.
774     STRICT_CVTPS2PH,
775     STRICT_CVTPH2PS,
776 
777     // WARNING: Only add nodes here if they are strict FP nodes. Non-memory and
778     // non-strict FP nodes should be above FIRST_TARGET_STRICTFP_OPCODE.
779 
780     // Compare and swap.
781     LCMPXCHG_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE,
782     LCMPXCHG8_DAG,
783     LCMPXCHG16_DAG,
784     LCMPXCHG16_SAVE_RBX_DAG,
785 
786     /// LOCK-prefixed arithmetic read-modify-write instructions.
787     /// EFLAGS, OUTCHAIN = LADD(INCHAIN, PTR, RHS)
788     LADD,
789     LSUB,
790     LOR,
791     LXOR,
792     LAND,
793 
794     // Load, scalar_to_vector, and zero extend.
795     VZEXT_LOAD,
796 
797     // extract_vector_elt, store.
798     VEXTRACT_STORE,
799 
800     // scalar broadcast from memory.
801     VBROADCAST_LOAD,
802 
803     // subvector broadcast from memory.
804     SUBV_BROADCAST_LOAD,
805 
806     // Store FP control word into i16 memory.
807     FNSTCW16m,
808 
809     // Load FP control word from i16 memory.
810     FLDCW16m,
811 
812     /// This instruction implements FP_TO_SINT with the
813     /// integer destination in memory and a FP reg source.  This corresponds
814     /// to the X86::FIST*m instructions and the rounding mode change stuff. It
815     /// has two inputs (token chain and address) and two outputs (int value
816     /// and token chain). Memory VT specifies the type to store to.
817     FP_TO_INT_IN_MEM,
818 
819     /// This instruction implements SINT_TO_FP with the
820     /// integer source in memory and FP reg result.  This corresponds to the
821     /// X86::FILD*m instructions. It has two inputs (token chain and address)
822     /// and two outputs (FP value and token chain). The integer source type is
823     /// specified by the memory VT.
824     FILD,
825 
826     /// This instruction implements a fp->int store from FP stack
827     /// slots. This corresponds to the fist instruction. It takes a
828     /// chain operand, value to store, address, and glue. The memory VT
829     /// specifies the type to store as.
830     FIST,
831 
832     /// This instruction implements an extending load to FP stack slots.
833     /// This corresponds to the X86::FLD32m / X86::FLD64m. It takes a chain
834     /// operand, and ptr to load from. The memory VT specifies the type to
835     /// load from.
836     FLD,
837 
838     /// This instruction implements a truncating store from FP stack
839     /// slots. This corresponds to the X86::FST32m / X86::FST64m. It takes a
840     /// chain operand, value to store, address, and glue. The memory VT
841     /// specifies the type to store as.
842     FST,
843 
844     /// These instructions grab the address of the next argument
845     /// from a va_list. (reads and modifies the va_list in memory)
846     VAARG_64,
847     VAARG_X32,
848 
849     // Vector truncating store with unsigned/signed saturation
850     VTRUNCSTOREUS,
851     VTRUNCSTORES,
852     // Vector truncating masked store with unsigned/signed saturation
853     VMTRUNCSTOREUS,
854     VMTRUNCSTORES,
855 
856     // X86 specific gather and scatter
857     MGATHER,
858     MSCATTER,
859 
860     // Key locker nodes that produce flags.
861     AESENC128KL,
862     AESDEC128KL,
863     AESENC256KL,
864     AESDEC256KL,
865     AESENCWIDE128KL,
866     AESDECWIDE128KL,
867     AESENCWIDE256KL,
868     AESDECWIDE256KL,
869 
870     // Save xmm argument registers to the stack, according to %al. An operator
871     // is needed so that this can be expanded with control flow.
872     VASTART_SAVE_XMM_REGS,
873 
874     // WARNING: Do not add anything at the end unless you want the node to
875     // have memop! In fact, starting from FIRST_TARGET_MEMORY_OPCODE all
876     // opcodes will be treated as target memory ops!
877   };
878   } // end namespace X86ISD
879 
880   namespace X86 {
881     /// Current rounding mode is represented in bits 11:10 of FPSR. These
882     /// values are the same as the corresponding rounding-mode constants used
883     /// in glibc.
884     enum RoundingMode {
885       rmToNearest   = 0,        // FE_TONEAREST
886       rmDownward    = 1 << 10,  // FE_DOWNWARD
887       rmUpward      = 2 << 10,  // FE_UPWARD
888       rmTowardZero  = 3 << 10,  // FE_TOWARDZERO
889       rmMask        = 3 << 10   // Bit mask selecting rounding mode
890     };
891   } // end namespace X86
892 
893   /// Define some predicates that are used for node matching.
894   namespace X86 {
895     /// Returns true if Elt is a constant zero or floating point constant +0.0.
896     bool isZeroNode(SDValue Elt);
897 
898     /// Returns true if the given offset can fit into the
899     /// displacement field of the instruction.
900     bool isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
901                                       bool hasSymbolicDisplacement);
902 
903     /// Determines whether the callee is required to pop its
904     /// own arguments. Callee pop is necessary to support tail calls.
905     bool isCalleePop(CallingConv::ID CallingConv,
906                      bool is64Bit, bool IsVarArg, bool GuaranteeTCO);
907 
908     /// If Op is a constant whose elements are all the same constant or
909     /// undefined, return true and return the constant value in \p SplatVal.
910     /// If we have undef bits that don't cover an entire element, we treat these
911     /// as zero if AllowPartialUndefs is set, else we fail and return false.
912     bool isConstantSplat(SDValue Op, APInt &SplatVal,
913                          bool AllowPartialUndefs = true);
914 
915     /// Check if Op is a load operation that could be folded into some other x86
916     /// instruction as a memory operand. Example: vpaddd (%rdi), %xmm0, %xmm0.
917     bool mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget,
918                      bool AssumeSingleUse = false);
919 
920     /// Check if Op is a load operation that could be folded into a vector splat
921     /// instruction as a memory operand. Example: vbroadcastss 16(%rdi), %xmm2.
922     bool mayFoldLoadIntoBroadcastFromMem(SDValue Op, MVT EltVT,
923                                          const X86Subtarget &Subtarget,
924                                          bool AssumeSingleUse = false);
925 
926     /// Check if Op is a value that could be used to fold a store into some
927     /// other x86 instruction as a memory operand. Ex: pextrb $0, %xmm0, (%rdi).
928     bool mayFoldIntoStore(SDValue Op);
929 
930     /// Check if Op is an operation that could be folded into a zero extend x86
931     /// instruction.
932     bool mayFoldIntoZeroExtend(SDValue Op);
933   } // end namespace X86
934 
935   //===--------------------------------------------------------------------===//
936   //  X86 Implementation of the TargetLowering interface
937   class X86TargetLowering final : public TargetLowering {
938   public:
939     explicit X86TargetLowering(const X86TargetMachine &TM,
940                                const X86Subtarget &STI);
941 
942     unsigned getJumpTableEncoding() const override;
943     bool useSoftFloat() const override;
944 
945     void markLibCallAttributes(MachineFunction *MF, unsigned CC,
946                                ArgListTy &Args) const override;
947 
948     MVT getScalarShiftAmountTy(const DataLayout &, EVT VT) const override {
949       return MVT::i8; // Always i8; neither the DataLayout nor VT is consulted.
950     }
951 
952     const MCExpr *
953     LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
954                               const MachineBasicBlock *MBB, unsigned uid,
955                               MCContext &Ctx) const override;
956 
957     /// Returns relocation base for the given PIC jumptable.
958     SDValue getPICJumpTableRelocBase(SDValue Table,
959                                      SelectionDAG &DAG) const override;
960     const MCExpr *
961     getPICJumpTableRelocBaseExpr(const MachineFunction *MF,
962                                  unsigned JTI, MCContext &Ctx) const override;
963 
964     /// Return the desired alignment for ByVal aggregate
965     /// function arguments in the caller parameter area. For X86, aggregates
966     /// that contain SSE vectors are placed at 16-byte boundaries while the rest
967     /// are at 4-byte boundaries.
968     uint64_t getByValTypeAlignment(Type *Ty,
969                                    const DataLayout &DL) const override;
970 
971     EVT getOptimalMemOpType(const MemOp &Op,
972                             const AttributeList &FuncAttributes) const override;
973 
974     /// Returns true if it's safe to use load / store of the
975     /// specified type to expand memcpy / memset inline. This is mostly true
976     /// for all types except for some special cases. For example, on X86
977     /// targets without SSE2 f64 load / store are done with fldl / fstpl which
978     /// also does type conversion. Note the specified type doesn't have to be
979     /// legal as the hook is used before type legalization.
980     bool isSafeMemOpType(MVT VT) const override;
981 
982     /// Returns true if the target allows unaligned memory accesses of the
983     /// specified type. Returns whether it is "fast" in the last argument.
984     bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, Align Alignment,
985                                         MachineMemOperand::Flags Flags,
986                                         bool *Fast) const override;
987 
988     /// Provide custom lowering hooks for some operations.
989     ///
990     SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
991 
992     /// Replace the results of a node that has an illegal result
993     /// type with new values built out of custom code.
994     ///
995     void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
996                             SelectionDAG &DAG) const override;
997 
998     SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
999 
1000     /// Return true if the target has native support for
1001     /// the specified value type and it is 'desirable' to use the type for the
1002     /// given node type. e.g. On x86 i16 is legal, but undesirable since i16
1003     /// instruction encodings are longer and some i16 instructions are slow.
1004     bool isTypeDesirableForOp(unsigned Opc, EVT VT) const override;
1005 
    /// Return true if it is beneficial for the DAG combiner to promote the
    /// specified node to a different type; when it is, the desired promotion
    /// type is returned by reference in \p PVT. e.g. On x86 i16 is legal, but
    /// undesirable since i16 instruction encodings are longer and some i16
    /// instructions are slow.
1010     bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const override;
1011 
    /// Return the newly negated expression if the negation is not expensive,
    /// and set \p Cost to indicate whether the negation is cheaper or neutral.
1015     SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG,
1016                                  bool LegalOperations, bool ForCodeSize,
1017                                  NegatibleCost &Cost,
1018                                  unsigned Depth) const override;
1019 
1020     MachineBasicBlock *
1021     EmitInstrWithCustomInserter(MachineInstr &MI,
1022                                 MachineBasicBlock *MBB) const override;
1023 
1024     /// This method returns the name of a target specific DAG node.
1025     const char *getTargetNodeName(unsigned Opcode) const override;
1026 
1027     /// Do not merge vector stores after legalization because that may conflict
1028     /// with x86-specific store splitting optimizations.
1029     bool mergeStoresAfterLegalization(EVT MemVT) const override {
1030       return !MemVT.isVector();
1031     }
1032 
1033     bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
1034                           const MachineFunction &MF) const override;
1035 
1036     bool isCheapToSpeculateCttz() const override;
1037 
1038     bool isCheapToSpeculateCtlz() const override;
1039 
1040     bool isCtlzFast() const override;
1041 
1042     bool hasBitPreservingFPLogic(EVT VT) const override {
1043       return VT == MVT::f32 || VT == MVT::f64 || VT.isVector() ||
1044              (VT == MVT::f16 && X86ScalarSSEf16);
1045     }
1046 
1047     bool isMultiStoresCheaperThanBitsMerge(EVT LTy, EVT HTy) const override {
1048       // If the pair to store is a mixture of float and int values, we will
1049       // save two bitwise instructions and one float-to-int instruction and
1050       // increase one store instruction. There is potentially a more
1051       // significant benefit because it avoids the float->int domain switch
1052       // for input value. So It is more likely a win.
1053       if ((LTy.isFloatingPoint() && HTy.isInteger()) ||
1054           (LTy.isInteger() && HTy.isFloatingPoint()))
1055         return true;
1056       // If the pair only contains int values, we will save two bitwise
1057       // instructions and increase one store instruction (costing one more
1058       // store buffer). Since the benefit is more blurred so we leave
1059       // such pair out until we get testcase to prove it is a win.
1060       return false;
1061     }
1062 
1063     bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override;
1064 
1065     bool hasAndNotCompare(SDValue Y) const override;
1066 
1067     bool hasAndNot(SDValue Y) const override;
1068 
1069     bool hasBitTest(SDValue X, SDValue Y) const override;
1070 
1071     bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
1072         SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
1073         unsigned OldShiftOpcode, unsigned NewShiftOpcode,
1074         SelectionDAG &DAG) const override;
1075 
1076     bool shouldFoldConstantShiftPairToMask(const SDNode *N,
1077                                            CombineLevel Level) const override;
1078 
1079     bool shouldFoldMaskToVariableShiftPair(SDValue Y) const override;
1080 
1081     bool
1082     shouldTransformSignedTruncationCheck(EVT XVT,
1083                                          unsigned KeptBits) const override {
1084       // For vectors, we don't have a preference..
1085       if (XVT.isVector())
1086         return false;
1087 
1088       auto VTIsOk = [](EVT VT) -> bool {
1089         return VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 ||
1090                VT == MVT::i64;
1091       };
1092 
1093       // We are ok with KeptBitsVT being byte/word/dword, what MOVS supports.
1094       // XVT will be larger than KeptBitsVT.
1095       MVT KeptBitsVT = MVT::getIntegerVT(KeptBits);
1096       return VTIsOk(XVT) && VTIsOk(KeptBitsVT);
1097     }
1098 
1099     bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override;
1100 
1101     bool shouldSplatInsEltVarIndex(EVT VT) const override;
1102 
1103     bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override {
1104       // Converting to sat variants holds little benefit on X86 as we will just
1105       // need to saturate the value back using fp arithmatic.
1106       return Op != ISD::FP_TO_UINT_SAT && isOperationLegalOrCustom(Op, VT);
1107     }
1108 
1109     bool convertSetCCLogicToBitwiseLogic(EVT VT) const override {
1110       return VT.isScalarInteger();
1111     }
1112 
1113     /// Vector-sized comparisons are fast using PCMPEQ + PMOVMSK or PTEST.
1114     MVT hasFastEqualityCompare(unsigned NumBits) const override;
1115 
1116     /// Return the value type to use for ISD::SETCC.
1117     EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
1118                            EVT VT) const override;
1119 
1120     bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits,
1121                                       const APInt &DemandedElts,
1122                                       TargetLoweringOpt &TLO) const override;
1123 
    /// Determine which bits of \p Op are known to be either zero or one and
    /// return them in \p Known.
1126     void computeKnownBitsForTargetNode(const SDValue Op,
1127                                        KnownBits &Known,
1128                                        const APInt &DemandedElts,
1129                                        const SelectionDAG &DAG,
1130                                        unsigned Depth = 0) const override;
1131 
1132     /// Determine the number of bits in the operation that are sign bits.
1133     unsigned ComputeNumSignBitsForTargetNode(SDValue Op,
1134                                              const APInt &DemandedElts,
1135                                              const SelectionDAG &DAG,
1136                                              unsigned Depth) const override;
1137 
1138     bool SimplifyDemandedVectorEltsForTargetNode(SDValue Op,
1139                                                  const APInt &DemandedElts,
1140                                                  APInt &KnownUndef,
1141                                                  APInt &KnownZero,
1142                                                  TargetLoweringOpt &TLO,
1143                                                  unsigned Depth) const override;
1144 
1145     bool SimplifyDemandedVectorEltsForTargetShuffle(SDValue Op,
1146                                                     const APInt &DemandedElts,
1147                                                     unsigned MaskIndex,
1148                                                     TargetLoweringOpt &TLO,
1149                                                     unsigned Depth) const;
1150 
1151     bool SimplifyDemandedBitsForTargetNode(SDValue Op,
1152                                            const APInt &DemandedBits,
1153                                            const APInt &DemandedElts,
1154                                            KnownBits &Known,
1155                                            TargetLoweringOpt &TLO,
1156                                            unsigned Depth) const override;
1157 
1158     SDValue SimplifyMultipleUseDemandedBitsForTargetNode(
1159         SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
1160         SelectionDAG &DAG, unsigned Depth) const override;
1161 
1162     bool isSplatValueForTargetNode(SDValue Op, const APInt &DemandedElts,
1163                                    APInt &UndefElts,
1164                                    unsigned Depth) const override;
1165 
1166     const Constant *getTargetConstantFromLoad(LoadSDNode *LD) const override;
1167 
1168     SDValue unwrapAddress(SDValue N) const override;
1169 
1170     SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const;
1171 
1172     bool ExpandInlineAsm(CallInst *CI) const override;
1173 
1174     ConstraintType getConstraintType(StringRef Constraint) const override;
1175 
1176     /// Examine constraint string and operand type and determine a weight value.
1177     /// The operand object must already have been set up with the operand type.
1178     ConstraintWeight
1179       getSingleConstraintMatchWeight(AsmOperandInfo &info,
1180                                      const char *constraint) const override;
1181 
1182     const char *LowerXConstraint(EVT ConstraintVT) const override;
1183 
1184     /// Lower the specified operand into the Ops vector. If it is invalid, don't
1185     /// add anything to Ops. If hasMemory is true it means one of the asm
1186     /// constraint of the inline asm instruction being processed is 'm'.
1187     void LowerAsmOperandForConstraint(SDValue Op,
1188                                       std::string &Constraint,
1189                                       std::vector<SDValue> &Ops,
1190                                       SelectionDAG &DAG) const override;
1191 
1192     unsigned
1193     getInlineAsmMemConstraint(StringRef ConstraintCode) const override {
1194       if (ConstraintCode == "v")
1195         return InlineAsm::Constraint_v;
1196       return TargetLowering::getInlineAsmMemConstraint(ConstraintCode);
1197     }
1198 
1199     /// Handle Lowering flag assembly outputs.
1200     SDValue LowerAsmOutputForConstraint(SDValue &Chain, SDValue &Flag,
1201                                         const SDLoc &DL,
1202                                         const AsmOperandInfo &Constraint,
1203                                         SelectionDAG &DAG) const override;
1204 
1205     /// Given a physical register constraint
1206     /// (e.g. {edx}), return the register number and the register class for the
1207     /// register.  This should only be used for C_Register constraints.  On
1208     /// error, this returns a register number of 0.
1209     std::pair<unsigned, const TargetRegisterClass *>
1210     getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
1211                                  StringRef Constraint, MVT VT) const override;
1212 
1213     /// Return true if the addressing mode represented
1214     /// by AM is legal for this target, for a load/store of the specified type.
1215     bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM,
1216                                Type *Ty, unsigned AS,
1217                                Instruction *I = nullptr) const override;
1218 
1219     /// Return true if the specified immediate is legal
1220     /// icmp immediate, that is the target has icmp instructions which can
1221     /// compare a register against the immediate without having to materialize
1222     /// the immediate into a register.
1223     bool isLegalICmpImmediate(int64_t Imm) const override;
1224 
1225     /// Return true if the specified immediate is legal
1226     /// add immediate, that is the target has add instructions which can
1227     /// add a register and the immediate without having to materialize
1228     /// the immediate into a register.
1229     bool isLegalAddImmediate(int64_t Imm) const override;
1230 
1231     bool isLegalStoreImmediate(int64_t Imm) const override;
1232 
1233     /// Return the cost of the scaling factor used in the addressing
1234     /// mode represented by AM for this target, for a load/store
1235     /// of the specified type.
1236     /// If the AM is supported, the return value must be >= 0.
1237     /// If the AM is not supported, it returns a negative value.
1238     InstructionCost getScalingFactorCost(const DataLayout &DL,
1239                                          const AddrMode &AM, Type *Ty,
1240                                          unsigned AS) const override;
1241 
1242     /// This is used to enable splatted operand transforms for vector shifts
1243     /// and vector funnel shifts.
1244     bool isVectorShiftByScalarCheap(Type *Ty) const override;
1245 
1246     /// Add x86-specific opcodes to the default list.
1247     bool isBinOp(unsigned Opcode) const override;
1248 
1249     /// Returns true if the opcode is a commutative binary operation.
1250     bool isCommutativeBinOp(unsigned Opcode) const override;
1251 
1252     /// Return true if it's free to truncate a value of
1253     /// type Ty1 to type Ty2. e.g. On x86 it's free to truncate a i32 value in
1254     /// register EAX to i16 by referencing its sub-register AX.
1255     bool isTruncateFree(Type *Ty1, Type *Ty2) const override;
1256     bool isTruncateFree(EVT VT1, EVT VT2) const override;
1257 
1258     bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override;
1259 
    /// Return true if any actual instruction that defines a
    /// value of type Ty1 implicitly zero-extends the value to Ty2 in the
    /// result register. This does not necessarily include registers defined in
    /// unknown ways, such as incoming arguments, or copies from unknown
    /// virtual registers. Also, if isTruncateFree(Ty2, Ty1) is true, this
    /// does not necessarily apply to truncate instructions. e.g. on x86-64,
    /// all instructions that define 32-bit values implicitly zero-extend the
    /// result out to 64 bits.
1268     bool isZExtFree(Type *Ty1, Type *Ty2) const override;
1269     bool isZExtFree(EVT VT1, EVT VT2) const override;
1270     bool isZExtFree(SDValue Val, EVT VT2) const override;
1271 
1272     bool shouldSinkOperands(Instruction *I,
1273                             SmallVectorImpl<Use *> &Ops) const override;
1274     bool shouldConvertPhiType(Type *From, Type *To) const override;
1275 
1276     /// Return true if folding a vector load into ExtVal (a sign, zero, or any
1277     /// extend node) is profitable.
1278     bool isVectorLoadExtDesirable(SDValue) const override;
1279 
1280     /// Return true if an FMA operation is faster than a pair of fmul and fadd
1281     /// instructions. fmuladd intrinsics will be expanded to FMAs when this
1282     /// method returns true, otherwise fmuladd is expanded to fmul + fadd.
1283     bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
1284                                     EVT VT) const override;
1285 
1286     /// Return true if it's profitable to narrow
1287     /// operations of type VT1 to VT2. e.g. on x86, it's profitable to narrow
1288     /// from i32 to i8 but not from i32 to i16.
1289     bool isNarrowingProfitable(EVT VT1, EVT VT2) const override;
1290 
1291     bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode,
1292                                               EVT VT) const override;
1293 
1294     /// Given an intrinsic, checks if on the target the intrinsic will need to map
1295     /// to a MemIntrinsicNode (touches memory). If this is the case, it returns
1296     /// true and stores the intrinsic information into the IntrinsicInfo that was
1297     /// passed to the function.
1298     bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I,
1299                             MachineFunction &MF,
1300                             unsigned Intrinsic) const override;
1301 
1302     /// Returns true if the target can instruction select the
1303     /// specified FP immediate natively. If false, the legalizer will
1304     /// materialize the FP immediate as a load from a constant pool.
1305     bool isFPImmLegal(const APFloat &Imm, EVT VT,
1306                       bool ForCodeSize) const override;
1307 
1308     /// Targets can use this to indicate that they only support *some*
1309     /// VECTOR_SHUFFLE operations, those with specific masks. By default, if a
1310     /// target supports the VECTOR_SHUFFLE node, all mask values are assumed to
1311     /// be legal.
1312     bool isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const override;
1313 
1314     /// Similar to isShuffleMaskLegal. Targets can use this to indicate if there
1315     /// is a suitable VECTOR_SHUFFLE that can be used to replace a VAND with a
1316     /// constant pool entry.
1317     bool isVectorClearMaskLegal(ArrayRef<int> Mask, EVT VT) const override;
1318 
1319     /// Returns true if lowering to a jump table is allowed.
1320     bool areJTsAllowed(const Function *Fn) const override;
1321 
1322     /// If true, then instruction selection should
1323     /// seek to shrink the FP constant of the specified type to a smaller type
1324     /// in order to save space and / or reduce runtime.
1325     bool ShouldShrinkFPConstant(EVT VT) const override {
1326       // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
1327       // expensive than a straight movsd. On the other hand, it's important to
1328       // shrink long double fp constant since fldt is very slow.
1329       return !X86ScalarSSEf64 || VT == MVT::f80;
1330     }
1331 
1332     /// Return true if we believe it is correct and profitable to reduce the
1333     /// load node to a smaller type.
1334     bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy,
1335                                EVT NewVT) const override;
1336 
1337     /// Return true if the specified scalar FP type is computed in an SSE
1338     /// register, not on the X87 floating point stack.
1339     bool isScalarFPTypeInSSEReg(EVT VT) const {
1340       return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is when SSE2
1341              (VT == MVT::f32 && X86ScalarSSEf32) || // f32 is when SSE1
1342              (VT == MVT::f16 && X86ScalarSSEf16);   // f16 is when AVX512FP16
1343     }
1344 
1345     /// Returns true if it is beneficial to convert a load of a constant
1346     /// to just the constant itself.
1347     bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
1348                                            Type *Ty) const override;
1349 
1350     bool reduceSelectOfFPConstantLoads(EVT CmpOpVT) const override;
1351 
1352     bool convertSelectOfConstantsToMath(EVT VT) const override;
1353 
1354     bool decomposeMulByConstant(LLVMContext &Context, EVT VT,
1355                                 SDValue C) const override;
1356 
1357     /// Return true if EXTRACT_SUBVECTOR is cheap for this result type
1358     /// with this index.
1359     bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
1360                                  unsigned Index) const override;
1361 
1362     /// Scalar ops always have equal or better analysis/performance/power than
1363     /// the vector equivalent, so this always makes sense if the scalar op is
1364     /// supported.
1365     bool shouldScalarizeBinop(SDValue) const override;
1366 
1367     /// Extract of a scalar FP value from index 0 of a vector is free.
1368     bool isExtractVecEltCheap(EVT VT, unsigned Index) const override {
1369       EVT EltVT = VT.getScalarType();
1370       return (EltVT == MVT::f32 || EltVT == MVT::f64) && Index == 0;
1371     }
1372 
1373     /// Overflow nodes should get combined/lowered to optimal instructions
1374     /// (they should allow eliminating explicit compares by getting flags from
1375     /// math ops).
1376     bool shouldFormOverflowOp(unsigned Opcode, EVT VT,
1377                               bool MathUsed) const override;
1378 
1379     bool storeOfVectorConstantIsCheap(EVT MemVT, unsigned NumElem,
1380                                       unsigned AddrSpace) const override {
1381       // If we can replace more than 2 scalar stores, there will be a reduction
1382       // in instructions even after we add a vector constant load.
1383       return NumElem > 2;
1384     }
1385 
1386     bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
1387                                  const SelectionDAG &DAG,
1388                                  const MachineMemOperand &MMO) const override;
1389 
1390     /// Intel processors have a unified instruction and data cache
1391     const char * getClearCacheBuiltinName() const override {
1392       return nullptr; // nothing to do, move along.
1393     }
1394 
1395     Register getRegisterByName(const char* RegName, LLT VT,
1396                                const MachineFunction &MF) const override;
1397 
1398     /// If a physical register, this returns the register that receives the
1399     /// exception address on entry to an EH pad.
1400     Register
1401     getExceptionPointerRegister(const Constant *PersonalityFn) const override;
1402 
1403     /// If a physical register, this returns the register that receives the
1404     /// exception typeid on entry to a landing pad.
1405     Register
1406     getExceptionSelectorRegister(const Constant *PersonalityFn) const override;
1407 
1408     virtual bool needsFixedCatchObjects() const override;
1409 
1410     /// This method returns a target specific FastISel object,
1411     /// or null if the target does not support "fast" ISel.
1412     FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
1413                              const TargetLibraryInfo *libInfo) const override;
1414 
1415     /// If the target has a standard location for the stack protector cookie,
1416     /// returns the address of that location. Otherwise, returns nullptr.
1417     Value *getIRStackGuard(IRBuilderBase &IRB) const override;
1418 
1419     bool useLoadStackGuardNode() const override;
1420     bool useStackGuardXorFP() const override;
1421     void insertSSPDeclarations(Module &M) const override;
1422     Value *getSDagStackGuard(const Module &M) const override;
1423     Function *getSSPStackGuardCheck(const Module &M) const override;
1424     SDValue emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
1425                                 const SDLoc &DL) const override;
1426 
1427 
    /// Return the location of the SafeStack pointer for targets that store it
    /// at a fixed offset in some non-standard address space, with the address
    /// space and offset populated as appropriate.
1431     Value *getSafeStackPointerLocation(IRBuilderBase &IRB) const override;
1432 
1433     std::pair<SDValue, SDValue> BuildFILD(EVT DstVT, EVT SrcVT, const SDLoc &DL,
1434                                           SDValue Chain, SDValue Pointer,
1435                                           MachinePointerInfo PtrInfo,
1436                                           Align Alignment,
1437                                           SelectionDAG &DAG) const;
1438 
1439     /// Customize the preferred legalization strategy for certain types.
1440     LegalizeTypeAction getPreferredVectorAction(MVT VT) const override;
1441 
1442     bool softPromoteHalfType() const override { return true; }
1443 
1444     MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC,
1445                                       EVT VT) const override;
1446 
1447     unsigned getNumRegistersForCallingConv(LLVMContext &Context,
1448                                            CallingConv::ID CC,
1449                                            EVT VT) const override;
1450 
1451     unsigned getVectorTypeBreakdownForCallingConv(
1452         LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
1453         unsigned &NumIntermediates, MVT &RegisterVT) const override;
1454 
1455     bool isIntDivCheap(EVT VT, AttributeList Attr) const override;
1456 
1457     bool supportSwiftError() const override;
1458 
1459     bool hasStackProbeSymbol(MachineFunction &MF) const override;
1460     bool hasInlineStackProbe(MachineFunction &MF) const override;
1461     StringRef getStackProbeSymbolName(MachineFunction &MF) const override;
1462 
1463     unsigned getStackProbeSize(MachineFunction &MF) const;
1464 
1465     bool hasVectorBlend() const override { return true; }
1466 
1467     unsigned getMaxSupportedInterleaveFactor() const override { return 4; }
1468 
1469     /// Lower interleaved load(s) into target specific
1470     /// instructions/intrinsics.
1471     bool lowerInterleavedLoad(LoadInst *LI,
1472                               ArrayRef<ShuffleVectorInst *> Shuffles,
1473                               ArrayRef<unsigned> Indices,
1474                               unsigned Factor) const override;
1475 
1476     /// Lower interleaved store(s) into target specific
1477     /// instructions/intrinsics.
1478     bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
1479                                unsigned Factor) const override;
1480 
1481     SDValue expandIndirectJTBranch(const SDLoc& dl, SDValue Value,
1482                                    SDValue Addr, SelectionDAG &DAG)
1483                                    const override;
1484 
1485     Align getPrefLoopAlignment(MachineLoop *ML) const override;
1486 
1487   protected:
1488     std::pair<const TargetRegisterClass *, uint8_t>
1489     findRepresentativeClass(const TargetRegisterInfo *TRI,
1490                             MVT VT) const override;
1491 
1492   private:
1493     /// Keep a reference to the X86Subtarget around so that we can
1494     /// make the right decision when generating code for different targets.
1495     const X86Subtarget &Subtarget;
1496 
    /// Select between SSE or x87 floating point ops.
    /// When SSE is available, use it for f32 operations.
    /// When SSE2 is available, use it for f64 operations.
    /// When AVX512FP16 is available, use it for f16 operations.
1500     bool X86ScalarSSEf32;
1501     bool X86ScalarSSEf64;
1502     bool X86ScalarSSEf16;
1503 
1504     /// A list of legal FP immediates.
1505     std::vector<APFloat> LegalFPImmediates;
1506 
1507     /// Indicate that this x86 target can instruction
1508     /// select the specified FP immediate natively.
1509     void addLegalFPImmediate(const APFloat& Imm) {
1510       LegalFPImmediates.push_back(Imm);
1511     }
1512 
1513     SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
1514                             CallingConv::ID CallConv, bool isVarArg,
1515                             const SmallVectorImpl<ISD::InputArg> &Ins,
1516                             const SDLoc &dl, SelectionDAG &DAG,
1517                             SmallVectorImpl<SDValue> &InVals,
1518                             uint32_t *RegMask) const;
1519     SDValue LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
1520                              const SmallVectorImpl<ISD::InputArg> &ArgInfo,
1521                              const SDLoc &dl, SelectionDAG &DAG,
1522                              const CCValAssign &VA, MachineFrameInfo &MFI,
1523                              unsigned i) const;
1524     SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg,
1525                              const SDLoc &dl, SelectionDAG &DAG,
1526                              const CCValAssign &VA,
1527                              ISD::ArgFlagsTy Flags, bool isByval) const;
1528 
1529     // Call lowering helpers.
1530 
1531     /// Check whether the call is eligible for tail call optimization. Targets
1532     /// that want to do tail call optimization should implement this function.
1533     bool IsEligibleForTailCallOptimization(
1534         SDValue Callee, CallingConv::ID CalleeCC, bool IsCalleeStackStructRet,
1535         bool isVarArg, Type *RetTy, const SmallVectorImpl<ISD::OutputArg> &Outs,
1536         const SmallVectorImpl<SDValue> &OutVals,
1537         const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const;
1538     SDValue EmitTailCallLoadRetAddr(SelectionDAG &DAG, SDValue &OutRetAddr,
1539                                     SDValue Chain, bool IsTailCall,
1540                                     bool Is64Bit, int FPDiff,
1541                                     const SDLoc &dl) const;
1542 
1543     unsigned GetAlignedArgumentStackSize(unsigned StackSize,
1544                                          SelectionDAG &DAG) const;
1545 
1546     unsigned getAddressSpace() const;
1547 
1548     SDValue FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned,
1549                             SDValue &Chain) const;
1550     SDValue LRINT_LLRINTHelper(SDNode *N, SelectionDAG &DAG) const;
1551 
1552     SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
1553     SDValue LowerVSELECT(SDValue Op, SelectionDAG &DAG) const;
1554     SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
1555     SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
1556 
1557     unsigned getGlobalWrapperKind(const GlobalValue *GV = nullptr,
1558                                   const unsigned char OpFlags = 0) const;
1559     SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
1560     SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
1561     SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
1562     SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
1563     SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const;
1564 
1565     /// Creates target global address or external symbol nodes for calls or
1566     /// other uses.
1567     SDValue LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
1568                                   bool ForCall) const;
1569 
1570     SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
1571     SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
1572     SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
1573     SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
1574     SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const;
1575     SDValue LowerLRINT_LLRINT(SDValue Op, SelectionDAG &DAG) const;
1576     SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
1577     SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const;
1578     SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
1579     SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
1580     SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
1581     SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
1582     SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
1583     SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const;
1584     SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
1585     SDValue LowerADDROFRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
1586     SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
1587     SDValue LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const;
1588     SDValue LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const;
1589     SDValue lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const;
1590     SDValue lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const;
1591     SDValue lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, SelectionDAG &DAG) const;
1592     SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
1593     SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const;
1594     SDValue LowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const;
1595     SDValue LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const;
1596     SDValue LowerWin64_FP_TO_INT128(SDValue Op, SelectionDAG &DAG,
1597                                     SDValue &Chain) const;
1598     SDValue LowerWin64_INT128_TO_FP(SDValue Op, SelectionDAG &DAG) const;
1599     SDValue LowerGC_TRANSITION(SDValue Op, SelectionDAG &DAG) const;
1600     SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
1601     SDValue lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const;
1602     SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
1603     SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
1604 
    // Overrides of the base-class hooks for lowering a function's incoming
    // arguments, outgoing calls and returns.
    // NOTE(review): the notes below are read off the signatures; the base
    // class defines the authoritative contract.

    // Takes the incoming chain, the calling convention, the vararg flag and
    // the incoming argument descriptions (Ins). InVals is a non-const
    // reference, so it acts as an output list.
    SDValue
    LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
                         const SmallVectorImpl<ISD::InputArg> &Ins,
                         const SDLoc &dl, SelectionDAG &DAG,
                         SmallVectorImpl<SDValue> &InVals) const override;
    // Takes the call description (CLI); InVals is again a non-const output
    // list.
    SDValue LowerCall(CallLoweringInfo &CLI,
                      SmallVectorImpl<SDValue> &InVals) const override;

    // Takes the outgoing value descriptions (Outs) together with the values
    // themselves (OutVals); both are read-only here.
    SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
                        const SmallVectorImpl<ISD::OutputArg> &Outs,
                        const SmallVectorImpl<SDValue> &OutVals,
                        const SDLoc &dl, SelectionDAG &DAG) const override;
1617 
1618     bool supportSplitCSR(MachineFunction *MF) const override {
1619       return MF->getFunction().getCallingConv() == CallingConv::CXX_FAST_TLS &&
1620           MF->getFunction().hasFnAttribute(Attribute::NoUnwind);
1621     }
    // Companion hooks to supportSplitCSR(); both override base-class virtuals
    // and are defined outside this header.
    void initializeSplitCSR(MachineBasicBlock *Entry) const override;
    // Receives the entry block together with the list of exit blocks.
    // NOTE(review): presumably inserts the callee-saved register copies at
    // those blocks -- confirm in the definition.
    void insertCopiesSplitCSR(
      MachineBasicBlock *Entry,
      const SmallVectorImpl<MachineBasicBlock *> &Exits) const override;
1626 
    // Overrides of base-class query hooks related to returns and tail calls.
    // NOTE(review): the one-line summaries below are read off the hook names
    // and signatures; the base class defines the authoritative contract.

    // Query about node N; Chain is a non-const reference the callee can
    // update.
    bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override;

    // Query about the call instruction CI.
    bool mayBeEmittedAsTailCall(const CallInst *CI) const override;

    // Yields an EVT for a return value of type VT, given the extension node
    // kind ExtendKind.
    EVT getTypeForExtReturn(LLVMContext &Context, EVT VT,
                            ISD::NodeType ExtendKind) const override;

    // Query over the outgoing value descriptions (Outs) for the given calling
    // convention and vararg flag.
    bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
                        bool isVarArg,
                        const SmallVectorImpl<ISD::OutputArg> &Outs,
                        LLVMContext &Context) const override;

    // Yields a pointer to MCPhysReg for calling convention CC.
    // NOTE(review): presumably the start of a terminated register list --
    // confirm how the end of the list is marked.
    const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override;
1640 
    // Overrides of the base-class hooks that decide how atomic IR
    // instructions are handled. The load and RMW variants answer with an
    // AtomicExpansionKind, the store variant with a plain bool.
    TargetLoweringBase::AtomicExpansionKind
    shouldExpandAtomicLoadInIR(LoadInst *LI) const override;
    bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
    TargetLoweringBase::AtomicExpansionKind
    shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;

    // Takes an atomic RMW instruction and yields a LoadInst pointer.
    // NOTE(review): presumably a null result means no rewrite was done --
    // confirm against the base-class contract.
    LoadInst *
    lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override;

    // Queries on a specific atomic store / load instruction.
    // NOTE(review): judging by the names, these choose whether the atomic
    // access is lowered as an ordinary StoreSDNode / LoadSDNode -- confirm.
    bool lowerAtomicStoreAsStoreSDNode(const StoreInst &SI) const override;
    bool lowerAtomicLoadAsLoadSDNode(const LoadInst &LI) const override;

    // Non-virtual helper keyed on the memory type of an atomic operation.
    // NOTE(review): presumably reports whether a cmpxchg8b/cmpxchg16b style
    // instruction is required for MemType -- confirm in the definition.
    bool needsCmpXchgNb(Type *MemType) const;
1654 
    // Helper taking an instruction, its block, the SjLj dispatch block and an
    // integer FI.
    // NOTE(review): FI is presumably a frame index, and the helper presumably
    // prepares the entry block for SjLj exception handling -- confirm in the
    // definition.
    void SetupEntryBlockForSjLj(MachineInstr &MI, MachineBasicBlock *MBB,
                                MachineBasicBlock *DispatchBB, int FI) const;

    // Utility function to emit the low-level va_arg code for X86-64.
    // Takes the instruction being expanded and its containing block, and
    // returns a MachineBasicBlock pointer.
    MachineBasicBlock *
    EmitVAARGWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const;
1661 
    /// Emit the lowered form of a pair of select instructions, \p MI1 and
    /// \p MI2, located in block \p BB. Returns a MachineBasicBlock pointer.
    /// NOTE(review): presumably MI2 consumes the result of MI1 (a "cascaded"
    /// select) -- confirm in the definition.
    MachineBasicBlock *EmitLoweredCascadedSelect(MachineInstr &MI1,
                                                 MachineInstr &MI2,
                                                 MachineBasicBlock *BB) const;
1666 
    // Machine-level expansion helpers. Each receives the instruction to
    // expand together with its containing block; all but
    // emitSetJmpShadowStackFix return a MachineBasicBlock pointer.
    // NOTE(review): the returned block is presumably the one in which
    // emission continues after the expansion -- confirm in the definitions.
    MachineBasicBlock *EmitLoweredSelect(MachineInstr &I,
                                         MachineBasicBlock *BB) const;

    MachineBasicBlock *EmitLoweredCatchRet(MachineInstr &MI,
                                           MachineBasicBlock *BB) const;

    MachineBasicBlock *EmitLoweredSegAlloca(MachineInstr &MI,
                                            MachineBasicBlock *BB) const;

    MachineBasicBlock *EmitLoweredProbedAlloca(MachineInstr &MI,
                                               MachineBasicBlock *BB) const;

    MachineBasicBlock *EmitLoweredTLSAddr(MachineInstr &MI,
                                          MachineBasicBlock *BB) const;

    MachineBasicBlock *EmitLoweredTLSCall(MachineInstr &MI,
                                          MachineBasicBlock *BB) const;

    MachineBasicBlock *EmitLoweredIndirectThunk(MachineInstr &MI,
                                                MachineBasicBlock *BB) const;

    // SjLj (setjmp/longjmp) helpers, going by their names.
    MachineBasicBlock *emitEHSjLjSetJmp(MachineInstr &MI,
                                        MachineBasicBlock *MBB) const;

    // Unlike its neighbours, this helper returns nothing.
    void emitSetJmpShadowStackFix(MachineInstr &MI,
                                  MachineBasicBlock *MBB) const;

    MachineBasicBlock *emitEHSjLjLongJmp(MachineInstr &MI,
                                         MachineBasicBlock *MBB) const;

    MachineBasicBlock *emitLongJmpShadowStackFix(MachineInstr &MI,
                                                 MachineBasicBlock *MBB) const;

    MachineBasicBlock *EmitSjLjDispatchBlock(MachineInstr &MI,
                                             MachineBasicBlock *MBB) const;
1702 
    /// Emit flags for the given setcc condition and operands. Also returns the
    /// corresponding X86 condition code constant in X86CC.
    /// \p Op0 and \p Op1 are the compared operands and \p CC the generic
    /// condition code; \p X86CC is written through a non-const reference.
    SDValue emitFlagsForSetcc(SDValue Op0, SDValue Op1, ISD::CondCode CC,
                              const SDLoc &dl, SelectionDAG &DAG,
                              SDValue &X86CC) const;

    /// Check if replacement of SQRT with RSQRT should be disabled.
    bool isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const override;

    /// Use rsqrt* to speed up sqrt calculations.
    /// \p RefinementSteps and \p UseOneConstNR are non-const references, so
    /// the callee can report values back through them.
    SDValue getSqrtEstimate(SDValue Op, SelectionDAG &DAG, int Enabled,
                            int &RefinementSteps, bool &UseOneConstNR,
                            bool Reciprocal) const override;

    /// Use rcp* to speed up fdiv calculations.
    /// \p RefinementSteps is a non-const reference the callee can update.
    SDValue getRecipEstimate(SDValue Op, SelectionDAG &DAG, int Enabled,
                             int &RefinementSteps) const override;

    /// Reassociate floating point divisions into multiply by reciprocal.
    unsigned combineRepeatedFPDivisors() const override;

    /// Override of the base-class hook taking node \p N and the constant
    /// \p Divisor. \p Created is a non-const list the callee can append to.
    /// NOTE(review): going by the name, this builds the replacement for a
    /// signed division by a power of two and records new nodes in Created --
    /// confirm against the base-class contract.
    SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
                          SmallVectorImpl<SDNode *> &Created) const override;
1726   };
1727 
  namespace X86 {
    /// Factory returning a FastISel pointer, built from the per-function
    /// lowering state (\p funcInfo) and the target library info (\p libInfo).
    /// NOTE(review): ownership of the returned object is not stated here;
    /// presumably the caller owns it -- confirm at the call sites.
    FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
                             const TargetLibraryInfo *libInfo);
  } // end namespace X86
1732 
1733   // X86 specific Gather/Scatter nodes.
1734   // The class has the same order of operands as MaskedGatherScatterSDNode for
1735   // convenience.
1736   class X86MaskedGatherScatterSDNode : public MemIntrinsicSDNode {
1737   public:
1738     // This is a intended as a utility and should never be directly created.
1739     X86MaskedGatherScatterSDNode() = delete;
1740     ~X86MaskedGatherScatterSDNode() = delete;
1741 
1742     const SDValue &getBasePtr() const { return getOperand(3); }
1743     const SDValue &getIndex()   const { return getOperand(4); }
1744     const SDValue &getMask()    const { return getOperand(2); }
1745     const SDValue &getScale()   const { return getOperand(5); }
1746 
1747     static bool classof(const SDNode *N) {
1748       return N->getOpcode() == X86ISD::MGATHER ||
1749              N->getOpcode() == X86ISD::MSCATTER;
1750     }
1751   };
1752 
1753   class X86MaskedGatherSDNode : public X86MaskedGatherScatterSDNode {
1754   public:
1755     const SDValue &getPassThru() const { return getOperand(1); }
1756 
1757     static bool classof(const SDNode *N) {
1758       return N->getOpcode() == X86ISD::MGATHER;
1759     }
1760   };
1761 
1762   class X86MaskedScatterSDNode : public X86MaskedGatherScatterSDNode {
1763   public:
1764     const SDValue &getValue() const { return getOperand(1); }
1765 
1766     static bool classof(const SDNode *N) {
1767       return N->getOpcode() == X86ISD::MSCATTER;
1768     }
1769   };
1770 
  /// Generate unpacklo/unpackhi shuffle mask.
  /// \p VT is the vector type the mask is built for and \p Mask is the
  /// non-const list that receives the result.
  /// NOTE(review): presumably \p Lo selects unpacklo over unpackhi, and
  /// \p Unary selects the single-input form -- confirm in the definition,
  /// along with whether \p Mask is cleared first or appended to.
  void createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask, bool Lo,
                               bool Unary);
1774 
  /// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
  /// imposed by AVX and specific to the unary pattern. Example:
  /// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
  /// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
  /// \p Mask is the non-const list that receives the result; going by the
  /// example above, \p Lo picks the low-half pattern and its negation the
  /// high-half pattern.
  void createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask, bool Lo);
1780 
1781 } // end namespace llvm
1782 
1783 #endif // LLVM_LIB_TARGET_X86_X86ISELLOWERING_H
1784