//===-- X86ISelLowering.h - X86 DAG Lowering Interface ----------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIB_TARGET_X86_X86ISELLOWERING_H
#define LLVM_LIB_TARGET_X86_X86ISELLOWERING_H

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/TargetLowering.h"

namespace llvm {
class X86Subtarget;
class X86TargetMachine;

namespace X86ISD {
// X86 Specific DAG Nodes
enum NodeType : unsigned {
  // Start the numbering where the builtin ops leave off.
  FIRST_NUMBER = ISD::BUILTIN_OP_END,

  /// Bit scan forward.
  BSF,
  /// Bit scan reverse.
  BSR,

  /// X86 funnel/double shift i16 instructions. These correspond to
  /// the X86::SHLDW and X86::SHRDW instructions, which have different
  /// amount-modulo rules than generic funnel shifts.
  /// NOTE: The operand order matches ISD::FSHL/FSHR, not SHLD/SHRD.
  FSHL,
  FSHR,

  /// Bitwise logical AND of floating point values. This corresponds
  /// to X86::ANDPS or X86::ANDPD.
  FAND,

  /// Bitwise logical OR of floating point values. This corresponds
  /// to X86::ORPS or X86::ORPD.
  FOR,

  /// Bitwise logical XOR of floating point values. This corresponds
  /// to X86::XORPS or X86::XORPD.
  FXOR,

  /// Bitwise logical ANDNOT of floating point values. This
  /// corresponds to X86::ANDNPS or X86::ANDNPD.
  FANDN,

  /// These operations represent an abstract X86 call
  /// instruction, which includes a bunch of information. In particular the
  /// operands of these nodes are:
  ///
  ///   #0 - The incoming token chain
  ///   #1 - The callee
  ///   #2 - The number of arg bytes the caller pushes on the stack.
  ///   #3 - The number of arg bytes the callee pops off the stack.
  ///   #4 - The value to pass in AL/AX/EAX (optional)
  ///   #5 - The value to pass in DL/DX/EDX (optional)
  ///
  /// The result values of these nodes are:
  ///
  ///   #0 - The outgoing token chain
  ///   #1 - The first register result value (optional)
  ///   #2 - The second register result value (optional)
  ///
  CALL,

  /// Same as CALL except it adds the NoTrack prefix.
  NT_CALL,

  // Pseudo for an Objective-C call that gets emitted together with a
  // special marker instruction.
  CALL_RVMARKER,

  /// X86 compare and logical compare instructions.
  CMP,
  FCMP,
  COMI,
  UCOMI,

  /// X86 bit-test instructions.
  BT,

  /// X86 SetCC. Operand 0 is the condition code, and operand 1 is the
  /// EFLAGS operand, usually produced by a CMP instruction.
  SETCC,

  /// X86 Select
  SELECTS,

  // Same as SETCC except it's materialized with an SBB and the value is all
  // ones or all zeros.
  SETCC_CARRY, // R = carry_bit ? ~0 : 0
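
  // A minimal illustrative sketch (not part of the interface): a SETCC node
  // testing for equality is typically built along these lines, where EFLAGS
  // is assumed to be the i32 flag result of a CMP node:
  //
  //   SDValue Cond = DAG.getTargetConstant(X86::COND_E, DL, MVT::i8);
  //   SDValue Res  = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, Cond, EFLAGS);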

  /// X86 FP SETCC, implemented with CMP{cc}SS/CMP{cc}SD.
  /// Operands are two FP values to compare; result is a mask of
  /// 0s or 1s. Generally DTRT for C/C++ with NaNs.
  FSETCC,

  /// X86 FP SETCC, similar to the above, but with output as an i1 mask,
  /// and a version with SAE.
  FSETCCM,
  FSETCCM_SAE,

  /// X86 conditional moves. Operand 0 and operand 1 are the two values
  /// to select from. Operand 2 is the condition code, and operand 3 is the
  /// flag operand produced by a CMP or TEST instruction.
  CMOV,

  /// X86 conditional branches. Operand 0 is the chain operand, operand 1
  /// is the block to branch to if the condition is true, operand 2 is the
  /// condition code, and operand 3 is the flag operand produced by a CMP
  /// or TEST instruction.
  BRCOND,

  /// BRIND node with NoTrack prefix. Operand 0 is the chain operand and
  /// operand 1 is the target address.
  NT_BRIND,

  /// Return with a flag operand. Operand 0 is the chain operand, operand
  /// 1 is the number of bytes of stack to pop.
  RET_FLAG,

  /// Return from interrupt. Operand 0 is the number of bytes to pop.
  IRET,

  /// Repeat fill, corresponds to X86::REP_STOSx.
  REP_STOS,

  /// Repeat move, corresponds to X86::REP_MOVSx.
  REP_MOVS,

  /// On Darwin, this node represents the result of the popl
  /// at function entry, used for PIC code.
  GlobalBaseReg,

  /// A wrapper node for TargetConstantPool, TargetJumpTable,
  /// TargetExternalSymbol, TargetGlobalAddress, TargetGlobalTLSAddress,
  /// MCSymbol and TargetBlockAddress.
  Wrapper,

  /// Special wrapper used under X86-64 PIC mode for RIP
  /// relative displacements.
  WrapperRIP,

  /// Copies a 64-bit value from an MMX vector to the low word
  /// of an XMM vector, with the high word zero filled.
  MOVQ2DQ,

  /// Copies a 64-bit value from the low word of an XMM vector
  /// to an MMX vector.
  MOVDQ2Q,

  /// Copies a 32-bit value from the low word of an MMX
  /// vector to a GPR.
  MMX_MOVD2W,

  /// Copies a GPR into the low 32-bit word of an MMX vector
  /// and zeroes out the high word.
  MMX_MOVW2D,

  /// Extract an 8-bit value from a vector and zero extend it to
  /// i32, corresponds to X86::PEXTRB.
  PEXTRB,

  /// Extract a 16-bit value from a vector and zero extend it to
  /// i32, corresponds to X86::PEXTRW.
  PEXTRW,

  /// Insert any element of a 4 x float vector into any element
  /// of a destination 4 x float vector.
  INSERTPS,

  /// Insert the lower 8 bits of a 32-bit value into a vector,
  /// corresponds to X86::PINSRB.
  PINSRB,

  /// Insert the lower 16 bits of a 32-bit value into a vector,
  /// corresponds to X86::PINSRW.
  PINSRW,

  /// Shuffle 16 8-bit values within a vector.
  PSHUFB,

  /// Compute Sum of Absolute Differences.
  PSADBW,
  /// Compute Double Block Packed Sum-Absolute-Differences.
  DBPSADBW,

  /// Bitwise Logical AND NOT of Packed FP values.
  ANDNP,

  /// Blend where the selector is an immediate.
  BLENDI,

  /// Dynamic (non-constant condition) vector blend where only the sign
  /// bits of the condition elements are used. This is used to enforce that
  /// the condition mask is not valid for generic VSELECT optimizations.
  /// This is also used to implement the intrinsics.
  /// Operands are in VSELECT order: MASK, TRUE, FALSE
  BLENDV,
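
  // A minimal illustrative sketch: a lowering that already has a sign-bit
  // encoded mask might build (Mask/TVal/FVal are placeholder values):
  //
  //   SDValue Blend = DAG.getNode(X86ISD::BLENDV, DL, VT, Mask, TVal, FVal);
  //
  // i.e. the operand order is that of ISD::VSELECT, not the instruction's
  // source-operand order.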

  /// Combined add and sub on an FP vector.
  ADDSUB,

  // FP vector ops with rounding mode.
  FADD_RND,
  FADDS,
  FADDS_RND,
  FSUB_RND,
  FSUBS,
  FSUBS_RND,
  FMUL_RND,
  FMULS,
  FMULS_RND,
  FDIV_RND,
  FDIVS,
  FDIVS_RND,
  FMAX_SAE,
  FMAXS_SAE,
  FMIN_SAE,
  FMINS_SAE,
  FSQRT_RND,
  FSQRTS,
  FSQRTS_RND,

  // FP vector get exponent.
  FGETEXP,
  FGETEXP_SAE,
  FGETEXPS,
  FGETEXPS_SAE,
  // Extract Normalized Mantissas.
  VGETMANT,
  VGETMANT_SAE,
  VGETMANTS,
  VGETMANTS_SAE,
  // FP Scale.
  SCALEF,
  SCALEF_RND,
  SCALEFS,
  SCALEFS_RND,

  // Unsigned Integer average.
  AVG,

  /// Integer horizontal add/sub.
  HADD,
  HSUB,

  /// Floating point horizontal add/sub.
  FHADD,
  FHSUB,

  // Detect Conflicts Within a Vector.
  CONFLICT,

  /// Floating point max and min.
  FMAX,
  FMIN,

  /// Commutative FMIN and FMAX.
  FMAXC,
  FMINC,

  /// Scalar intrinsic floating point max and min.
  FMAXS,
  FMINS,

  /// Floating point reciprocal-sqrt and reciprocal approximation.
  /// Note that these typically require refinement
  /// in order to obtain suitable precision.
  FRSQRT,
  FRCP,

  // AVX-512 reciprocal approximations with a little more precision.
  RSQRT14,
  RSQRT14S,
  RCP14,
  RCP14S,

  // Thread Local Storage.
  TLSADDR,

  // Thread Local Storage. A call to get the start address
  // of the TLS block for the current module.
  TLSBASEADDR,

  // Thread Local Storage. When calling to an OS provided
  // thunk at the address from an earlier relocation.
  TLSCALL,

  // Exception Handling helpers.
  EH_RETURN,

  // SjLj exception handling setjmp.
  EH_SJLJ_SETJMP,

  // SjLj exception handling longjmp.
  EH_SJLJ_LONGJMP,

  // SjLj exception handling dispatch.
  EH_SJLJ_SETUP_DISPATCH,

  /// Tail call return. See X86TargetLowering::LowerCall for
  /// the list of operands.
  TC_RETURN,

  // Vector move to low scalar and zero higher vector elements.
  VZEXT_MOVL,

  // Vector integer truncate.
  VTRUNC,
  // Vector integer truncate with unsigned/signed saturation.
  VTRUNCUS,
  VTRUNCS,

  // Masked version of the above. Used when less than a 128-bit result is
  // produced since the mask only applies to the lower elements and can't
  // be represented by a select.
  // SRC, PASSTHRU, MASK
  VMTRUNC,
  VMTRUNCUS,
  VMTRUNCS,

  // Vector FP extend.
  VFPEXT,
  VFPEXT_SAE,
  VFPEXTS,
  VFPEXTS_SAE,

  // Vector FP round.
  VFPROUND,
  VFPROUND_RND,
  VFPROUNDS,
  VFPROUNDS_RND,

  // Masked version of the above. Used for v2f64->v4f32.
  // SRC, PASSTHRU, MASK
  VMFPROUND,

  // 128-bit vector logical left / right shift.
  VSHLDQ,
  VSRLDQ,

  // Vector shift elements.
  VSHL,
  VSRL,
  VSRA,

  // Vector variable shift.
  VSHLV,
  VSRLV,
  VSRAV,

  // Vector shift elements by immediate.
  VSHLI,
  VSRLI,
  VSRAI,
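
  // A minimal illustrative sketch: the immediate shift amount is carried as
  // an i8 target constant, e.g. for a left shift by 3 (Op is a placeholder):
  //
  //   SDValue Amt = DAG.getTargetConstant(3, DL, MVT::i8);
  //   SDValue Shl = DAG.getNode(X86ISD::VSHLI, DL, VT, Op, Amt);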

  // Shifts of mask registers.
  KSHIFTL,
  KSHIFTR,

  // Bit rotate by immediate.
  VROTLI,
  VROTRI,

  // Vector packed double/float comparison.
  CMPP,

  // Vector integer comparisons.
  PCMPEQ,
  PCMPGT,

  // v8i16 Horizontal minimum and position.
  PHMINPOS,

  MULTISHIFT,

  /// Vector comparison generating mask bits for fp and
  /// integer signed and unsigned data types.
  CMPM,
  // Vector mask comparison generating mask bits for FP values.
  CMPMM,
  // Vector mask comparison with SAE for FP values.
  CMPMM_SAE,

  // Arithmetic operations with FLAGS results.
  ADD,
  SUB,
  ADC,
  SBB,
  SMUL,
  UMUL,
  OR,
  XOR,
  AND,

  // Bit field extract.
  BEXTR,
  BEXTRI,

  // Zero High Bits Starting with Specified Bit Position.
  BZHI,

  // Parallel extract and deposit.
  PDEP,
  PEXT,

  // X86-specific multiply by immediate.
  MUL_IMM,

  // Vector sign bit extraction.
  MOVMSK,

  // Vector bitwise comparisons.
  PTEST,

  // Vector packed fp sign bitwise comparisons.
  TESTP,

  // OR/AND test for masks.
  KORTEST,
  KTEST,

  // ADD for masks.
  KADD,

  // Several flavors of instructions with vector shuffle behaviors.
  // Saturated signed/unsigned packing.
  PACKSS,
  PACKUS,
  // Intra-lane alignr.
  PALIGNR,
  // AVX512 inter-lane alignr.
  VALIGN,
  PSHUFD,
  PSHUFHW,
  PSHUFLW,
  SHUFP,
  // VBMI2 Concat & Shift.
  VSHLD,
  VSHRD,
  VSHLDV,
  VSHRDV,
  // Shuffle Packed Values at 128-bit granularity.
  SHUF128,
  MOVDDUP,
  MOVSHDUP,
  MOVSLDUP,
  MOVLHPS,
  MOVHLPS,
  MOVSD,
  MOVSS,
  MOVSH,
  UNPCKL,
  UNPCKH,
  VPERMILPV,
  VPERMILPI,
  VPERMI,
  VPERM2X128,

  // Variable Permute (VPERM).
  // Res = VPERMV MaskV, V0
  VPERMV,

  // 3-op Variable Permute (VPERMT2).
  // Res = VPERMV3 V0, MaskV, V1
  VPERMV3,

  // Bitwise ternary logic.
  VPTERNLOG,
  // Fix Up Special Packed Float32/64 values.
  VFIXUPIMM,
  VFIXUPIMM_SAE,
  VFIXUPIMMS,
  VFIXUPIMMS_SAE,
  // Range Restriction Calculation For Packed Pairs of Float32/64 values.
  VRANGE,
  VRANGE_SAE,
  VRANGES,
  VRANGES_SAE,
  // Reduce - Perform Reduction Transformation on scalar/packed FP.
  VREDUCE,
  VREDUCE_SAE,
  VREDUCES,
  VREDUCES_SAE,
  // RndScale - Round FP Values To Include A Given Number Of Fraction Bits.
  // Also used by the legacy (V)ROUND intrinsics where we mask out the
  // scaling part of the immediate.
  VRNDSCALE,
  VRNDSCALE_SAE,
  VRNDSCALES,
  VRNDSCALES_SAE,
  // Tests types of packed FP values.
  VFPCLASS,
  // Tests types of scalar FP values.
  VFPCLASSS,

  // Broadcast (splat) scalar or element 0 of a vector. If the operand is
  // a vector, this node may change the vector length as part of the splat.
  VBROADCAST,
  // Broadcast mask to vector.
  VBROADCASTM,

  /// SSE4A Extraction and Insertion.
  EXTRQI,
  INSERTQI,

  // XOP arithmetic/logical shifts.
  VPSHA,
  VPSHL,
  // XOP signed/unsigned integer comparisons.
  VPCOM,
  VPCOMU,
  // XOP packed permute bytes.
  VPPERM,
  // XOP two source permutation.
  VPERMIL2,

  // Vector multiply packed unsigned doubleword integers.
  PMULUDQ,
  // Vector multiply packed signed doubleword integers.
  PMULDQ,
  // Vector multiply packed signed integers with round and scale (PMULHRSW).
  MULHRS,

  // Multiply and Add Packed Integers.
  VPMADDUBSW,
  VPMADDWD,

  // AVX512IFMA multiply and add.
  // NOTE: These are different from the instruction and perform
  // op0 x op1 + op2.
  VPMADD52L,
  VPMADD52H,

  // VNNI
  VPDPBUSD,
  VPDPBUSDS,
  VPDPWSSD,
  VPDPWSSDS,

  // FMA nodes.
  // We use the target independent ISD::FMA for the non-inverted case.
  FNMADD,
  FMSUB,
  FNMSUB,
  FMADDSUB,
  FMSUBADD,

  // FMA with rounding mode.
  FMADD_RND,
  FNMADD_RND,
  FMSUB_RND,
  FNMSUB_RND,
  FMADDSUB_RND,
  FMSUBADD_RND,
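
  // For reference, an illustrative summary of the negated forms above (per
  // element, following the usual FMA conventions):
  //
  //   FNMADD(A, B, C) = -(A * B) + C
  //   FMSUB(A, B, C)  =  (A * B) - C
  //   FNMSUB(A, B, C) = -(A * B) - C
  //
  // FMADDSUB/FMSUBADD alternate between add and sub across even/odd lanes.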

  // AVX512-FP16 complex addition and multiplication.
  VFMADDC,
  VFMADDC_RND,
  VFCMADDC,
  VFCMADDC_RND,

  VFMULC,
  VFMULC_RND,
  VFCMULC,
  VFCMULC_RND,

  VFMADDCSH,
  VFMADDCSH_RND,
  VFCMADDCSH,
  VFCMADDCSH_RND,

  VFMULCSH,
  VFMULCSH_RND,
  VFCMULCSH,
  VFCMULCSH_RND,

  // Compress and expand.
  COMPRESS,
  EXPAND,

  // Bits shuffle.
  VPSHUFBITQMB,

  // Convert Signed/Unsigned Integer to Floating-Point Value with rounding
  // mode.
  SINT_TO_FP_RND,
  UINT_TO_FP_RND,
  SCALAR_SINT_TO_FP,
  SCALAR_UINT_TO_FP,
  SCALAR_SINT_TO_FP_RND,
  SCALAR_UINT_TO_FP_RND,

  // Vector float/double to signed/unsigned integer.
  CVTP2SI,
  CVTP2UI,
  CVTP2SI_RND,
  CVTP2UI_RND,
  // Scalar float/double to signed/unsigned integer.
  CVTS2SI,
  CVTS2UI,
  CVTS2SI_RND,
  CVTS2UI_RND,

  // Vector float/double to signed/unsigned integer with truncation.
  CVTTP2SI,
  CVTTP2UI,
  CVTTP2SI_SAE,
  CVTTP2UI_SAE,
  // Scalar float/double to signed/unsigned integer with truncation.
  CVTTS2SI,
  CVTTS2UI,
  CVTTS2SI_SAE,
  CVTTS2UI_SAE,

  // Vector signed/unsigned integer to float/double.
  CVTSI2P,
  CVTUI2P,

  // Masked versions of the above. Used for v2f64->v4f32.
  // SRC, PASSTHRU, MASK
  MCVTP2SI,
  MCVTP2UI,
  MCVTTP2SI,
  MCVTTP2UI,
  MCVTSI2P,
  MCVTUI2P,

  // Vector float to bfloat16.
  // Convert two packed single data to one packed BF16 data.
  CVTNE2PS2BF16,
  // Convert packed single data to packed BF16 data.
  CVTNEPS2BF16,
  // Masked version of the above.
  // SRC, PASSTHRU, MASK
  MCVTNEPS2BF16,

  // Dot product of BF16 pairs accumulated into
  // packed single precision.
  DPBF16PS,

  // Save xmm argument registers to the stack, according to %al. An operator
  // is needed so that this can be expanded with control flow.
  VASTART_SAVE_XMM_REGS,

  // A stack checking function call. On Windows it's a _chkstk call.
  DYN_ALLOCA,

  // For allocating variable amounts of stack space when using
  // segmented stacks. Checks if the current stacklet has enough space, and
  // falls back to heap allocation if not.
  SEG_ALLOCA,

  // For allocating stack space when using stack clash protector.
  // Allocation is performed by block, and each block is probed.
  PROBED_ALLOCA,

  // Memory barriers.
  MEMBARRIER,
  MFENCE,

  // Get a random integer and indicate whether it is valid in CF.
  RDRAND,

  // Get a NIST SP800-90B & C compliant random integer and
  // indicate whether it is valid in CF.
  RDSEED,

  // Protection keys.
  // RDPKRU - Operand 0 is chain. Operand 1 is value for ECX.
  // WRPKRU - Operand 0 is chain. Operand 1 is value for EDX. Operand 2 is
  // value for ECX.
  RDPKRU,
  WRPKRU,

  // SSE42 string comparisons.
  // These nodes produce 3 results: index, mask, and flags. X86ISelDAGToDAG
  // will emit one or two instructions based on which results are used. If
  // flags and index/mask are both used, this allows us to use a single
  // instruction since we won't have to pick an opcode for flags. Instead we
  // can rely on the DAG to CSE everything and decide at isel.
  PCMPISTR,
  PCMPESTR,

  // Test if in transactional execution.
  XTEST,

  // ERI instructions.
  RSQRT28,
  RSQRT28_SAE,
  RSQRT28S,
  RSQRT28S_SAE,
  RCP28,
  RCP28_SAE,
  RCP28S,
  RCP28S_SAE,
  EXP2,
  EXP2_SAE,

  // Conversions between float and half-float.
  CVTPS2PH,
  CVTPH2PS,
  CVTPH2PS_SAE,

  // Masked version of the above.
  // SRC, RND, PASSTHRU, MASK
  MCVTPS2PH,

  // Galois Field Arithmetic Instructions.
  GF2P8AFFINEINVQB,
  GF2P8AFFINEQB,
  GF2P8MULB,

  // LWP insert record.
  LWPINS,

  // User level wait.
  UMWAIT,
  TPAUSE,

  // Enqueue Stores Instructions.
  ENQCMD,
  ENQCMDS,

  // For avx512-vp2intersect.
  VP2INTERSECT,

  // User level interrupts - testui.
  TESTUI,

  /// X86 strict FP compare instructions.
  STRICT_FCMP = ISD::FIRST_TARGET_STRICTFP_OPCODE,
  STRICT_FCMPS,

  // Vector packed double/float comparison.
  STRICT_CMPP,

  /// Vector comparison generating mask bits for fp and
  /// integer signed and unsigned data types.
  STRICT_CMPM,

  // Vector float/double to signed/unsigned integer with truncation.
  STRICT_CVTTP2SI,
  STRICT_CVTTP2UI,

  // Vector FP extend.
  STRICT_VFPEXT,

  // Vector FP round.
  STRICT_VFPROUND,

  // RndScale - Round FP Values To Include A Given Number Of Fraction Bits.
  // Also used by the legacy (V)ROUND intrinsics where we mask out the
  // scaling part of the immediate.
  STRICT_VRNDSCALE,

  // Vector signed/unsigned integer to float/double.
  STRICT_CVTSI2P,
  STRICT_CVTUI2P,

  // Strict FMA nodes.
  STRICT_FNMADD,
  STRICT_FMSUB,
  STRICT_FNMSUB,

  // Conversions between float and half-float.
  STRICT_CVTPS2PH,
  STRICT_CVTPH2PS,

  // WARNING: Only add nodes here if they are strict FP nodes. Non-memory and
  // non-strict FP nodes should be above FIRST_TARGET_STRICTFP_OPCODE.

  // Compare and swap.
  LCMPXCHG_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE,
  LCMPXCHG8_DAG,
  LCMPXCHG16_DAG,
  LCMPXCHG16_SAVE_RBX_DAG,

  /// LOCK-prefixed arithmetic read-modify-write instructions.
  /// EFLAGS, OUTCHAIN = LADD(INCHAIN, PTR, RHS)
  LADD,
  LSUB,
  LOR,
  LXOR,
  LAND,

  // Load, scalar_to_vector, and zero extend.
  VZEXT_LOAD,

  // extract_vector_elt, store.
  VEXTRACT_STORE,

  // scalar broadcast from memory.
  VBROADCAST_LOAD,

  // subvector broadcast from memory.
  SUBV_BROADCAST_LOAD,

  // Store FP control word into i16 memory.
  FNSTCW16m,

  // Load FP control word from i16 memory.
  FLDCW16m,

  /// This instruction implements FP_TO_SINT with the
  /// integer destination in memory and a FP reg source. This corresponds
  /// to the X86::FIST*m instructions and the rounding mode change stuff. It
  /// has two inputs (token chain and address) and two outputs (int value
  /// and token chain). Memory VT specifies the type to store to.
  FP_TO_INT_IN_MEM,

  /// This instruction implements SINT_TO_FP with the
  /// integer source in memory and FP reg result. This corresponds to the
  /// X86::FILD*m instructions. It has two inputs (token chain and address)
  /// and two outputs (FP value and token chain). The integer source type is
  /// specified by the memory VT.
  FILD,

  /// This instruction implements a fp->int store from FP stack
  /// slots. This corresponds to the fist instruction. It takes a
  /// chain operand, value to store, address, and glue. The memory VT
  /// specifies the type to store as.
  FIST,

  /// This instruction implements an extending load to FP stack slots.
  /// This corresponds to the X86::FLD32m / X86::FLD64m. It takes a chain
  /// operand, and ptr to load from. The memory VT specifies the type to
  /// load from.
  FLD,

  /// This instruction implements a truncating store from FP stack
  /// slots. This corresponds to the X86::FST32m / X86::FST64m. It takes a
  /// chain operand, value to store, address, and glue. The memory VT
  /// specifies the type to store as.
  FST,

  /// These instructions grab the address of the next argument
  /// from a va_list. (reads and modifies the va_list in memory)
  VAARG_64,
  VAARG_X32,

  // Vector truncating store with unsigned/signed saturation.
  VTRUNCSTOREUS,
  VTRUNCSTORES,
  // Vector truncating masked store with unsigned/signed saturation.
  VMTRUNCSTOREUS,
  VMTRUNCSTORES,

  // X86 specific gather and scatter.
  MGATHER,
  MSCATTER,

  // Key locker nodes that produce flags.
  AESENC128KL,
  AESDEC128KL,
  AESENC256KL,
  AESDEC256KL,
  AESENCWIDE128KL,
  AESDECWIDE128KL,
  AESENCWIDE256KL,
  AESDECWIDE256KL,

  // WARNING: Do not add anything at the end unless you want the node to
  // have memops! In fact, starting from FIRST_TARGET_MEMORY_OPCODE, all
  // opcodes will be treated as target memory ops!
};
} // end namespace X86ISD

namespace X86 {
/// The current rounding mode is represented in bits 11:10 of the FP control
/// word. These values are the same as the corresponding constants for
/// rounding mode used in glibc.
enum RoundingMode {
  rmToNearest  = 0,       // FE_TONEAREST
  rmDownward   = 1 << 10, // FE_DOWNWARD
  rmUpward     = 2 << 10, // FE_UPWARD
  rmTowardZero = 3 << 10, // FE_TOWARDZERO
  rmMask       = 3 << 10  // Bit mask selecting rounding mode
};
} // end namespace X86
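
// A minimal usage sketch for the constants above (CW is assumed to be a
// control-word value, e.g. one stored to memory by FNSTCW):
//
//   unsigned RC = CW & X86::rmMask;
//   bool RoundsToNearest = RC == X86::rmToNearest;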

/// Define some predicates that are used for node matching.
namespace X86 {
/// Returns true if Elt is a constant zero or floating point constant +0.0.
bool isZeroNode(SDValue Elt);

/// Returns true if the given offset fits into the displacement field of
/// the instruction.
bool isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
                                  bool hasSymbolicDisplacement);

/// Determines whether the callee is required to pop its
/// own arguments. Callee pop is necessary to support tail calls.
bool isCalleePop(CallingConv::ID CallingConv,
                 bool is64Bit, bool IsVarArg, bool GuaranteeTCO);

/// If Op is a constant whose elements are all the same constant or
/// undefined, return true and return the constant value in \p SplatVal.
/// If we have undef bits that don't cover an entire element, we treat these
/// as zero if AllowPartialUndefs is set, else we fail and return false.
bool isConstantSplat(SDValue Op, APInt &SplatVal,
                     bool AllowPartialUndefs = true);
} // end namespace X86
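
// A minimal usage sketch for the splat predicate above (Op is assumed to be
// some vector SDValue under inspection):
//
//   APInt SplatVal;
//   if (X86::isConstantSplat(Op, SplatVal) && SplatVal.isSignMask())
//     ...; // Every element is the sign-bit-only constant.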

//===--------------------------------------------------------------------===//
//  X86 Implementation of the TargetLowering interface
class X86TargetLowering final : public TargetLowering {
public:
  explicit X86TargetLowering(const X86TargetMachine &TM,
                             const X86Subtarget &STI);

  unsigned getJumpTableEncoding() const override;
  bool useSoftFloat() const override;

  void markLibCallAttributes(MachineFunction *MF, unsigned CC,
                             ArgListTy &Args) const override;

  MVT getScalarShiftAmountTy(const DataLayout &, EVT VT) const override {
    return MVT::i8;
  }

  const MCExpr *
  LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
                            const MachineBasicBlock *MBB, unsigned uid,
                            MCContext &Ctx) const override;

  /// Returns relocation base for the given PIC jumptable.
  SDValue getPICJumpTableRelocBase(SDValue Table,
                                   SelectionDAG &DAG) const override;
  const MCExpr *
  getPICJumpTableRelocBaseExpr(const MachineFunction *MF,
                               unsigned JTI, MCContext &Ctx) const override;

  /// Return the desired alignment for ByVal aggregate
  /// function arguments in the caller parameter area. For X86, aggregates
  /// that contain SSE vectors are placed at 16-byte boundaries while the
  /// rest are at 4-byte boundaries.
  uint64_t getByValTypeAlignment(Type *Ty,
                                 const DataLayout &DL) const override;

  EVT getOptimalMemOpType(const MemOp &Op,
                          const AttributeList &FuncAttributes) const override;

  /// Returns true if it's safe to use load / store of the
  /// specified type to expand memcpy / memset inline. This is mostly true
  /// for all types except for some special cases. For example, on X86
  /// targets without SSE2 f64 load / store are done with fldl / fstpl which
  /// also does type conversion. Note the specified type doesn't have to be
  /// legal as the hook is used before type legalization.
  bool isSafeMemOpType(MVT VT) const override;

  /// Returns true if the target allows unaligned memory accesses of the
  /// specified type. Returns whether it is "fast" in the last argument.
  bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, Align Alignment,
                                      MachineMemOperand::Flags Flags,
                                      bool *Fast) const override;

  /// Provide custom lowering hooks for some operations.
  SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;

  /// Replace the results of a node with an illegal result
  /// type with new values built out of custom code.
  void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
                          SelectionDAG &DAG) const override;

  SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;

  /// Return true if the target has native support for
  /// the specified value type and it is 'desirable' to use the type for the
  /// given node type. e.g. On x86 i16 is legal, but undesirable since i16
  /// instruction encodings are longer and some i16 instructions are slow.
  bool isTypeDesirableForOp(unsigned Opc, EVT VT) const override;

  /// Return true if the target has native support for the
  /// specified value type and it is 'desirable' to use the type. e.g. On x86
  /// i16 is legal, but undesirable since i16 instruction encodings are
  /// longer and some i16 instructions are slow.
  bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const override;

  /// Return the newly negated expression if the cost is not expensive and
  /// set the cost in \p Cost to indicate whether it is cheaper or neutral to
  /// do the negation.
  SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG,
                               bool LegalOperations, bool ForCodeSize,
                               NegatibleCost &Cost,
                               unsigned Depth) const override;

  MachineBasicBlock *
  EmitInstrWithCustomInserter(MachineInstr &MI,
                              MachineBasicBlock *MBB) const override;

  /// This method returns the name of a target specific DAG node.
  const char *getTargetNodeName(unsigned Opcode) const override;

  /// Do not merge vector stores after legalization because that may conflict
  /// with x86-specific store splitting optimizations.
  bool mergeStoresAfterLegalization(EVT MemVT) const override {
    return !MemVT.isVector();
  }

  bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
                        const MachineFunction &MF) const override;

  bool isCheapToSpeculateCttz() const override;

  bool isCheapToSpeculateCtlz() const override;

  bool isCtlzFast() const override;

  bool hasBitPreservingFPLogic(EVT VT) const override {
    return VT == MVT::f32 || VT == MVT::f64 || VT.isVector() ||
           (VT == MVT::f16 && X86ScalarSSEf16);
  }

  bool isMultiStoresCheaperThanBitsMerge(EVT LTy, EVT HTy) const override {
    // If the pair to store is a mixture of float and int values, we will
    // save two bitwise instructions and one float-to-int instruction and
    // add one store instruction. There is potentially a more significant
    // benefit because it avoids the float->int domain switch for the input
    // value. So it is more likely a win.
    if ((LTy.isFloatingPoint() && HTy.isInteger()) ||
        (LTy.isInteger() && HTy.isFloatingPoint()))
      return true;
    // If the pair only contains int values, we will save two bitwise
    // instructions and add one store instruction (costing one more store
    // buffer). Since the benefit is less clear, we leave such pairs out
    // until we have a test case proving it is a win.
    return false;
  }

  bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override;

  bool hasAndNotCompare(SDValue Y) const override;

  bool hasAndNot(SDValue Y) const override;

  bool hasBitTest(SDValue X, SDValue Y) const override;

  bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
      SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
      unsigned OldShiftOpcode, unsigned NewShiftOpcode,
      SelectionDAG &DAG) const override;

  bool shouldFoldConstantShiftPairToMask(const SDNode *N,
                                         CombineLevel Level) const override;

  bool shouldFoldMaskToVariableShiftPair(SDValue Y) const override;

  bool
  shouldTransformSignedTruncationCheck(EVT XVT,
                                       unsigned KeptBits) const override {
    // For vectors, we don't have a preference.
    if (XVT.isVector())
      return false;

    auto VTIsOk = [](EVT VT) -> bool {
      return VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 ||
             VT == MVT::i64;
    };

    // We are OK with KeptBitsVT being byte/word/dword, which is what MOVSX
    // supports. XVT will be larger than KeptBitsVT.
    MVT KeptBitsVT = MVT::getIntegerVT(KeptBits);
    return VTIsOk(XVT) && VTIsOk(KeptBitsVT);
  }

  bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override;

  bool shouldSplatInsEltVarIndex(EVT VT) const override;

  bool convertSetCCLogicToBitwiseLogic(EVT VT) const override {
    return VT.isScalarInteger();
  }

  /// Vector-sized comparisons are fast using PCMPEQ + PMOVMSK or PTEST.
  MVT hasFastEqualityCompare(unsigned NumBits) const override;
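
  // For instance, a 16-byte equality test can be reduced to one vector
  // compare plus one movemask; a C-level sketch of the pattern this hook
  // enables (SSE2 intrinsics from <emmintrin.h>, A/B are placeholders):
  //
  //   __m128i Eq = _mm_cmpeq_epi8(A, B);
  //   bool Equal = _mm_movemask_epi8(Eq) == 0xFFFF;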

  /// Return the value type to use for ISD::SETCC.
  EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
                         EVT VT) const override;

  bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits,
                                    const APInt &DemandedElts,
                                    TargetLoweringOpt &TLO) const override;

  /// Determine which of the bits specified in Mask are known to be either
  /// zero or one and return them in the KnownZero/KnownOne bitsets.
  void computeKnownBitsForTargetNode(const SDValue Op,
                                     KnownBits &Known,
                                     const APInt &DemandedElts,
                                     const SelectionDAG &DAG,
                                     unsigned Depth = 0) const override;

  /// Determine the number of bits in the operation that are sign bits.
  unsigned ComputeNumSignBitsForTargetNode(SDValue Op,
                                           const APInt &DemandedElts,
                                           const SelectionDAG &DAG,
                                           unsigned Depth) const override;

  bool SimplifyDemandedVectorEltsForTargetNode(SDValue Op,
                                               const APInt &DemandedElts,
                                               APInt &KnownUndef,
                                               APInt &KnownZero,
                                               TargetLoweringOpt &TLO,
                                               unsigned Depth) const override;

  bool SimplifyDemandedVectorEltsForTargetShuffle(SDValue Op,
                                                  const APInt &DemandedElts,
                                                  unsigned MaskIndex,
                                                  TargetLoweringOpt &TLO,
                                                  unsigned Depth) const;

  bool SimplifyDemandedBitsForTargetNode(SDValue Op,
                                         const APInt &DemandedBits,
                                         const APInt &DemandedElts,
                                         KnownBits &Known,
                                         TargetLoweringOpt &TLO,
                                         unsigned Depth) const override;

  SDValue SimplifyMultipleUseDemandedBitsForTargetNode(
      SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
      SelectionDAG &DAG, unsigned Depth) const override;

  const Constant *getTargetConstantFromLoad(LoadSDNode *LD) const override;

  SDValue unwrapAddress(SDValue N) const override;

  SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const;

  bool ExpandInlineAsm(CallInst *CI) const override;

  ConstraintType getConstraintType(StringRef Constraint) const override;

  /// Examine constraint string and operand type and determine a weight
  /// value. The operand object must already have been set up with the
  /// operand type.
  ConstraintWeight
  getSingleConstraintMatchWeight(AsmOperandInfo &info,
                                 const char *constraint) const override;

  const char *LowerXConstraint(EVT ConstraintVT) const override;

  /// Lower the specified operand into the Ops vector. If it is invalid,
  /// don't add anything to Ops. If hasMemory is true it means one of the
  /// asm constraints of the inline asm instruction being processed is 'm'.
  void LowerAsmOperandForConstraint(SDValue Op,
                                    std::string &Constraint,
                                    std::vector<SDValue> &Ops,
                                    SelectionDAG &DAG) const override;

  unsigned
  getInlineAsmMemConstraint(StringRef ConstraintCode) const override {
    if (ConstraintCode == "v")
      return InlineAsm::Constraint_v;
    return TargetLowering::getInlineAsmMemConstraint(ConstraintCode);
  }

  /// Handle lowering flag assembly outputs.
  SDValue LowerAsmOutputForConstraint(SDValue &Chain, SDValue &Flag,
                                      const SDLoc &DL,
                                      const AsmOperandInfo &Constraint,
                                      SelectionDAG &DAG) const override;

  /// Given a physical register constraint
  /// (e.g. {edx}), return the register number and the register class for
  /// the register. This should only be used for C_Register constraints. On
  /// error, this returns a register number of 0.
  std::pair<unsigned, const TargetRegisterClass *>
  getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
                               StringRef Constraint, MVT VT) const override;

  /// Return true if the addressing mode represented
  /// by AM is legal for this target, for a load/store of the specified type.
  bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM,
                             Type *Ty, unsigned AS,
                             Instruction *I = nullptr) const override;

  /// Return true if the specified immediate is a legal
  /// icmp immediate, that is, the target has icmp instructions which can
  /// compare a register against the immediate without having to materialize
  /// the immediate into a register.
  bool isLegalICmpImmediate(int64_t Imm) const override;

  /// Return true if the specified immediate is a legal
  /// add immediate, that is, the target has add instructions which can
  /// add a register and the immediate without having to materialize
  /// the immediate into a register.
  bool isLegalAddImmediate(int64_t Imm) const override;

  bool isLegalStoreImmediate(int64_t Imm) const override;

  /// Return the cost of the scaling factor used in the addressing
  /// mode represented by AM for this target, for a load/store
  /// of the specified type.
  /// If the AM is supported, the return value must be >= 0.
  /// If the AM is not supported, it returns a negative value.
  InstructionCost getScalingFactorCost(const DataLayout &DL,
                                       const AddrMode &AM, Type *Ty,
                                       unsigned AS) const override;

  /// This is used to enable splatted operand transforms for vector shifts
  /// and vector funnel shifts.
  bool isVectorShiftByScalarCheap(Type *Ty) const override;

  /// Add x86-specific opcodes to the default list.
  bool isBinOp(unsigned Opcode) const override;

  /// Returns true if the opcode is a commutative binary operation.
  bool isCommutativeBinOp(unsigned Opcode) const override;

  /// Return true if it's free to truncate a value of
  /// type Ty1 to type Ty2. e.g. On x86 it's free to truncate an i32 value in
  /// register EAX to i16 by referencing its sub-register AX.
  bool isTruncateFree(Type *Ty1, Type *Ty2) const override;
  bool isTruncateFree(EVT VT1, EVT VT2) const override;

  bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override;

  /// Return true if any actual instruction that defines a
  /// value of type Ty1 implicitly zero-extends the value to Ty2 in the
  /// result register. This does not necessarily include registers defined
  /// in unknown ways, such as incoming arguments, or copies from unknown
  /// virtual registers. Also, if isTruncateFree(Ty2, Ty1) is true, this
  /// does not necessarily apply to truncate instructions. e.g. on x86-64,
  /// all instructions that define 32-bit values implicitly zero-extend the
  /// result out to 64 bits.
  bool isZExtFree(Type *Ty1, Type *Ty2) const override;
  bool isZExtFree(EVT VT1, EVT VT2) const override;
  bool isZExtFree(SDValue Val, EVT VT2) const override;

  bool shouldSinkOperands(Instruction *I,
                          SmallVectorImpl<Use *> &Ops) const override;
  bool shouldConvertPhiType(Type *From, Type *To) const override;

  /// Return true if folding a vector load into ExtVal (a sign, zero, or any
  /// extend node) is profitable.
  bool isVectorLoadExtDesirable(SDValue) const override;

  /// Return true if an FMA operation is faster than a pair of fmul and fadd
  /// instructions. fmuladd intrinsics will be expanded to FMAs when this
  /// method returns true; otherwise fmuladd is expanded to fmul + fadd.
  bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
                                  EVT VT) const override;

  /// Return true if it's profitable to narrow
  /// operations of type VT1 to VT2. e.g. on x86, it's profitable to narrow
  /// from i32 to i8 but not from i32 to i16.
  bool isNarrowingProfitable(EVT VT1, EVT VT2) const override;

  /// Given an intrinsic, checks if on the target the intrinsic will need to
  /// map to a MemIntrinsicNode (touches memory). If this is the case, it
  /// returns true and stores the intrinsic information into the
  /// IntrinsicInfo that was passed to the function.
  bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I,
                          MachineFunction &MF,
                          unsigned Intrinsic) const override;

  /// Returns true if the target can instruction select the
  /// specified FP immediate natively. If false, the legalizer will
  /// materialize the FP immediate as a load from a constant pool.
  bool isFPImmLegal(const APFloat &Imm, EVT VT,
                    bool ForCodeSize) const override;

  /// Targets can use this to indicate that they only support *some*
  /// VECTOR_SHUFFLE operations, those with specific masks. By default, if a
  /// target supports the VECTOR_SHUFFLE node, all mask values are assumed to
  /// be legal.
  bool isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const override;

  /// Similar to isShuffleMaskLegal. Targets can use this to indicate if
  /// there is a suitable VECTOR_SHUFFLE that can be used to replace a VAND
  /// with a constant pool entry.
  bool isVectorClearMaskLegal(ArrayRef<int> Mask, EVT VT) const override;

  /// Returns true if lowering to a jump table is allowed.
  bool areJTsAllowed(const Function *Fn) const override;

  /// If true, then instruction selection should
  /// seek to shrink the FP constant of the specified type to a smaller type
  /// in order to save space and / or reduce runtime.
  bool ShouldShrinkFPConstant(EVT VT) const override {
    // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
    // expensive than a straight movsd. On the other hand, it's important to
    // shrink long double fp constant since fldt is very slow.
    return !X86ScalarSSEf64 || VT == MVT::f80;
  }

  /// Return true if we believe it is correct and profitable to reduce the
  /// load node to a smaller type.
  bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy,
                             EVT NewVT) const override;

  /// Return true if the specified scalar FP type is computed in an SSE
  /// register, not on the X87 floating point stack.
  bool isScalarFPTypeInSSEReg(EVT VT) const {
    return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 needs SSE2
           (VT == MVT::f32 && X86ScalarSSEf32) || // f32 needs SSE1
           (VT == MVT::f16 && X86ScalarSSEf16);   // f16 needs AVX512FP16
  }

  /// Returns true if it is beneficial to convert a load of a constant
  /// to just the constant itself.
  bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                         Type *Ty) const override;

  bool reduceSelectOfFPConstantLoads(EVT CmpOpVT) const override;

  bool convertSelectOfConstantsToMath(EVT VT) const override;

  bool decomposeMulByConstant(LLVMContext &Context, EVT VT,
                              SDValue C) const override;
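
  // For example, a multiply by 5 can be decomposed into a single LEA,
  // roughly (AT&T syntax):
  //
  //   leaq (%rdi,%rdi,4), %rax    # rax = rdi + rdi*4 = rdi*5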

  /// Return true if EXTRACT_SUBVECTOR is cheap for this result type
  /// with this index.
  bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
                               unsigned Index) const override;

  /// Scalar ops always have equal or better analysis/performance/power than
  /// the vector equivalent, so this always makes sense if the scalar op is
  /// supported.
  bool shouldScalarizeBinop(SDValue) const override;

  /// Extract of a scalar FP value from index 0 of a vector is free.
  bool isExtractVecEltCheap(EVT VT, unsigned Index) const override {
    EVT EltVT = VT.getScalarType();
    return (EltVT == MVT::f32 || EltVT == MVT::f64) && Index == 0;
  }

  /// Overflow nodes should get combined/lowered to optimal instructions
  /// (they should allow eliminating explicit compares by getting flags from
  /// math ops).
  bool shouldFormOverflowOp(unsigned Opcode, EVT VT,
                            bool MathUsed) const override;

  bool storeOfVectorConstantIsCheap(EVT MemVT, unsigned NumElem,
                                    unsigned AddrSpace) const override {
    // If we can replace more than 2 scalar stores, there will be a reduction
    // in instructions even after we add a vector constant load.
    return NumElem > 2;
  }

  bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
                               const SelectionDAG &DAG,
                               const MachineMemOperand &MMO) const override;

  /// Intel processors have a unified instruction and data cache.
  const char *getClearCacheBuiltinName() const override {
    return nullptr; // nothing to do, move along.
  }

  Register getRegisterByName(const char *RegName, LLT VT,
                             const MachineFunction &MF) const override;

  /// If a physical register, this returns the register that receives the
  /// exception address on entry to an EH pad.
  Register
  getExceptionPointerRegister(const Constant *PersonalityFn) const override;

  /// If a physical register, this returns the register that receives the
  /// exception typeid on entry to a landing pad.
  Register
  getExceptionSelectorRegister(const Constant *PersonalityFn) const override;

  bool needsFixedCatchObjects() const override;

  /// This method returns a target specific FastISel object,
  /// or null if the target does not support "fast" ISel.
  FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
                           const TargetLibraryInfo *libInfo) const override;

  /// If the target has a standard location for the stack protector cookie,
  /// returns the address of that location. Otherwise, returns nullptr.
  Value *getIRStackGuard(IRBuilderBase &IRB) const override;

  bool useLoadStackGuardNode() const override;
  bool useStackGuardXorFP() const override;
  void insertSSPDeclarations(Module &M) const override;
  Value *getSDagStackGuard(const Module &M) const override;
  Function *getSSPStackGuardCheck(const Module &M) const override;
  SDValue emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
                              const SDLoc &DL) const override;

  /// Return the location of the SafeStack pointer, which for X86 is stored
  /// at a fixed offset in a non-standard address space.
  Value *getSafeStackPointerLocation(IRBuilderBase &IRB) const override;

  std::pair<SDValue, SDValue> BuildFILD(EVT DstVT, EVT SrcVT, const SDLoc &DL,
                                        SDValue Chain, SDValue Pointer,
                                        MachinePointerInfo PtrInfo,
                                        Align Alignment,
                                        SelectionDAG &DAG) const;

  /// Customize the preferred legalization strategy for certain types.
  LegalizeTypeAction getPreferredVectorAction(MVT VT) const override;

  bool softPromoteHalfType() const override { return true; }

  MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC,
                                    EVT VT) const override;

  unsigned getNumRegistersForCallingConv(LLVMContext &Context,
                                         CallingConv::ID CC,
                                         EVT VT) const override;

  unsigned getVectorTypeBreakdownForCallingConv(
      LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
      unsigned &NumIntermediates, MVT &RegisterVT) const override;

  bool isIntDivCheap(EVT VT, AttributeList Attr) const override;

  bool supportSwiftError() const override;

  bool hasStackProbeSymbol(MachineFunction &MF) const override;
  bool hasInlineStackProbe(MachineFunction &MF) const override;
  StringRef getStackProbeSymbolName(MachineFunction &MF) const override;

  unsigned getStackProbeSize(MachineFunction &MF) const;

  bool hasVectorBlend() const override { return true; }

  unsigned getMaxSupportedInterleaveFactor() const override { return 4; }

  /// Lower interleaved load(s) into target specific
  /// instructions/intrinsics.
  bool lowerInterleavedLoad(LoadInst *LI,
                            ArrayRef<ShuffleVectorInst *> Shuffles,
                            ArrayRef<unsigned> Indices,
                            unsigned Factor) const override;

  /// Lower interleaved store(s) into target specific
  /// instructions/intrinsics.
  bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
                             unsigned Factor) const override;

  SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value,
                                 SDValue Addr,
                                 SelectionDAG &DAG) const override;

  Align getPrefLoopAlignment(MachineLoop *ML) const override;

protected:
  std::pair<const TargetRegisterClass *, uint8_t>
  findRepresentativeClass(const TargetRegisterInfo *TRI,
                          MVT VT) const override;

private:
  /// Keep a reference to the X86Subtarget around so that we can
  /// make the right decision when generating code for different targets.
  const X86Subtarget &Subtarget;

  /// Select between SSE or x87 floating point ops.
  /// When SSE is available, use it for f32 operations.
  /// When SSE2 is available, use it for f64 operations.
  /// When AVX512FP16 is available, use it for f16 operations.
  bool X86ScalarSSEf32;
  bool X86ScalarSSEf64;
  bool X86ScalarSSEf16;

  /// A list of legal FP immediates.
  std::vector<APFloat> LegalFPImmediates;

  /// Indicate that this x86 target can instruction
  /// select the specified FP immediate natively.
  void addLegalFPImmediate(const APFloat &Imm) {
    LegalFPImmediates.push_back(Imm);
  }

  SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
                          CallingConv::ID CallConv, bool isVarArg,
                          const SmallVectorImpl<ISD::InputArg> &Ins,
                          const SDLoc &dl, SelectionDAG &DAG,
                          SmallVectorImpl<SDValue> &InVals,
                          uint32_t *RegMask) const;
  SDValue LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
                           const SmallVectorImpl<ISD::InputArg> &ArgInfo,
                           const SDLoc &dl, SelectionDAG &DAG,
                           const CCValAssign &VA, MachineFrameInfo &MFI,
                           unsigned i) const;
  SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg,
                           const SDLoc &dl, SelectionDAG &DAG,
                           const CCValAssign &VA,
                           ISD::ArgFlagsTy Flags, bool isByval) const;

  // Call lowering helpers.

  /// Check whether the call is eligible for tail call optimization. Targets
  /// that want to do tail call optimization should implement this function.
  bool IsEligibleForTailCallOptimization(
      SDValue Callee, CallingConv::ID CalleeCC, bool IsCalleeStackStructRet,
      bool isVarArg, Type *RetTy, const SmallVectorImpl<ISD::OutputArg> &Outs,
      const SmallVectorImpl<SDValue> &OutVals,
      const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const;
  SDValue EmitTailCallLoadRetAddr(SelectionDAG &DAG, SDValue &OutRetAddr,
                                  SDValue Chain, bool IsTailCall,
                                  bool Is64Bit, int FPDiff,
                                  const SDLoc &dl) const;

  unsigned GetAlignedArgumentStackSize(unsigned StackSize,
                                       SelectionDAG &DAG) const;

  unsigned getAddressSpace() const;

  SDValue FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned,
                          SDValue &Chain) const;
  SDValue LRINT_LLRINTHelper(SDNode *N, SelectionDAG &DAG) const;

  SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerVSELECT(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;

  unsigned getGlobalWrapperKind(const GlobalValue *GV = nullptr,
                                const unsigned char OpFlags = 0) const;
  SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const;

  /// Creates target global address or external symbol nodes for calls or
  /// other uses.
  SDValue LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
                                bool ForCall) const;

  SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerLRINT_LLRINT(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerADDROFRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const;
  SDValue lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const;
  SDValue lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const;
  SDValue lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerWin64_FP_TO_INT128(SDValue Op, SelectionDAG &DAG,
                                  SDValue &Chain) const;
  SDValue LowerWin64_INT128_TO_FP(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerGC_TRANSITION(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
  SDValue lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;

  SDValue
  LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
                       const SmallVectorImpl<ISD::InputArg> &Ins,
                       const SDLoc &dl, SelectionDAG &DAG,
                       SmallVectorImpl<SDValue> &InVals) const override;
  SDValue LowerCall(CallLoweringInfo &CLI,
                    SmallVectorImpl<SDValue> &InVals) const override;

  SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
                      const SmallVectorImpl<ISD::OutputArg> &Outs,
                      const SmallVectorImpl<SDValue> &OutVals,
                      const SDLoc &dl, SelectionDAG &DAG) const override;

  bool supportSplitCSR(MachineFunction *MF) const override {
    return MF->getFunction().getCallingConv() == CallingConv::CXX_FAST_TLS &&
           MF->getFunction().hasFnAttribute(Attribute::NoUnwind);
  }
  void initializeSplitCSR(MachineBasicBlock *Entry) const override;
  void insertCopiesSplitCSR(
      MachineBasicBlock *Entry,
      const SmallVectorImpl<MachineBasicBlock *> &Exits) const override;

  bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override;

  bool mayBeEmittedAsTailCall(const CallInst *CI) const override;

  EVT getTypeForExtReturn(LLVMContext &Context, EVT VT,
                          ISD::NodeType ExtendKind) const override;

  bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
                      bool isVarArg,
                      const SmallVectorImpl<ISD::OutputArg> &Outs,
                      LLVMContext &Context) const override;

  const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override;

  TargetLoweringBase::AtomicExpansionKind
  shouldExpandAtomicLoadInIR(LoadInst *LI) const override;
  bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
  TargetLoweringBase::AtomicExpansionKind
  shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;

  LoadInst *
  lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override;

  bool lowerAtomicStoreAsStoreSDNode(const StoreInst &SI) const override;
  bool lowerAtomicLoadAsLoadSDNode(const LoadInst &LI) const override;

  bool needsCmpXchgNb(Type *MemType) const;

  void SetupEntryBlockForSjLj(MachineInstr &MI, MachineBasicBlock *MBB,
                              MachineBasicBlock *DispatchBB, int FI) const;

  // Utility function to emit the low-level va_arg code for X86-64.
  MachineBasicBlock *
  EmitVAARGWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const;

  MachineBasicBlock *EmitLoweredCascadedSelect(MachineInstr &MI1,
                                               MachineInstr &MI2,
                                               MachineBasicBlock *BB) const;

  MachineBasicBlock *EmitLoweredSelect(MachineInstr &I,
                                       MachineBasicBlock *BB) const;

  MachineBasicBlock *EmitLoweredCatchRet(MachineInstr &MI,
                                         MachineBasicBlock *BB) const;

  MachineBasicBlock *EmitLoweredSegAlloca(MachineInstr &MI,
                                          MachineBasicBlock *BB) const;

  MachineBasicBlock *EmitLoweredProbedAlloca(MachineInstr &MI,
                                             MachineBasicBlock *BB) const;

  MachineBasicBlock *EmitLoweredTLSAddr(MachineInstr &MI,
                                        MachineBasicBlock *BB) const;

  MachineBasicBlock *EmitLoweredTLSCall(MachineInstr &MI,
                                        MachineBasicBlock *BB) const;

  MachineBasicBlock *EmitLoweredIndirectThunk(MachineInstr &MI,
                                              MachineBasicBlock *BB) const;

  MachineBasicBlock *emitEHSjLjSetJmp(MachineInstr &MI,
                                      MachineBasicBlock *MBB) const;

  void emitSetJmpShadowStackFix(MachineInstr &MI,
                                MachineBasicBlock *MBB) const;

  MachineBasicBlock *emitEHSjLjLongJmp(MachineInstr &MI,
                                       MachineBasicBlock *MBB) const;

  MachineBasicBlock *emitLongJmpShadowStackFix(MachineInstr &MI,
                                               MachineBasicBlock *MBB) const;

  MachineBasicBlock *EmitSjLjDispatchBlock(MachineInstr &MI,
                                           MachineBasicBlock *MBB) const;

  /// Emit flags for the given setcc condition and operands. Also returns the
  /// corresponding X86 condition code constant in X86CC.
  SDValue emitFlagsForSetcc(SDValue Op0, SDValue Op1, ISD::CondCode CC,
                            const SDLoc &dl, SelectionDAG &DAG,
                            SDValue &X86CC) const;

  /// Check if replacement of SQRT with RSQRT should be disabled.
  bool isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const override;

  /// Use rsqrt* to speed up sqrt calculations.
  SDValue getSqrtEstimate(SDValue Op, SelectionDAG &DAG, int Enabled,
                          int &RefinementSteps, bool &UseOneConstNR,
                          bool Reciprocal) const override;

  /// Use rcp* to speed up fdiv calculations.
  SDValue getRecipEstimate(SDValue Op, SelectionDAG &DAG, int Enabled,
                           int &RefinementSteps) const override;

  /// Reassociate floating point divisions into multiply by reciprocal.
  unsigned combineRepeatedFPDivisors() const override;

  SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
                        SmallVectorImpl<SDNode *> &Created) const override;
};

namespace X86 {
FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
                         const TargetLibraryInfo *libInfo);
} // end namespace X86

// X86 specific Gather/Scatter nodes.
// The class has the same order of operands as MaskedGatherScatterSDNode for
// convenience.
class X86MaskedGatherScatterSDNode : public MemIntrinsicSDNode {
public:
  // This is intended as a utility and should never be directly created.
  X86MaskedGatherScatterSDNode() = delete;
  ~X86MaskedGatherScatterSDNode() = delete;

  const SDValue &getBasePtr() const { return getOperand(3); }
  const SDValue &getIndex() const { return getOperand(4); }
  const SDValue &getMask() const { return getOperand(2); }
  const SDValue &getScale() const { return getOperand(5); }

  static bool classof(const SDNode *N) {
    return N->getOpcode() == X86ISD::MGATHER ||
           N->getOpcode() == X86ISD::MSCATTER;
  }
};

class X86MaskedGatherSDNode : public X86MaskedGatherScatterSDNode {
public:
  const SDValue &getPassThru() const { return getOperand(1); }

  static bool classof(const SDNode *N) {
    return N->getOpcode() == X86ISD::MGATHER;
  }
};

class X86MaskedScatterSDNode : public X86MaskedGatherScatterSDNode {
public:
  const SDValue &getValue() const { return getOperand(1); }

  static bool classof(const SDNode *N) {
    return N->getOpcode() == X86ISD::MSCATTER;
  }
};
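
// A minimal usage sketch for the node classes above (N is assumed to be some
// SDNode under inspection):
//
//   if (auto *Gather = dyn_cast<X86MaskedGatherSDNode>(N)) {
//     SDValue Ptr  = Gather->getBasePtr();
//     SDValue Mask = Gather->getMask();
//     // ... inspect Gather->getIndex() / Gather->getScale() as needed.
//   }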

/// Generate unpacklo/unpackhi shuffle mask.
void createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask, bool Lo,
                             bool Unary);

/// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
/// imposed by AVX and specific to the unary pattern. Example:
/// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
/// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
void createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask, bool Lo);

} // end namespace llvm

#endif // LLVM_LIB_TARGET_X86_X86ISELLOWERING_H