//===-- X86InstrSSE.td - SSE Instruction Set ---------------*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file describes the X86 SSE instruction set, defining the instructions
// and the instruction properties needed for code generation, machine code
// emission, and analysis.
//
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
// SSE 1 & 2 Instructions Classes
//===----------------------------------------------------------------------===//

/// sse12_fp_scalar - SSE 1 & 2 scalar instructions class
multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
                           RegisterClass RC, X86MemOperand x86memop,
                           Domain d, X86FoldableSchedWrite sched,
                           bit Is2Addr = 1> {
let isCodeGenOnly = 1 in {
  let isCommutable = 1 in {
    def rr : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
                !if(Is2Addr,
                    !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                    !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                [(set RC:$dst, (OpNode RC:$src1, RC:$src2))], d>,
                Sched<[sched]>;
  }
  def rm : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
              !if(Is2Addr,
                  !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                  !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
              [(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))], d>,
              Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}

/// sse12_fp_scalar_int - SSE 1 & 2 scalar instructions intrinsics class
multiclass sse12_fp_scalar_int<bits<8> opc,
                               SDPatternOperator OpNode, RegisterClass RC,
                               ValueType VT, string asm, Operand memopr,
                               PatFrags mem_frags, Domain d,
                               X86FoldableSchedWrite sched, bit Is2Addr = 1> {
let hasSideEffects = 0 in {
  def rr_Int : SI_Int<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
                      !if(Is2Addr,
                          !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
                          !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                      [(set RC:$dst, (VT (OpNode RC:$src1, RC:$src2)))], d>,
                      Sched<[sched]>;
  let mayLoad = 1 in
  def rm_Int : SI_Int<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, memopr:$src2),
                      !if(Is2Addr,
                          !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
                          !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                      [(set RC:$dst, (VT (OpNode RC:$src1, (mem_frags addr:$src2))))], d>,
                      Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}

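// As an illustration of how these multiclasses are used (a hypothetical
// instantiation, not a record defined in this file), something like
//   defm ADD : sse12_fp_scalar<0x58, "addss", fadd, FR32, f32mem,
//                              SSEPackedSingle, WriteFAdd>;
// would expand to ADDrr (reg/reg) and ADDrm (reg/mem, with the load folded
// into the arithmetic), using the two-operand "$src2, $dst" asm string
// because Is2Addr defaults to 1.
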
/// sse12_fp_packed - SSE 1 & 2 packed instructions class
multiclass sse12_fp_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
                           RegisterClass RC, ValueType vt,
                           X86MemOperand x86memop, PatFrag mem_frag,
                           Domain d, X86FoldableSchedWrite sched,
                           bit Is2Addr = 1> {
  let isCommutable = 1 in
  def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
              !if(Is2Addr,
                  !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                  !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
              [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], d>,
              Sched<[sched]>;
  let mayLoad = 1 in
  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
              !if(Is2Addr,
                  !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                  !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
              [(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))],
              d>,
              Sched<[sched.Folded, sched.ReadAfterFold]>;
}

/// sse12_fp_packed_logical_rm - SSE 1 & 2 packed instructions class
multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d,
                                      string OpcodeStr, X86MemOperand x86memop,
                                      X86FoldableSchedWrite sched,
                                      list<dag> pat_rr, list<dag> pat_rm,
                                      bit Is2Addr = 1> {
  let isCommutable = 1, hasSideEffects = 0 in
  def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
              !if(Is2Addr,
                  !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                  !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
              pat_rr, d>,
              Sched<[sched]>;
  let hasSideEffects = 0, mayLoad = 1 in
  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
              !if(Is2Addr,
                  !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                  !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
              pat_rm, d>,
              Sched<[sched.Folded, sched.ReadAfterFold]>;
}


// Alias instructions that map fld0 to xorps for sse or vxorps for avx.
// This is expanded by ExpandPostRAPseudos.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, SchedRW = [WriteZero] in {
  def FsFLD0SH : I<0, Pseudo, (outs FR16:$dst), (ins), "",
                   [(set FR16:$dst, fp16imm0)]>, Requires<[HasSSE2, NoAVX512]>;
  def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "",
                   [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1, NoAVX512]>;
  def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "",
                   [(set FR64:$dst, fp64imm0)]>, Requires<[HasSSE2, NoAVX512]>;
  def FsFLD0F128 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
                     [(set VR128:$dst, fp128imm0)]>, Requires<[HasSSE1, NoAVX512]>;
}

//===----------------------------------------------------------------------===//
// AVX & SSE - Zero/One Vectors
//===----------------------------------------------------------------------===//

// Alias instruction that maps zero vector to pxor / xorp* for sse.
// This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then
// swizzled by ExecutionDomainFix to pxor.
// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-zeros value if folding it would be beneficial.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, Predicates = [NoAVX512], SchedRW = [WriteZero] in {
def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
               [(set VR128:$dst, (v4f32 immAllZerosV))]>;
}

let Predicates = [NoAVX512] in {
def : Pat<(v16i8 immAllZerosV), (V_SET0)>;
def : Pat<(v8i16 immAllZerosV), (V_SET0)>;
def : Pat<(v8f16 immAllZerosV), (V_SET0)>;
def : Pat<(v4i32 immAllZerosV), (V_SET0)>;
def : Pat<(v2i64 immAllZerosV), (V_SET0)>;
def : Pat<(v2f64 immAllZerosV), (V_SET0)>;
}

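// For reference: after ExpandPostRAPseudos, a V_SET0 of %xmm0 is emitted as
// "xorps %xmm0, %xmm0", a zeroing idiom that modern cores recognize as
// dependency-breaking, so no read of the old register value occurs.
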
// The same as done above but for AVX. The 256-bit AVX1 ISA doesn't support PI,
// and doesn't need it because on Sandy Bridge the register is set to zero
// at the rename stage without using any execution unit, so SET0PSY
// and SET0PDY can be used for vector int instructions without penalty.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, Predicates = [NoAVX512], SchedRW = [WriteZero] in {
def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "",
                 [(set VR256:$dst, (v8i32 immAllZerosV))]>;
}

let Predicates = [NoAVX512] in {
def : Pat<(v32i8 immAllZerosV), (AVX_SET0)>;
def : Pat<(v16i16 immAllZerosV), (AVX_SET0)>;
def : Pat<(v16f16 immAllZerosV), (AVX_SET0)>;
def : Pat<(v4i64 immAllZerosV), (AVX_SET0)>;
def : Pat<(v8f32 immAllZerosV), (AVX_SET0)>;
def : Pat<(v4f64 immAllZerosV), (AVX_SET0)>;
}

// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-ones value if folding it would be beneficial.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, SchedRW = [WriteZero] in {
  def V_SETALLONES : I<0, Pseudo, (outs VR128:$dst), (ins), "",
                       [(set VR128:$dst, (v4i32 immAllOnesV))]>;
  let Predicates = [HasAVX1Only, OptForMinSize] in {
  def AVX1_SETALLONES: I<0, Pseudo, (outs VR256:$dst), (ins), "",
                         [(set VR256:$dst, (v8i32 immAllOnesV))]>;
  }
  let Predicates = [HasAVX2] in
  def AVX2_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "",
                          [(set VR256:$dst, (v8i32 immAllOnesV))]>;
}

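// For reference: the all-ones pseudos are expanded to a compare-equal of a
// register with itself (e.g. "pcmpeqd %xmm0, %xmm0"), which yields all ones
// in every lane regardless of the register's prior contents.
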
192//===----------------------------------------------------------------------===// 193 194multiclass sse12_move_rr<SDNode OpNode, ValueType vt, string base_opc, 195 string asm_opr, Domain d, string Name> { 196 let isCommutable = 1 in 197 def rr : SI<0x10, MRMSrcReg, (outs VR128:$dst), 198 (ins VR128:$src1, VR128:$src2), 199 !strconcat(base_opc, asm_opr), 200 [(set VR128:$dst, (vt (OpNode VR128:$src1, VR128:$src2)))], d>, 201 Sched<[SchedWriteFShuffle.XMM]>; 202 203 // For the disassembler 204 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in 205 def rr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst), 206 (ins VR128:$src1, VR128:$src2), 207 !strconcat(base_opc, asm_opr), []>, 208 Sched<[SchedWriteFShuffle.XMM]>, FoldGenData<Name#rr>; 209} 210 211multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt, 212 X86MemOperand x86memop, string OpcodeStr, 213 Domain d, string Name, Predicate pred> { 214 // AVX 215 let Predicates = [UseAVX, OptForSize] in 216 defm V#NAME : sse12_move_rr<OpNode, vt, OpcodeStr, 217 "\t{$src2, $src1, $dst|$dst, $src1, $src2}", d, 218 "V"#Name>, 219 VEX_4V, VEX_LIG, VEX_WIG; 220 221 def V#NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src), 222 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 223 [(store RC:$src, addr:$dst)], d>, 224 VEX, VEX_LIG, Sched<[WriteFStore]>, VEX_WIG; 225 // SSE1 & 2 226 let Constraints = "$src1 = $dst" in { 227 let Predicates = [pred, NoSSE41_Or_OptForSize] in 228 defm NAME : sse12_move_rr<OpNode, vt, OpcodeStr, 229 "\t{$src2, $dst|$dst, $src2}", d, Name>; 230 } 231 232 def NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src), 233 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 234 [(store RC:$src, addr:$dst)], d>, 235 Sched<[WriteFStore]>; 236 237 def : InstAlias<"v"#OpcodeStr#".s\t{$src2, $src1, $dst|$dst, $src1, $src2}", 238 (!cast<Instruction>("V"#NAME#"rr_REV") 239 VR128:$dst, VR128:$src1, VR128:$src2), 0>; 240 def : InstAlias<OpcodeStr#".s\t{$src2, $dst|$dst, $src2}", 241 (!cast<Instruction>(NAME#"rr_REV") 242 VR128:$dst, VR128:$src2), 0>; 243} 244 245// Loading from memory automatically zeroing upper bits. 246multiclass sse12_move_rm<RegisterClass RC, ValueType vt, X86MemOperand x86memop, 247 PatFrag mem_pat, PatFrag vzloadfrag, string OpcodeStr, 248 Domain d> { 249 def V#NAME#rm : SI<0x10, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src), 250 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 251 [(set VR128:$dst, (vt (vzloadfrag addr:$src)))], d>, 252 VEX, VEX_LIG, Sched<[WriteFLoad]>, VEX_WIG; 253 def NAME#rm : SI<0x10, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src), 254 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 255 [(set VR128:$dst, (vt (vzloadfrag addr:$src)))], d>, 256 Sched<[WriteFLoad]>; 257 258 // _alt version uses FR32/FR64 register class. 
multiclass sse12_move_rr<SDNode OpNode, ValueType vt, string base_opc,
                         string asm_opr, Domain d, string Name> {
  let isCommutable = 1 in
  def rr : SI<0x10, MRMSrcReg, (outs VR128:$dst),
              (ins VR128:$src1, VR128:$src2),
              !strconcat(base_opc, asm_opr),
              [(set VR128:$dst, (vt (OpNode VR128:$src1, VR128:$src2)))], d>,
              Sched<[SchedWriteFShuffle.XMM]>;

  // For the disassembler
  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
  def rr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
                  (ins VR128:$src1, VR128:$src2),
                  !strconcat(base_opc, asm_opr), []>,
                  Sched<[SchedWriteFShuffle.XMM]>, FoldGenData<Name#rr>;
}

multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
                      X86MemOperand x86memop, string OpcodeStr,
                      Domain d, string Name, Predicate pred> {
  // AVX
  let Predicates = [UseAVX, OptForSize] in
  defm V#NAME : sse12_move_rr<OpNode, vt, OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}", d,
                              "V"#Name>,
                              VEX_4V, VEX_LIG, VEX_WIG;

  def V#NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(store RC:$src, addr:$dst)], d>,
                     VEX, VEX_LIG, Sched<[WriteFStore]>, VEX_WIG;
  // SSE1 & 2
  let Constraints = "$src1 = $dst" in {
    let Predicates = [pred, NoSSE41_Or_OptForSize] in
    defm NAME : sse12_move_rr<OpNode, vt, OpcodeStr,
                              "\t{$src2, $dst|$dst, $src2}", d, Name>;
  }

  def NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                   [(store RC:$src, addr:$dst)], d>,
                   Sched<[WriteFStore]>;

  def : InstAlias<"v"#OpcodeStr#".s\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                  (!cast<Instruction>("V"#NAME#"rr_REV")
                   VR128:$dst, VR128:$src1, VR128:$src2), 0>;
  def : InstAlias<OpcodeStr#".s\t{$src2, $dst|$dst, $src2}",
                  (!cast<Instruction>(NAME#"rr_REV")
                   VR128:$dst, VR128:$src2), 0>;
}

// Loading from memory automatically zeroing upper bits.
multiclass sse12_move_rm<RegisterClass RC, ValueType vt, X86MemOperand x86memop,
                         PatFrag mem_pat, PatFrag vzloadfrag, string OpcodeStr,
                         Domain d> {
  def V#NAME#rm : SI<0x10, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(set VR128:$dst, (vt (vzloadfrag addr:$src)))], d>,
                     VEX, VEX_LIG, Sched<[WriteFLoad]>, VEX_WIG;
  def NAME#rm : SI<0x10, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                   [(set VR128:$dst, (vt (vzloadfrag addr:$src)))], d>,
                   Sched<[WriteFLoad]>;

  // _alt version uses FR32/FR64 register class.
  let isCodeGenOnly = 1 in {
    def V#NAME#rm_alt : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
                           !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                           [(set RC:$dst, (mem_pat addr:$src))], d>,
                           VEX, VEX_LIG, Sched<[WriteFLoad]>, VEX_WIG;
    def NAME#rm_alt : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
                         !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                         [(set RC:$dst, (mem_pat addr:$src))], d>,
                         Sched<[WriteFLoad]>;
  }
}

defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss",
                        SSEPackedSingle, "MOVSS", UseSSE1>, XS;
defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd",
                        SSEPackedDouble, "MOVSD", UseSSE2>, XD;

let canFoldAsLoad = 1, isReMaterializable = 1 in {
  defm MOVSS : sse12_move_rm<FR32, v4f32, f32mem, loadf32, X86vzload32, "movss",
                             SSEPackedSingle>, XS;
  defm MOVSD : sse12_move_rm<FR64, v2f64, f64mem, loadf64, X86vzload64, "movsd",
                             SSEPackedDouble>, XD;
}

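// A note for the patterns below: a VEX-encoded write to an xmm register
// already zeroes bits 255:128 of the containing ymm register, so the
// SUBREG_TO_REG uses reinterpret the 128-bit result at a 256-bit type while
// asserting, rather than recomputing, that the upper half is zero.
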
// Patterns
let Predicates = [UseAVX] in {
  def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
            (VMOVSSrm addr:$src)>;
  def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
            (VMOVSDrm addr:$src)>;

  // Represent the same patterns above but in the form they appear for
  // 256-bit types
  def : Pat<(v8f32 (X86vzload32 addr:$src)),
            (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
  def : Pat<(v4f64 (X86vzload64 addr:$src)),
            (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
}

let Predicates = [UseAVX, OptForSize] in {
  // Move scalar to XMM zero-extended, zeroing a VR128 then do a
  // MOVSS to the lower bits.
  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
            (VMOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
            (VMOVSSrr (v4i32 (V_SET0)), VR128:$src)>;

  // Move low f32 and clear high bits.
  def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (v4f32 (VMOVSSrr (v4f32 (V_SET0)),
              (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)))), sub_xmm)>;
  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (v4i32 (VMOVSSrr (v4i32 (V_SET0)),
              (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)))), sub_xmm)>;
}

let Predicates = [UseSSE1, NoSSE41_Or_OptForSize] in {
// Move scalar to XMM zero-extended, zeroing a VR128 then do a
// MOVSS to the lower bits.
def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
          (MOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
          (MOVSSrr (v4i32 (V_SET0)), VR128:$src)>;
}

let Predicates = [UseSSE2] in
def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
          (MOVSDrm addr:$src)>;

let Predicates = [UseSSE1] in
def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
          (MOVSSrm addr:$src)>;

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Aligned/Unaligned FP Instructions
//===----------------------------------------------------------------------===//

multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC,
                            X86MemOperand x86memop, PatFrag ld_frag,
                            string asm, Domain d,
                            X86SchedWriteMoveLS sched> {
let hasSideEffects = 0, isMoveReg = 1 in
  def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], d>,
              Sched<[sched.RR]>;
let canFoldAsLoad = 1, isReMaterializable = 1 in
  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
              [(set RC:$dst, (ld_frag addr:$src))], d>,
              Sched<[sched.RM]>;
}

let Predicates = [HasAVX, NoVLX] in {
defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps",
                                SSEPackedSingle, SchedWriteFMoveLS.XMM>,
                                PS, VEX, VEX_WIG;
defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd",
                                SSEPackedDouble, SchedWriteFMoveLS.XMM>,
                                PD, VEX, VEX_WIG;
defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups",
                                SSEPackedSingle, SchedWriteFMoveLS.XMM>,
                                PS, VEX, VEX_WIG;
defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd",
                                SSEPackedDouble, SchedWriteFMoveLS.XMM>,
                                PD, VEX, VEX_WIG;

defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32, "movaps",
                                 SSEPackedSingle, SchedWriteFMoveLS.YMM>,
                                 PS, VEX, VEX_L, VEX_WIG;
defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64, "movapd",
                                 SSEPackedDouble, SchedWriteFMoveLS.YMM>,
                                 PD, VEX, VEX_L, VEX_WIG;
defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32, "movups",
                                 SSEPackedSingle, SchedWriteFMoveLS.YMM>,
                                 PS, VEX, VEX_L, VEX_WIG;
defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64, "movupd",
                                 SSEPackedDouble, SchedWriteFMoveLS.YMM>,
                                 PD, VEX, VEX_L, VEX_WIG;
}

let Predicates = [UseSSE1] in {
defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps",
                               SSEPackedSingle, SchedWriteFMoveLS.XMM>,
                               PS;
defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups",
                               SSEPackedSingle, SchedWriteFMoveLS.XMM>,
                               PS;
}
let Predicates = [UseSSE2] in {
defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd",
                               SSEPackedDouble, SchedWriteFMoveLS.XMM>,
                               PD;
defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd",
                               SSEPackedDouble, SchedWriteFMoveLS.XMM>,
                               PD;
}

let Predicates = [HasAVX, NoVLX] in {
let SchedRW = [SchedWriteFMoveLS.XMM.MR] in {
def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                     "movaps\t{$src, $dst|$dst, $src}",
                     [(alignedstore (v4f32 VR128:$src), addr:$dst)]>,
                     VEX, VEX_WIG;
def VMOVAPDmr : VPDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                     "movapd\t{$src, $dst|$dst, $src}",
                     [(alignedstore (v2f64 VR128:$src), addr:$dst)]>,
                     VEX, VEX_WIG;
def VMOVUPSmr : VPSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                     "movups\t{$src, $dst|$dst, $src}",
                     [(store (v4f32 VR128:$src), addr:$dst)]>,
                     VEX, VEX_WIG;
def VMOVUPDmr : VPDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                     "movupd\t{$src, $dst|$dst, $src}",
                     [(store (v2f64 VR128:$src), addr:$dst)]>,
                     VEX, VEX_WIG;
} // SchedRW

let SchedRW = [SchedWriteFMoveLS.YMM.MR] in {
def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                      "movaps\t{$src, $dst|$dst, $src}",
                      [(alignedstore (v8f32 VR256:$src), addr:$dst)]>,
                      VEX, VEX_L, VEX_WIG;
def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                      "movapd\t{$src, $dst|$dst, $src}",
                      [(alignedstore (v4f64 VR256:$src), addr:$dst)]>,
                      VEX, VEX_L, VEX_WIG;
def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                      "movups\t{$src, $dst|$dst, $src}",
                      [(store (v8f32 VR256:$src), addr:$dst)]>,
                      VEX, VEX_L, VEX_WIG;
def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                      "movupd\t{$src, $dst|$dst, $src}",
                      [(store (v4f64 VR256:$src), addr:$dst)]>,
                      VEX, VEX_L, VEX_WIG;
} // SchedRW
} // Predicate

// For disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
    isMoveReg = 1 in {
let SchedRW = [SchedWriteFMoveLS.XMM.RR] in {
  def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movaps\t{$src, $dst|$dst, $src}", []>,
                           VEX, VEX_WIG, FoldGenData<"VMOVAPSrr">;
  def VMOVAPDrr_REV : VPDI<0x29, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movapd\t{$src, $dst|$dst, $src}", []>,
                           VEX, VEX_WIG, FoldGenData<"VMOVAPDrr">;
  def VMOVUPSrr_REV : VPSI<0x11, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movups\t{$src, $dst|$dst, $src}", []>,
                           VEX, VEX_WIG, FoldGenData<"VMOVUPSrr">;
  def VMOVUPDrr_REV : VPDI<0x11, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movupd\t{$src, $dst|$dst, $src}", []>,
                           VEX, VEX_WIG, FoldGenData<"VMOVUPDrr">;
} // SchedRW

let SchedRW = [SchedWriteFMoveLS.YMM.RR] in {
  def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movaps\t{$src, $dst|$dst, $src}", []>,
                            VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVAPSYrr">;
  def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movapd\t{$src, $dst|$dst, $src}", []>,
                            VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVAPDYrr">;
  def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movups\t{$src, $dst|$dst, $src}", []>,
                            VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVUPSYrr">;
  def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movupd\t{$src, $dst|$dst, $src}", []>,
                            VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVUPDYrr">;
} // SchedRW
} // Predicate

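// Background for the _REV records: a register-to-register "movaps %xmm0,
// %xmm1" has two valid encodings, opcode 0x28 with the destination in
// ModRM.reg or opcode 0x29 with the destination in ModRM.rm. Codegen only
// emits the former; the _REV records let the disassembler represent the
// latter.
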
// Reversed version with ".s" suffix for GAS compatibility.
def : InstAlias<"vmovaps.s\t{$src, $dst|$dst, $src}",
                (VMOVAPSrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovapd.s\t{$src, $dst|$dst, $src}",
                (VMOVAPDrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovups.s\t{$src, $dst|$dst, $src}",
                (VMOVUPSrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovupd.s\t{$src, $dst|$dst, $src}",
                (VMOVUPDrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovaps.s\t{$src, $dst|$dst, $src}",
                (VMOVAPSYrr_REV VR256:$dst, VR256:$src), 0>;
def : InstAlias<"vmovapd.s\t{$src, $dst|$dst, $src}",
                (VMOVAPDYrr_REV VR256:$dst, VR256:$src), 0>;
def : InstAlias<"vmovups.s\t{$src, $dst|$dst, $src}",
                (VMOVUPSYrr_REV VR256:$dst, VR256:$src), 0>;
def : InstAlias<"vmovupd.s\t{$src, $dst|$dst, $src}",
                (VMOVUPDYrr_REV VR256:$dst, VR256:$src), 0>;

let SchedRW = [SchedWriteFMoveLS.XMM.MR] in {
def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movaps\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v4f32 VR128:$src), addr:$dst)]>;
def MOVAPDmr : PDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movapd\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v2f64 VR128:$src), addr:$dst)]>;
def MOVUPSmr : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movups\t{$src, $dst|$dst, $src}",
                   [(store (v4f32 VR128:$src), addr:$dst)]>;
def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movupd\t{$src, $dst|$dst, $src}",
                   [(store (v2f64 VR128:$src), addr:$dst)]>;
} // SchedRW

// For disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
    isMoveReg = 1, SchedRW = [SchedWriteFMoveLS.XMM.RR] in {
  def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movaps\t{$src, $dst|$dst, $src}", []>,
                         FoldGenData<"MOVAPSrr">;
  def MOVAPDrr_REV : PDI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movapd\t{$src, $dst|$dst, $src}", []>,
                         FoldGenData<"MOVAPDrr">;
  def MOVUPSrr_REV : PSI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movups\t{$src, $dst|$dst, $src}", []>,
                         FoldGenData<"MOVUPSrr">;
  def MOVUPDrr_REV : PDI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movupd\t{$src, $dst|$dst, $src}", []>,
                         FoldGenData<"MOVUPDrr">;
}

// Reversed version with ".s" suffix for GAS compatibility.
def : InstAlias<"movaps.s\t{$src, $dst|$dst, $src}",
                (MOVAPSrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"movapd.s\t{$src, $dst|$dst, $src}",
                (MOVAPDrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"movups.s\t{$src, $dst|$dst, $src}",
                (MOVUPSrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"movupd.s\t{$src, $dst|$dst, $src}",
                (MOVUPDrr_REV VR128:$dst, VR128:$src), 0>;

let Predicates = [HasAVX, NoVLX] in {
  // 256-bit load/store need to use floating point load/store in case we don't
  // have AVX2. Execution domain fixing will convert to integer if AVX2 is
  // available and changing the domain is beneficial.
  def : Pat<(alignedloadv4i64 addr:$src),
            (VMOVAPSYrm addr:$src)>;
  def : Pat<(alignedloadv8i32 addr:$src),
            (VMOVAPSYrm addr:$src)>;
  def : Pat<(alignedloadv16i16 addr:$src),
            (VMOVAPSYrm addr:$src)>;
  def : Pat<(alignedloadv32i8 addr:$src),
            (VMOVAPSYrm addr:$src)>;
  def : Pat<(loadv4i64 addr:$src),
            (VMOVUPSYrm addr:$src)>;
  def : Pat<(loadv8i32 addr:$src),
            (VMOVUPSYrm addr:$src)>;
  def : Pat<(loadv16i16 addr:$src),
            (VMOVUPSYrm addr:$src)>;
  def : Pat<(loadv32i8 addr:$src),
            (VMOVUPSYrm addr:$src)>;

  def : Pat<(alignedstore (v4i64 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore (v8i32 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore (v16i16 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore (v32i8 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v4i64 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v8i32 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v16i16 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v32i8 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;

  def : Pat<(alignedloadv8f16 addr:$src),
            (VMOVAPSrm addr:$src)>;
  def : Pat<(alignedloadv8bf16 addr:$src),
            (VMOVAPSrm addr:$src)>;
  def : Pat<(loadv8f16 addr:$src),
            (VMOVUPSrm addr:$src)>;
  def : Pat<(loadv8bf16 addr:$src),
            (VMOVUPSrm addr:$src)>;
  def : Pat<(alignedstore (v8f16 VR128:$src), addr:$dst),
            (VMOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v8bf16 VR128:$src), addr:$dst),
            (VMOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v8f16 VR128:$src), addr:$dst),
            (VMOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v8bf16 VR128:$src), addr:$dst),
            (VMOVUPSmr addr:$dst, VR128:$src)>;

  def : Pat<(alignedloadv16f16 addr:$src),
            (VMOVAPSYrm addr:$src)>;
  def : Pat<(alignedloadv16bf16 addr:$src),
            (VMOVAPSYrm addr:$src)>;
  def : Pat<(loadv16f16 addr:$src),
            (VMOVUPSYrm addr:$src)>;
  def : Pat<(loadv16bf16 addr:$src),
            (VMOVUPSYrm addr:$src)>;
  def : Pat<(alignedstore (v16f16 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore (v16bf16 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v16f16 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v16bf16 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
}

// Use movaps / movups for SSE integer load / store (one byte shorter).
// The instructions selected below are then converted to MOVDQA/MOVDQU
// during the SSE domain pass.
let Predicates = [UseSSE1] in {
  def : Pat<(alignedloadv2i64 addr:$src),
            (MOVAPSrm addr:$src)>;
  def : Pat<(alignedloadv4i32 addr:$src),
            (MOVAPSrm addr:$src)>;
  def : Pat<(alignedloadv8i16 addr:$src),
            (MOVAPSrm addr:$src)>;
  def : Pat<(alignedloadv16i8 addr:$src),
            (MOVAPSrm addr:$src)>;
  def : Pat<(loadv2i64 addr:$src),
            (MOVUPSrm addr:$src)>;
  def : Pat<(loadv4i32 addr:$src),
            (MOVUPSrm addr:$src)>;
  def : Pat<(loadv8i16 addr:$src),
            (MOVUPSrm addr:$src)>;
  def : Pat<(loadv16i8 addr:$src),
            (MOVUPSrm addr:$src)>;

  def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v2i64 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v4i32 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v8i16 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v16i8 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
}

let Predicates = [UseSSE2] in {
  def : Pat<(alignedloadv8f16 addr:$src),
            (MOVAPSrm addr:$src)>;
  def : Pat<(loadv8f16 addr:$src),
            (MOVUPSrm addr:$src)>;
  def : Pat<(alignedstore (v8f16 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v8f16 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
}

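// The one-byte saving mentioned above comes from the encoding: the movaps
// load is 0F 28, while movdqa needs an operand-size prefix (66 0F 6F), so
// the FP form is one byte shorter for the same data movement.
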
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Low packed FP Instructions
//===----------------------------------------------------------------------===//

multiclass sse12_mov_hilo_packed_base<bits<8>opc, SDPatternOperator pdnode,
                                      string base_opc, string asm_opr> {
  // No pattern as they need to be special cased between high and low.
  let hasSideEffects = 0, mayLoad = 1 in
  def PSrm : PI<opc, MRMSrcMem,
                (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
                !strconcat(base_opc, "s", asm_opr),
                [], SSEPackedSingle>, PS,
                Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;

  def PDrm : PI<opc, MRMSrcMem,
                (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
                !strconcat(base_opc, "d", asm_opr),
                [(set VR128:$dst, (v2f64 (pdnode VR128:$src1,
                                  (scalar_to_vector (loadf64 addr:$src2)))))],
                SSEPackedDouble>, PD,
                Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;
}

multiclass sse12_mov_hilo_packed<bits<8>opc, SDPatternOperator pdnode,
                                 string base_opc> {
  let Predicates = [UseAVX] in
  defm V#NAME : sse12_mov_hilo_packed_base<opc, pdnode, base_opc,
                    "\t{$src2, $src1, $dst|$dst, $src1, $src2}">,
                    VEX_4V, VEX_WIG;

  let Constraints = "$src1 = $dst" in
  defm NAME : sse12_mov_hilo_packed_base<opc, pdnode, base_opc,
                                         "\t{$src2, $dst|$dst, $src2}">;
}

defm MOVL : sse12_mov_hilo_packed<0x12, X86Movsd, "movlp">;

let SchedRW = [WriteFStore] in {
let Predicates = [UseAVX] in {
let mayStore = 1, hasSideEffects = 0 in
def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                     "movlps\t{$src, $dst|$dst, $src}",
                     []>,
                     VEX, VEX_WIG;
def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                     "movlpd\t{$src, $dst|$dst, $src}",
                     [(store (f64 (extractelt (v2f64 VR128:$src),
                                   (iPTR 0))), addr:$dst)]>,
                     VEX, VEX_WIG;
}// UseAVX
let mayStore = 1, hasSideEffects = 0 in
def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movlps\t{$src, $dst|$dst, $src}",
                   []>;
def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movlpd\t{$src, $dst|$dst, $src}",
                   [(store (f64 (extractelt (v2f64 VR128:$src),
                                 (iPTR 0))), addr:$dst)]>;
} // SchedRW

let Predicates = [UseSSE1] in {
  // This pattern helps select MOVLPS on SSE1 only targets. With SSE2 we'll
  // end up with a movsd or blend instead of shufp.
  // No need for an aligned load, since we're only loading 64 bits.
  def : Pat<(X86Shufp (v4f32 (simple_load addr:$src2)), VR128:$src1,
                      (i8 -28)),
            (MOVLPSrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86Shufp (v4f32 (X86vzload64 addr:$src2)), VR128:$src1, (i8 -28)),
            (MOVLPSrm VR128:$src1, addr:$src2)>;

  def : Pat<(v4f32 (X86vzload64 addr:$src)),
            (MOVLPSrm (v4f32 (V_SET0)), addr:$src)>;
  def : Pat<(X86vextractstore64 (v4f32 VR128:$src), addr:$dst),
            (MOVLPSmr addr:$dst, VR128:$src)>;
}

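// Decoding the shuffle immediate in the MOVLPS patterns above: (i8 -28) is
// 0xE4, i.e. selectors 3,2,1,0, taking elements 0-1 from the first operand
// (the load) and elements 2-3 from $src1. Replacing only the low 64 bits
// with memory is exactly the effect of MOVLPS.
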
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Hi packed FP Instructions
//===----------------------------------------------------------------------===//

defm MOVH : sse12_mov_hilo_packed<0x16, X86Unpckl, "movhp">;

let SchedRW = [WriteFStore] in {
// v2f64 extract element 1 is always custom lowered to unpack high to low
// and extract element 0 so the non-store version isn't too horrible.
let Predicates = [UseAVX] in {
let mayStore = 1, hasSideEffects = 0 in
def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                     "movhps\t{$src, $dst|$dst, $src}",
                     []>, VEX, VEX_WIG;
def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                     "movhpd\t{$src, $dst|$dst, $src}",
                     [(store (f64 (extractelt
                                   (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
                                   (iPTR 0))), addr:$dst)]>, VEX, VEX_WIG;
} // UseAVX
let mayStore = 1, hasSideEffects = 0 in
def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movhps\t{$src, $dst|$dst, $src}",
                   []>;
def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movhpd\t{$src, $dst|$dst, $src}",
                   [(store (f64 (extractelt
                                 (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
                                 (iPTR 0))), addr:$dst)]>;
} // SchedRW

let Predicates = [UseAVX] in {
  // MOVHPD patterns
  def : Pat<(v2f64 (X86Unpckl VR128:$src1, (X86vzload64 addr:$src2))),
            (VMOVHPDrm VR128:$src1, addr:$src2)>;

  def : Pat<(store (f64 (extractelt
                         (v2f64 (X86VPermilpi VR128:$src, (i8 1))),
                         (iPTR 0))), addr:$dst),
            (VMOVHPDmr addr:$dst, VR128:$src)>;

  // MOVLPD patterns
  def : Pat<(v2f64 (X86Movsd VR128:$src1, (X86vzload64 addr:$src2))),
            (VMOVLPDrm VR128:$src1, addr:$src2)>;
}

let Predicates = [UseSSE1] in {
  // This pattern helps select MOVHPS on SSE1 only targets. With SSE2 we'll
  // end up with a movsd or blend instead of shufp.
  // No need for an aligned load, since we're only loading 64 bits.
  def : Pat<(X86Movlhps VR128:$src1, (v4f32 (simple_load addr:$src2))),
            (MOVHPSrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86Movlhps VR128:$src1, (v4f32 (X86vzload64 addr:$src2))),
            (MOVHPSrm VR128:$src1, addr:$src2)>;

  def : Pat<(X86vextractstore64 (v4f32 (X86Movhlps VR128:$src, VR128:$src)),
                                addr:$dst),
            (MOVHPSmr addr:$dst, VR128:$src)>;
}

let Predicates = [UseSSE2] in {
  // MOVHPD patterns
  def : Pat<(v2f64 (X86Unpckl VR128:$src1, (X86vzload64 addr:$src2))),
            (MOVHPDrm VR128:$src1, addr:$src2)>;

  def : Pat<(store (f64 (extractelt
                         (v2f64 (X86Shufp VR128:$src, VR128:$src, (i8 1))),
                         (iPTR 0))), addr:$dst),
            (MOVHPDmr addr:$dst, VR128:$src)>;

  // MOVLPD patterns
  def : Pat<(v2f64 (X86Movsd VR128:$src1, (X86vzload64 addr:$src2))),
            (MOVLPDrm VR128:$src1, addr:$src2)>;
}

let Predicates = [UseSSE2, NoSSE41_Or_OptForSize] in {
  // Use MOVLPD to load into the low bits from a full vector unless we can use
  // BLENDPD.
  def : Pat<(X86Movsd VR128:$src1, (v2f64 (simple_load addr:$src2))),
            (MOVLPDrm VR128:$src1, addr:$src2)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Low to High and High to Low packed FP Instructions
//===----------------------------------------------------------------------===//

let Predicates = [UseAVX] in {
  def VMOVLHPSrr : VPSI<0x16, MRMSrcReg, (outs VR128:$dst),
                        (ins VR128:$src1, VR128:$src2),
                        "movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                        [(set VR128:$dst,
                          (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))]>,
                        VEX_4V, Sched<[SchedWriteFShuffle.XMM]>, VEX_WIG;
  let isCommutable = 1 in
  def VMOVHLPSrr : VPSI<0x12, MRMSrcReg, (outs VR128:$dst),
                        (ins VR128:$src1, VR128:$src2),
                        "movhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                        [(set VR128:$dst,
                          (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))]>,
                        VEX_4V, Sched<[SchedWriteFShuffle.XMM]>, VEX_WIG,
                        NotMemoryFoldable;
}
let Constraints = "$src1 = $dst" in {
  def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst),
                      (ins VR128:$src1, VR128:$src2),
                      "movlhps\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))]>,
                      Sched<[SchedWriteFShuffle.XMM]>;
  let isCommutable = 1 in
  def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst),
                      (ins VR128:$src1, VR128:$src2),
                      "movhlps\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))]>,
                      Sched<[SchedWriteFShuffle.XMM]>, NotMemoryFoldable;
}

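// As a reminder of the semantics: movlhps copies the low 64 bits of $src2
// into the high 64 bits of the destination, while movhlps copies the high
// 64 bits of $src2 into the low 64 bits; the other half of $src1 is
// preserved in both cases.
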
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Conversion Instructions
//===----------------------------------------------------------------------===//

multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                       SDPatternOperator OpNode, X86MemOperand x86memop, PatFrag ld_frag,
                       string asm, string mem, X86FoldableSchedWrite sched,
                       Domain d,
                       SchedRead Int2Fpu = ReadDefault> {
  let ExeDomain = d in {
    def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
                !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
                [(set DstRC:$dst, (OpNode SrcRC:$src))]>,
                Sched<[sched, Int2Fpu]>;
    def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src),
                mem#"\t{$src, $dst|$dst, $src}",
                [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))]>,
                Sched<[sched.Folded]>;
  }
}

multiclass sse12_cvt_p<bits<8> opc, RegisterClass RC, X86MemOperand x86memop,
                       ValueType DstTy, ValueType SrcTy, PatFrag ld_frag,
                       string asm, Domain d, X86FoldableSchedWrite sched> {
let hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1 in {
  def rr : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), asm,
             [(set RC:$dst, (DstTy (any_sint_to_fp (SrcTy RC:$src))))], d>,
             Sched<[sched]>;
  let mayLoad = 1 in
  def rm : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), asm,
             [(set RC:$dst, (DstTy (any_sint_to_fp
                                    (SrcTy (ld_frag addr:$src)))))], d>,
             Sched<[sched.Folded]>;
}
}

multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                          X86MemOperand x86memop, string asm, string mem,
                          X86FoldableSchedWrite sched, Domain d> {
let hasSideEffects = 0, Predicates = [UseAVX], ExeDomain = d in {
  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src),
              !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
              Sched<[sched, ReadDefault, ReadInt2Fpu]>;
  let mayLoad = 1 in
  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
              (ins DstRC:$src1, x86memop:$src),
              asm#"{"#mem#"}\t{$src, $src1, $dst|$dst, $src1, $src}", []>,
              Sched<[sched.Folded, sched.ReadAfterFold]>;
} // hasSideEffects = 0
}

let isCodeGenOnly = 1, Predicates = [UseAVX], Uses = [MXCSR], mayRaiseFPException = 1 in {
defm VCVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, any_fp_to_sint, f32mem, loadf32,
                              "cvttss2si", "cvttss2si",
                              WriteCvtSS2I, SSEPackedSingle>,
                              XS, VEX, VEX_LIG;
defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, any_fp_to_sint, f32mem, loadf32,
                                "cvttss2si", "cvttss2si",
                                WriteCvtSS2I, SSEPackedSingle>,
                                XS, VEX, VEX_W, VEX_LIG;
defm VCVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, any_fp_to_sint, f64mem, loadf64,
                              "cvttsd2si", "cvttsd2si",
                              WriteCvtSD2I, SSEPackedDouble>,
                              XD, VEX, VEX_LIG;
defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, any_fp_to_sint, f64mem, loadf64,
                                "cvttsd2si", "cvttsd2si",
                                WriteCvtSD2I, SSEPackedDouble>,
                                XD, VEX, VEX_W, VEX_LIG;

defm VCVTSS2SI : sse12_cvt_s<0x2D, FR32, GR32, lrint, f32mem, loadf32,
                             "cvtss2si", "cvtss2si",
                             WriteCvtSS2I, SSEPackedSingle>,
                             XS, VEX, VEX_LIG;
defm VCVTSS2SI64 : sse12_cvt_s<0x2D, FR32, GR64, llrint, f32mem, loadf32,
                               "cvtss2si", "cvtss2si",
                               WriteCvtSS2I, SSEPackedSingle>,
                               XS, VEX, VEX_W, VEX_LIG;
defm VCVTSD2SI : sse12_cvt_s<0x2D, FR64, GR32, lrint, f64mem, loadf64,
                             "cvtsd2si", "cvtsd2si",
                             WriteCvtSD2I, SSEPackedDouble>,
                             XD, VEX, VEX_LIG;
defm VCVTSD2SI64 : sse12_cvt_s<0x2D, FR64, GR64, llrint, f64mem, loadf64,
                               "cvtsd2si", "cvtsd2si",
                               WriteCvtSD2I, SSEPackedDouble>,
                               XD, VEX, VEX_W, VEX_LIG;
}

// The assembler can recognize rr 64-bit instructions by seeing a rxx
// register, but the same isn't true when only using memory operands, so we
// provide separate assembly "l" and "q" forms to address this explicitly
// where appropriate.
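// For example, in AT&T syntax "cvtsi2ssq (%rax), %xmm0" converts a 64-bit
// integer from memory, while the "l" form converts a 32-bit one; with a
// register source ("cvtsi2ss %rax, %xmm0") the register width alone already
// disambiguates.
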
let isCodeGenOnly = 1 in {
defm VCVTSI2SS : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss", "l",
                                WriteCvtI2SS, SSEPackedSingle>, XS, VEX_4V,
                                VEX_LIG, SIMD_EXC;
defm VCVTSI642SS : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss", "q",
                                  WriteCvtI2SS, SSEPackedSingle>, XS, VEX_4V,
                                  VEX_W, VEX_LIG, SIMD_EXC;
defm VCVTSI2SD : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd", "l",
                                WriteCvtI2SD, SSEPackedDouble>, XD, VEX_4V,
                                VEX_LIG;
defm VCVTSI642SD : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd", "q",
                                  WriteCvtI2SD, SSEPackedDouble>, XD, VEX_4V,
                                  VEX_W, VEX_LIG, SIMD_EXC;
} // isCodeGenOnly = 1

let Predicates = [UseAVX] in {
  def : Pat<(f32 (any_sint_to_fp (loadi32 addr:$src))),
            (VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f32 (any_sint_to_fp (loadi64 addr:$src))),
            (VCVTSI642SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f64 (any_sint_to_fp (loadi32 addr:$src))),
            (VCVTSI2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f64 (any_sint_to_fp (loadi64 addr:$src))),
            (VCVTSI642SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;

  def : Pat<(f32 (any_sint_to_fp GR32:$src)),
            (VCVTSI2SSrr (f32 (IMPLICIT_DEF)), GR32:$src)>;
  def : Pat<(f32 (any_sint_to_fp GR64:$src)),
            (VCVTSI642SSrr (f32 (IMPLICIT_DEF)), GR64:$src)>;
  def : Pat<(f64 (any_sint_to_fp GR32:$src)),
            (VCVTSI2SDrr (f64 (IMPLICIT_DEF)), GR32:$src)>;
  def : Pat<(f64 (any_sint_to_fp GR64:$src)),
            (VCVTSI642SDrr (f64 (IMPLICIT_DEF)), GR64:$src)>;

  def : Pat<(i64 (lrint FR32:$src)), (VCVTSS2SI64rr FR32:$src)>;
  def : Pat<(i64 (lrint (loadf32 addr:$src))), (VCVTSS2SI64rm addr:$src)>;

  def : Pat<(i64 (lrint FR64:$src)), (VCVTSD2SI64rr FR64:$src)>;
  def : Pat<(i64 (lrint (loadf64 addr:$src))), (VCVTSD2SI64rm addr:$src)>;
}

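// The (IMPLICIT_DEF) first operand in the patterns above fills the tied
// $src1 of the AVX three-operand form: vcvtsi2ss/vcvtsi2sd write only the
// low element and copy the remaining lanes from $src1, so when just the
// scalar result is needed, an undefined pass-through value suffices.
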
let isCodeGenOnly = 1 in {
defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, any_fp_to_sint, f32mem, loadf32,
                             "cvttss2si", "cvttss2si",
                             WriteCvtSS2I, SSEPackedSingle>, XS, SIMD_EXC;
defm CVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, any_fp_to_sint, f32mem, loadf32,
                               "cvttss2si", "cvttss2si",
                               WriteCvtSS2I, SSEPackedSingle>, XS, REX_W, SIMD_EXC;
defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, any_fp_to_sint, f64mem, loadf64,
                             "cvttsd2si", "cvttsd2si",
                             WriteCvtSD2I, SSEPackedDouble>, XD, SIMD_EXC;
defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, any_fp_to_sint, f64mem, loadf64,
                               "cvttsd2si", "cvttsd2si",
                               WriteCvtSD2I, SSEPackedDouble>, XD, REX_W, SIMD_EXC;

defm CVTSS2SI : sse12_cvt_s<0x2D, FR32, GR32, lrint, f32mem, loadf32,
                            "cvtss2si", "cvtss2si",
                            WriteCvtSS2I, SSEPackedSingle>, XS, SIMD_EXC;
defm CVTSS2SI64 : sse12_cvt_s<0x2D, FR32, GR64, llrint, f32mem, loadf32,
                              "cvtss2si", "cvtss2si",
                              WriteCvtSS2I, SSEPackedSingle>, XS, REX_W, SIMD_EXC;
defm CVTSD2SI : sse12_cvt_s<0x2D, FR64, GR32, lrint, f64mem, loadf64,
                            "cvtsd2si", "cvtsd2si",
                            WriteCvtSD2I, SSEPackedDouble>, XD, SIMD_EXC;
defm CVTSD2SI64 : sse12_cvt_s<0x2D, FR64, GR64, llrint, f64mem, loadf64,
                              "cvtsd2si", "cvtsd2si",
                              WriteCvtSD2I, SSEPackedDouble>, XD, REX_W, SIMD_EXC;

defm CVTSI2SS : sse12_cvt_s<0x2A, GR32, FR32, any_sint_to_fp, i32mem, loadi32,
                            "cvtsi2ss", "cvtsi2ss{l}",
                            WriteCvtI2SS, SSEPackedSingle, ReadInt2Fpu>, XS, SIMD_EXC;
defm CVTSI642SS : sse12_cvt_s<0x2A, GR64, FR32, any_sint_to_fp, i64mem, loadi64,
                              "cvtsi2ss", "cvtsi2ss{q}",
                              WriteCvtI2SS, SSEPackedSingle, ReadInt2Fpu>, XS, REX_W, SIMD_EXC;
defm CVTSI2SD : sse12_cvt_s<0x2A, GR32, FR64, any_sint_to_fp, i32mem, loadi32,
                            "cvtsi2sd", "cvtsi2sd{l}",
                            WriteCvtI2SD, SSEPackedDouble, ReadInt2Fpu>, XD;
defm CVTSI642SD : sse12_cvt_s<0x2A, GR64, FR64, any_sint_to_fp, i64mem, loadi64,
                              "cvtsi2sd", "cvtsi2sd{q}",
                              WriteCvtI2SD, SSEPackedDouble, ReadInt2Fpu>, XD, REX_W, SIMD_EXC;
} // isCodeGenOnly = 1

let Predicates = [UseSSE1] in {
  def : Pat<(i64 (lrint FR32:$src)), (CVTSS2SI64rr FR32:$src)>;
  def : Pat<(i64 (lrint (loadf32 addr:$src))), (CVTSS2SI64rm addr:$src)>;
}

let Predicates = [UseSSE2] in {
  def : Pat<(i64 (lrint FR64:$src)), (CVTSD2SI64rr FR64:$src)>;
  def : Pat<(i64 (lrint (loadf64 addr:$src))), (CVTSD2SI64rm addr:$src)>;
}

// Conversion Instructions Intrinsics - Match intrinsics which expect MM
// and/or XMM operand(s).

multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                          ValueType DstVT, ValueType SrcVT, SDNode OpNode,
                          Operand memop, PatFrags mem_frags, string asm,
                          X86FoldableSchedWrite sched, Domain d> {
let ExeDomain = d in {
  def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
                  !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
                  [(set DstRC:$dst, (DstVT (OpNode (SrcVT SrcRC:$src))))]>,
                  Sched<[sched]>;
  def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src),
                  !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
                  [(set DstRC:$dst, (DstVT (OpNode (SrcVT (mem_frags addr:$src)))))]>,
                  Sched<[sched.Folded]>;
}
}

multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC,
                                RegisterClass DstRC, X86MemOperand x86memop,
                                string asm, string mem, X86FoldableSchedWrite sched,
                                Domain d, bit Is2Addr = 1> {
let hasSideEffects = 0, ExeDomain = d in {
  def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src2),
                  !if(Is2Addr,
                      !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
                      !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                  []>, Sched<[sched, ReadDefault, ReadInt2Fpu]>;
  let mayLoad = 1 in
  def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst),
                  (ins DstRC:$src1, x86memop:$src2),
                  !if(Is2Addr,
                      asm#"{"#mem#"}\t{$src2, $dst|$dst, $src2}",
                      asm#"{"#mem#"}\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}

let Uses = [MXCSR], mayRaiseFPException = 1 in {
let Predicates = [UseAVX] in {
defm VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v2f64,
                                X86cvts2si, sdmem, sse_load_f64, "cvtsd2si",
                                WriteCvtSD2I, SSEPackedDouble>, XD, VEX, VEX_LIG;
defm VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v2f64,
                                  X86cvts2si, sdmem, sse_load_f64, "cvtsd2si",
                                  WriteCvtSD2I, SSEPackedDouble>, XD, VEX, VEX_W, VEX_LIG;
}
defm CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v2f64, X86cvts2si,
                               sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I,
                               SSEPackedDouble>, XD;
defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v2f64, X86cvts2si,
                                 sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I,
                                 SSEPackedDouble>, XD, REX_W;
}

let Predicates = [UseAVX] in {
defm VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
                     i32mem, "cvtsi2ss", "l", WriteCvtI2SS, SSEPackedSingle, 0>,
                     XS, VEX_4V, VEX_LIG, SIMD_EXC;
defm VCVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
                     i64mem, "cvtsi2ss", "q", WriteCvtI2SS, SSEPackedSingle, 0>,
                     XS, VEX_4V, VEX_LIG, VEX_W, SIMD_EXC;
defm VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
                     i32mem, "cvtsi2sd", "l", WriteCvtI2SD, SSEPackedDouble, 0>,
                     XD, VEX_4V, VEX_LIG;
defm VCVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
                     i64mem, "cvtsi2sd", "q", WriteCvtI2SD, SSEPackedDouble, 0>,
                     XD, VEX_4V, VEX_LIG, VEX_W, SIMD_EXC;
}
let Constraints = "$src1 = $dst" in {
  defm CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
                      i32mem, "cvtsi2ss", "l", WriteCvtI2SS, SSEPackedSingle>,
                      XS, SIMD_EXC;
  defm CVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
                      i64mem, "cvtsi2ss", "q", WriteCvtI2SS, SSEPackedSingle>,
                      XS, REX_W, SIMD_EXC;
  defm CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
                      i32mem, "cvtsi2sd", "l", WriteCvtI2SD, SSEPackedDouble>,
                      XD;
  defm CVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
                      i64mem, "cvtsi2sd", "q", WriteCvtI2SD, SSEPackedDouble>,
                      XD, REX_W, SIMD_EXC;
}

def : InstAlias<"vcvtsi2ss{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                (VCVTSI2SSrr_Int VR128:$dst, VR128:$src1, GR32:$src2), 0, "att">;
def : InstAlias<"vcvtsi2ss{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                (VCVTSI642SSrr_Int VR128:$dst, VR128:$src1, GR64:$src2), 0, "att">;
def : InstAlias<"vcvtsi2sd{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                (VCVTSI2SDrr_Int VR128:$dst, VR128:$src1, GR32:$src2), 0, "att">;
def : InstAlias<"vcvtsi2sd{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                (VCVTSI642SDrr_Int VR128:$dst, VR128:$src1, GR64:$src2), 0, "att">;

def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
                (VCVTSI2SSrm_Int VR128:$dst, VR128:$src1, i32mem:$src), 0, "att">;
def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
                (VCVTSI2SDrm_Int VR128:$dst, VR128:$src1, i32mem:$src), 0, "att">;

def : InstAlias<"cvtsi2ss{l}\t{$src, $dst|$dst, $src}",
                (CVTSI2SSrr_Int VR128:$dst, GR32:$src), 0, "att">;
def : InstAlias<"cvtsi2ss{q}\t{$src, $dst|$dst, $src}",
                (CVTSI642SSrr_Int VR128:$dst, GR64:$src), 0, "att">;
def : InstAlias<"cvtsi2sd{l}\t{$src, $dst|$dst, $src}",
                (CVTSI2SDrr_Int VR128:$dst, GR32:$src), 0, "att">;
def : InstAlias<"cvtsi2sd{q}\t{$src, $dst|$dst, $src}",
                (CVTSI642SDrr_Int VR128:$dst, GR64:$src), 0, "att">;

def : InstAlias<"cvtsi2ss\t{$src, $dst|$dst, $src}",
                (CVTSI2SSrm_Int VR128:$dst, i32mem:$src), 0, "att">;
def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}",
                (CVTSI2SDrm_Int VR128:$dst, i32mem:$src), 0, "att">;

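// Note the last two aliases above: for GAS compatibility, an unsuffixed
// cvtsi2ss/cvtsi2sd with a memory source is accepted and treated as the
// 32-bit ("l") form; the 64-bit form still requires an explicit "q".
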
/// SSE 1 Only

// Aliases for intrinsics
let Predicates = [UseAVX], Uses = [MXCSR], mayRaiseFPException = 1 in {
defm VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32, X86cvtts2Int,
                                 ssmem, sse_load_f32, "cvttss2si",
                                 WriteCvtSS2I, SSEPackedSingle>, XS, VEX, VEX_LIG;
defm VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v4f32,
                                   X86cvtts2Int, ssmem, sse_load_f32,
                                   "cvttss2si", WriteCvtSS2I, SSEPackedSingle>,
                                   XS, VEX, VEX_LIG, VEX_W;
defm VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64, X86cvtts2Int,
                                 sdmem, sse_load_f64, "cvttsd2si",
                                 WriteCvtSS2I, SSEPackedDouble>, XD, VEX, VEX_LIG;
defm VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64,
                                   X86cvtts2Int, sdmem, sse_load_f64,
                                   "cvttsd2si", WriteCvtSS2I, SSEPackedDouble>,
                                   XD, VEX, VEX_LIG, VEX_W;
}
let Uses = [MXCSR], mayRaiseFPException = 1 in {
defm CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32, X86cvtts2Int,
                                ssmem, sse_load_f32, "cvttss2si",
                                WriteCvtSS2I, SSEPackedSingle>, XS;
defm CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v4f32,
                                  X86cvtts2Int, ssmem, sse_load_f32,
                                  "cvttss2si", WriteCvtSS2I, SSEPackedSingle>,
                                  XS, REX_W;
defm CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64, X86cvtts2Int,
                                sdmem, sse_load_f64, "cvttsd2si",
                                WriteCvtSD2I, SSEPackedDouble>, XD;
defm CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64,
                                  X86cvtts2Int, sdmem, sse_load_f64,
                                  "cvttsd2si", WriteCvtSD2I, SSEPackedDouble>,
                                  XD, REX_W;
}

def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SIrm_Int GR32:$dst, f32mem:$src), 0, "att">;
def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SIrm_Int GR32:$dst, f64mem:$src), 0, "att">;
def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SI64rm_Int GR64:$dst, f32mem:$src), 0, "att">;
def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SI64rm_Int GR64:$dst, f64mem:$src), 0, "att">;

def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SIrm_Int GR32:$dst, f32mem:$src), 0, "att">;
def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SIrm_Int GR32:$dst, f64mem:$src), 0, "att">;
def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SI64rm_Int GR64:$dst, f32mem:$src), 0, "att">;
def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SI64rm_Int GR64:$dst, f64mem:$src), 0, "att">;

let Predicates = [UseAVX], Uses = [MXCSR], mayRaiseFPException = 1 in {
defm VCVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v4f32, X86cvts2si,
                                ssmem, sse_load_f32, "cvtss2si",
                                WriteCvtSS2I, SSEPackedSingle>, XS, VEX, VEX_LIG;
defm VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v4f32, X86cvts2si,
                                  ssmem, sse_load_f32, "cvtss2si",
                                  WriteCvtSS2I, SSEPackedSingle>, XS, VEX, VEX_W, VEX_LIG;
}
let Uses = [MXCSR], mayRaiseFPException = 1 in {
defm CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v4f32, X86cvts2si,
                               ssmem, sse_load_f32, "cvtss2si",
                               WriteCvtSS2I, SSEPackedSingle>, XS;
defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v4f32, X86cvts2si,
                                 ssmem, sse_load_f32, "cvtss2si",
                                 WriteCvtSS2I, SSEPackedSingle>, XS, REX_W;

defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, load,
                             "vcvtdq2ps\t{$src, $dst|$dst, $src}",
                             SSEPackedSingle, WriteCvtI2PS>,
                             PS, VEX, Requires<[HasAVX, NoVLX]>, VEX_WIG;
defm VCVTDQ2PSY : sse12_cvt_p<0x5B, VR256, i256mem, v8f32, v8i32, load,
                              "vcvtdq2ps\t{$src, $dst|$dst, $src}",
                              SSEPackedSingle, WriteCvtI2PSY>,
                              PS, VEX, VEX_L, Requires<[HasAVX, NoVLX]>, VEX_WIG;

defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, memop,
                            "cvtdq2ps\t{$src, $dst|$dst, $src}",
                            SSEPackedSingle, WriteCvtI2PS>,
                            PS, Requires<[UseSSE2]>;
}

// AVX aliases
def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SIrm_Int GR32:$dst, ssmem:$src), 0, "att">;
def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SIrm_Int GR32:$dst, sdmem:$src), 0, "att">;
def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SI64rm_Int GR64:$dst, ssmem:$src), 0, "att">;
def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0, "att">;

// SSE aliases
def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSS2SIrm_Int GR32:$dst, ssmem:$src), 0, "att">;
def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSD2SIrm_Int GR32:$dst, sdmem:$src), 0, "att">;
def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSS2SI64rm_Int GR64:$dst, ssmem:$src), 0, "att">;
def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0, "att">;

def CVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
                   "cvtsd2ss\t{$src, $dst|$dst, $src}",
                   [(set FR32:$dst, (any_fpround (loadf64 addr:$src)))]>,
                   XD, Requires<[UseSSE2, OptForSize]>,
                   Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>, SIMD_EXC;
}

let Uses = [MXCSR], mayRaiseFPException = 1, ExeDomain = SSEPackedSingle in {
def VCVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                       "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       [(set VR128:$dst,
                         (v4f32 (X86frounds VR128:$src1, (v2f64 VR128:$src2))))]>,
                       XD, VEX_4V, VEX_LIG, VEX_WIG, Requires<[UseAVX]>,
                       Sched<[WriteCvtSD2SS]>;
def VCVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
                       (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
                       "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       [(set VR128:$dst,
                         (v4f32 (X86frounds VR128:$src1, (sse_load_f64 addr:$src2))))]>,
                       XD, VEX_4V, VEX_LIG, VEX_WIG, Requires<[UseAVX]>,
                       Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>;
let Constraints = "$src1 = $dst" in {
def CVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                      "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86frounds VR128:$src1, (v2f64 VR128:$src2))))]>,
                      XD, Requires<[UseSSE2]>, Sched<[WriteCvtSD2SS]>;
def CVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
                      (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
                      "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86frounds VR128:$src1, (sse_load_f64 addr:$src2))))]>,
                      XD, Requires<[UseSSE2]>,
                      Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>;
}
}

// Convert scalar single to scalar double
// SSE2 instructions with XS prefix
let isCodeGenOnly = 1, hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
                    (ins FR64:$src1, FR32:$src2),
                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
                    XS, VEX_4V, VEX_LIG, VEX_WIG,
                    Sched<[WriteCvtSS2SD]>, Requires<[UseAVX]>, SIMD_EXC;
let mayLoad = 1 in
def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
                    (ins FR64:$src1, f32mem:$src2),
                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
                    XS, VEX_4V, VEX_LIG, VEX_WIG,
                    Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>,
                    Requires<[UseAVX, OptForSize]>, SIMD_EXC;
} // isCodeGenOnly = 1, hasSideEffects = 0

def : Pat<(f64 (any_fpextend FR32:$src)),
          (VCVTSS2SDrr (f64 (IMPLICIT_DEF)), FR32:$src)>, Requires<[UseAVX]>;
def : Pat<(any_fpextend (loadf32 addr:$src)),
          (VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX, OptForSize]>;

let isCodeGenOnly = 1, ExeDomain = SSEPackedSingle in {
def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
                   "cvtss2sd\t{$src, $dst|$dst, $src}",
                   [(set FR64:$dst, (any_fpextend FR32:$src))]>,
                   XS, Requires<[UseSSE2]>, Sched<[WriteCvtSS2SD]>, SIMD_EXC;
def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
                   "cvtss2sd\t{$src, $dst|$dst, $src}",
                   [(set FR64:$dst, (any_fpextend (loadf32 addr:$src)))]>,
                   XS, Requires<[UseSSE2, OptForSize]>,
                   Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>, SIMD_EXC;
} // isCodeGenOnly = 1

let hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1,
    ExeDomain = SSEPackedSingle in {
def VCVTSS2SDrr_Int: I<0x5A, MRMSrcReg,
                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                       "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       []>, XS, VEX_4V, VEX_LIG, VEX_WIG,
                       Requires<[HasAVX]>, Sched<[WriteCvtSS2SD]>;
let mayLoad = 1 in
def VCVTSS2SDrm_Int: I<0x5A, MRMSrcMem,
                       (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
                       "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       []>, XS, VEX_4V, VEX_LIG, VEX_WIG, Requires<[HasAVX]>,
                       Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>;
let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix
def CVTSS2SDrr_Int: I<0x5A, MRMSrcReg,
                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                      "cvtss2sd\t{$src2, $dst|$dst, $src2}",
                      []>, XS, Requires<[UseSSE2]>,
                      Sched<[WriteCvtSS2SD]>;
let mayLoad = 1 in
def CVTSS2SDrm_Int: I<0x5A, MRMSrcMem,
                      (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
                      "cvtss2sd\t{$src2, $dst|$dst, $src2}",
                      []>, XS, Requires<[UseSSE2]>,
                      Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>;
}
} // hasSideEffects = 0

// Patterns used for matching (v)cvtsi2ss, (v)cvtsi2sd, (v)cvtsd2ss and
// (v)cvtss2sd intrinsic sequences from clang which produce unnecessary
// vmovs{s,d} instructions
let Predicates = [UseAVX] in {
def : Pat<(v4f32 (X86Movss
                  (v4f32 VR128:$dst),
                  (v4f32 (scalar_to_vector
                          (f32 (any_fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))),
          (VCVTSD2SSrr_Int VR128:$dst, VR128:$src)>;

def : Pat<(v2f64 (X86Movsd
                  (v2f64 VR128:$dst),
                  (v2f64 (scalar_to_vector
                          (f64 (any_fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))),
          (VCVTSS2SDrr_Int VR128:$dst, VR128:$src)>;

def : Pat<(v4f32 (X86Movss
                  (v4f32 VR128:$dst),
                  (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR64:$src)))))),
          (VCVTSI642SSrr_Int VR128:$dst, GR64:$src)>;

def : Pat<(v4f32 (X86Movss
                  (v4f32 VR128:$dst),
                  (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi64 addr:$src))))))),
          (VCVTSI642SSrm_Int VR128:$dst, addr:$src)>;

def : Pat<(v4f32 (X86Movss
                  (v4f32 VR128:$dst),
                  (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR32:$src)))))),
          (VCVTSI2SSrr_Int VR128:$dst, GR32:$src)>;

def : Pat<(v4f32 (X86Movss
                  (v4f32 VR128:$dst),
                  (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi32 addr:$src))))))),
          (VCVTSI2SSrm_Int VR128:$dst, addr:$src)>;

def : Pat<(v2f64 (X86Movsd
                  (v2f64 VR128:$dst),
                  (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR64:$src)))))),
          (VCVTSI642SDrr_Int VR128:$dst, GR64:$src)>;

def : Pat<(v2f64 (X86Movsd
                  (v2f64 VR128:$dst),
                  (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi64 addr:$src))))))),
          (VCVTSI642SDrm_Int VR128:$dst, addr:$src)>;

def : Pat<(v2f64 (X86Movsd
                  (v2f64 VR128:$dst),
                  (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR32:$src)))))),
          (VCVTSI2SDrr_Int VR128:$dst, GR32:$src)>;

def : Pat<(v2f64 (X86Movsd
                  (v2f64 VR128:$dst),
                  (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi32 addr:$src))))))),
          (VCVTSI2SDrm_Int VR128:$dst, addr:$src)>;
} // Predicates = [UseAVX]

let Predicates = [UseSSE2] in {
def : Pat<(v4f32 (X86Movss
                  (v4f32 VR128:$dst),
                  (v4f32 (scalar_to_vector
                          (f32 (any_fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))),
          (CVTSD2SSrr_Int VR128:$dst, VR128:$src)>;

def : Pat<(v2f64 (X86Movsd
                  (v2f64 VR128:$dst),
                  (v2f64 (scalar_to_vector
                          (f64 (any_fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))),
          (CVTSS2SDrr_Int VR128:$dst, VR128:$src)>;

def : Pat<(v2f64 (X86Movsd
                  (v2f64 VR128:$dst),
                  (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR64:$src)))))),
          (CVTSI642SDrr_Int VR128:$dst, GR64:$src)>;

def : Pat<(v2f64 (X86Movsd
                  (v2f64 VR128:$dst),
                  (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi64 addr:$src))))))),
          (CVTSI642SDrm_Int VR128:$dst, addr:$src)>;

def : Pat<(v2f64 (X86Movsd
                  (v2f64 VR128:$dst),
                  (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR32:$src)))))),
          (CVTSI2SDrr_Int VR128:$dst, GR32:$src)>;

def : Pat<(v2f64 (X86Movsd
                  (v2f64 VR128:$dst),
                  (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi32 addr:$src))))))),
          (CVTSI2SDrm_Int VR128:$dst, addr:$src)>;
} // Predicates = [UseSSE2]

let Predicates = [UseSSE1] in {
def : Pat<(v4f32 (X86Movss
                  (v4f32 VR128:$dst),
                  (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR64:$src)))))),
          (CVTSI642SSrr_Int VR128:$dst, GR64:$src)>;

def : Pat<(v4f32 (X86Movss
                  (v4f32 VR128:$dst),
                  (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi64 addr:$src))))))),
          (CVTSI642SSrm_Int VR128:$dst, addr:$src)>;

def : Pat<(v4f32 (X86Movss
                  (v4f32 VR128:$dst),
                  (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR32:$src)))))),
          (CVTSI2SSrr_Int VR128:$dst, GR32:$src)>;

def : Pat<(v4f32 (X86Movss
                  (v4f32 VR128:$dst),
                  (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi32 addr:$src))))))),
          (CVTSI2SSrm_Int VR128:$dst, addr:$src)>;
} // Predicates = [UseSSE1]

let Predicates = [HasAVX, NoVLX] in {
// Convert packed single/double fp to doubleword
def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvtps2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (v4i32 (X86cvtp2Int (v4f32 VR128:$src))))]>,
                       VEX, Sched<[WriteCvtPS2I]>, VEX_WIG, SIMD_EXC;
def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       "cvtps2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v4i32 (X86cvtp2Int (loadv4f32 addr:$src))))]>,
                       VEX, Sched<[WriteCvtPS2ILd]>, VEX_WIG, SIMD_EXC;
def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                        "cvtps2dq\t{$src, $dst|$dst, $src}",
                        [(set VR256:$dst,
                          (v8i32 (X86cvtp2Int (v8f32 VR256:$src))))]>,
                        VEX, VEX_L, Sched<[WriteCvtPS2IY]>, VEX_WIG, SIMD_EXC;
def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                        "cvtps2dq\t{$src, $dst|$dst, $src}",
                        [(set VR256:$dst,
                          (v8i32 (X86cvtp2Int (loadv8f32 addr:$src))))]>,
                        VEX, VEX_L, Sched<[WriteCvtPS2IYLd]>, VEX_WIG, SIMD_EXC;
}
def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     "cvtps2dq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst, (v4i32 (X86cvtp2Int (v4f32 VR128:$src))))]>,
                     Sched<[WriteCvtPS2I]>, SIMD_EXC;
def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                     "cvtps2dq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
                       (v4i32 (X86cvtp2Int (memopv4f32 addr:$src))))]>,
                     Sched<[WriteCvtPS2ILd]>, SIMD_EXC;


// Convert Packed Double FP to Packed DW Integers
let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
// Provide other assembly rr and rm forms to address this explicitly.
def VCVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                      "vcvtpd2dq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>,
                      VEX, Sched<[WriteCvtPD2I]>, VEX_WIG;

// XMM only
def VCVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                      "vcvtpd2dq{x}\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (X86cvtp2Int (loadv2f64 addr:$src))))]>, VEX,
                      Sched<[WriteCvtPD2ILd]>, VEX_WIG;

// YMM only
def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
                       "vcvtpd2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v4i32 (X86cvtp2Int (v4f64 VR256:$src))))]>,
                       VEX, VEX_L, Sched<[WriteCvtPD2IY]>, VEX_WIG;
def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                       "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v4i32 (X86cvtp2Int (loadv4f64 addr:$src))))]>,
                       VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, VEX_WIG;
}

def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
                (VCVTPD2DQrr VR128:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}",
                (VCVTPD2DQYrr VR128:$dst, VR256:$src), 0, "att">;

def CVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                     "cvtpd2dq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
                       (v4i32 (X86cvtp2Int (memopv2f64 addr:$src))))]>,
                     Sched<[WriteCvtPD2ILd]>, SIMD_EXC;
def CVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     "cvtpd2dq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
                       (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>,
                     Sched<[WriteCvtPD2I]>, SIMD_EXC;

// Convert with truncation packed single/double fp to doubleword
// SSE2 packed instructions with XS prefix
let Uses = [MXCSR], mayRaiseFPException = 1 in {
let Predicates = [HasAVX, NoVLX] in {
def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                         "cvttps2dq\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
                           (v4i32 (X86any_cvttp2si (v4f32 VR128:$src))))]>,
                         VEX, Sched<[WriteCvtPS2I]>, VEX_WIG;
def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                         "cvttps2dq\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
                           (v4i32 (X86any_cvttp2si (loadv4f32 addr:$src))))]>,
                         VEX, Sched<[WriteCvtPS2ILd]>, VEX_WIG;
def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                          "cvttps2dq\t{$src, $dst|$dst, $src}",
                          [(set VR256:$dst,
                            (v8i32 (X86any_cvttp2si (v8f32 VR256:$src))))]>,
                          VEX, VEX_L, Sched<[WriteCvtPS2IY]>, VEX_WIG;
def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                          "cvttps2dq\t{$src, $dst|$dst, $src}",
                          [(set VR256:$dst,
                            (v8i32 (X86any_cvttp2si (loadv8f32 addr:$src))))]>,
                          VEX, VEX_L,
                          Sched<[WriteCvtPS2IYLd]>, VEX_WIG;
}

def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvttps2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v4i32 (X86any_cvttp2si (v4f32 VR128:$src))))]>,
                       Sched<[WriteCvtPS2I]>;
def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       "cvttps2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v4i32 (X86any_cvttp2si (memopv4f32 addr:$src))))]>,
                       Sched<[WriteCvtPS2ILd]>;
}

// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
// Provide other assembly rr and rm forms to address this explicitly.
let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
// XMM only
def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "cvttpd2dq\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (v4i32 (X86any_cvttp2si (v2f64 VR128:$src))))]>,
                        VEX, Sched<[WriteCvtPD2I]>, VEX_WIG;
def VCVTTPD2DQrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                        "cvttpd2dq{x}\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (v4i32 (X86any_cvttp2si (loadv2f64 addr:$src))))]>,
                        VEX, Sched<[WriteCvtPD2ILd]>, VEX_WIG;

// YMM only
def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
                         "cvttpd2dq\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
                           (v4i32 (X86any_cvttp2si (v4f64 VR256:$src))))]>,
                         VEX, VEX_L, Sched<[WriteCvtPD2IY]>, VEX_WIG;
def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                         "cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
                           (v4i32 (X86any_cvttp2si (loadv4f64 addr:$src))))]>,
                         VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, VEX_WIG;
} // Predicates = [HasAVX, NoVLX]

def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
                (VCVTTPD2DQrr VR128:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvttpd2dqy\t{$src, $dst|$dst, $src}",
                (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0, "att">;

let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v4i32 (any_fp_to_sint (v4f64 VR256:$src))),
            (VCVTTPD2DQYrr VR256:$src)>;
  def : Pat<(v4i32 (any_fp_to_sint (loadv4f64 addr:$src))),
            (VCVTTPD2DQYrm addr:$src)>;
}

def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                      "cvttpd2dq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (X86any_cvttp2si (v2f64 VR128:$src))))]>,
                      Sched<[WriteCvtPD2I]>, SIMD_EXC;
def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                      "cvttpd2dq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (X86any_cvttp2si (memopv2f64 addr:$src))))]>,
                      Sched<[WriteCvtPD2ILd]>, SIMD_EXC;

// Convert packed single to packed double
let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
  // SSE2 instructions without OpSize prefix
def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                    "vcvtps2pd\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst, (v2f64 (X86any_vfpext (v4f32 VR128:$src))))]>,
                    PS, VEX, Sched<[WriteCvtPS2PD]>, VEX_WIG;
def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
                    "vcvtps2pd\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>,
                    PS, VEX, Sched<[WriteCvtPS2PD.Folded]>, VEX_WIG;
def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
                     "vcvtps2pd\t{$src, $dst|$dst, $src}",
                     [(set VR256:$dst, (v4f64 (any_fpextend (v4f32 VR128:$src))))]>,
                     PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY]>, VEX_WIG;
def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
                     "vcvtps2pd\t{$src, $dst|$dst, $src}",
                     [(set VR256:$dst, (v4f64 (extloadv4f32 addr:$src)))]>,
                     PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY.Folded]>, VEX_WIG;
}

let Predicates = [UseSSE2], Uses = [MXCSR], mayRaiseFPException = 1 in {
def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                   "cvtps2pd\t{$src, $dst|$dst, $src}",
                   [(set VR128:$dst, (v2f64 (X86any_vfpext (v4f32 VR128:$src))))]>,
                   PS, Sched<[WriteCvtPS2PD]>;
def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
                   "cvtps2pd\t{$src, $dst|$dst, $src}",
                   [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>,
                   PS, Sched<[WriteCvtPS2PD.Folded]>;
}

// Convert Packed DW Integers to Packed Double FP
let Predicates = [HasAVX, NoVLX] in {
let hasSideEffects = 0, mayLoad = 1 in
def VCVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                       "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v2f64 (X86any_VSintToFP
                                 (bc_v4i32
                                  (v2i64 (scalar_to_vector
                                          (loadi64 addr:$src)))))))]>,
                       VEX, Sched<[WriteCvtI2PDLd]>, VEX_WIG;
def VCVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v2f64 (X86any_VSintToFP (v4i32 VR128:$src))))]>,
                       VEX, Sched<[WriteCvtI2PD]>, VEX_WIG;
def VCVTDQ2PDYrm : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
                        "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                        [(set VR256:$dst,
                          (v4f64 (any_sint_to_fp (loadv4i32 addr:$src))))]>,
                        VEX, VEX_L, Sched<[WriteCvtI2PDYLd]>,
                        VEX_WIG;
def VCVTDQ2PDYrr : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
                        "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                        [(set VR256:$dst,
                          (v4f64 (any_sint_to_fp (v4i32 VR128:$src))))]>,
                        VEX, VEX_L, Sched<[WriteCvtI2PDY]>, VEX_WIG;
}

let hasSideEffects = 0, mayLoad = 1 in
def CVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                      "cvtdq2pd\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v2f64 (X86any_VSintToFP
                                (bc_v4i32
                                 (v2i64 (scalar_to_vector
                                         (loadi64 addr:$src)))))))]>,
                      Sched<[WriteCvtI2PDLd]>;
def CVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                      "cvtdq2pd\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v2f64 (X86any_VSintToFP (v4i32 VR128:$src))))]>,
                      Sched<[WriteCvtI2PD]>;

// AVX register conversion intrinsics
let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v2f64 (X86any_VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
            (VCVTDQ2PDrm addr:$src)>;
} // Predicates = [HasAVX, NoVLX]

// SSE2 register conversion intrinsics
let Predicates = [UseSSE2] in {
  def : Pat<(v2f64 (X86any_VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
            (CVTDQ2PDrm addr:$src)>;
} // Predicates = [UseSSE2]

// Convert packed double to packed single
// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
// Provide other assembly rr and rm forms to address this explicitly.
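// For example, "vcvtpd2ps %ymm0, %xmm0" is unambiguously the 256-bit form
// because of the ymm source, but "vcvtpd2ps (%rax), %xmm0" could mean either
// width; the explicit x/y-suffixed mnemonics let assembly writers say which
// one they mean. (Illustrative AT&T-syntax example, not an exhaustive list
// of the affected mnemonics.)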
let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
// XMM only
def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvtpd2ps\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (v4f32 (X86any_vfpround (v2f64 VR128:$src))))]>,
                       VEX, Sched<[WriteCvtPD2PS]>, VEX_WIG;
def VCVTPD2PSrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       "cvtpd2ps{x}\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (v4f32 (X86any_vfpround (loadv2f64 addr:$src))))]>,
                       VEX, Sched<[WriteCvtPD2PS.Folded]>, VEX_WIG;

def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
                        "cvtpd2ps\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst, (v4f32 (X86any_vfpround (v4f64 VR256:$src))))]>,
                        VEX, VEX_L, Sched<[WriteCvtPD2PSY]>, VEX_WIG;
def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                        "cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst, (v4f32 (X86any_vfpround (loadv4f64 addr:$src))))]>,
                        VEX, VEX_L, Sched<[WriteCvtPD2PSY.Folded]>, VEX_WIG;
} // Predicates = [HasAVX, NoVLX]

def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
                (VCVTPD2PSrr VR128:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtpd2psy\t{$src, $dst|$dst, $src}",
                (VCVTPD2PSYrr VR128:$dst, VR256:$src), 0, "att">;

def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     "cvtpd2ps\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst, (v4f32 (X86any_vfpround (v2f64 VR128:$src))))]>,
                     Sched<[WriteCvtPD2PS]>, SIMD_EXC;
def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                     "cvtpd2ps\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst, (v4f32 (X86any_vfpround (memopv2f64 addr:$src))))]>,
                     Sched<[WriteCvtPD2PS.Folded]>, SIMD_EXC;

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Compare Instructions
//===----------------------------------------------------------------------===//

// sse12_cmp_scalar - sse 1 & 2 compare scalar instructions
multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
                            Operand memop, SDNode OpNode, ValueType VT,
                            PatFrag ld_frag, string asm,
                            X86FoldableSchedWrite sched,
                            PatFrags mem_frags> {
  def rr_Int : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst),
                    (ins VR128:$src1, VR128:$src2, u8imm:$cc), asm,
                    [(set VR128:$dst, (OpNode (VT VR128:$src1),
                                              VR128:$src2, timm:$cc))]>,
               Sched<[sched]>, SIMD_EXC;
  let mayLoad = 1 in
  def rm_Int : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst),
                    (ins VR128:$src1, memop:$src2, u8imm:$cc), asm,
                    [(set VR128:$dst, (OpNode (VT VR128:$src1),
                                              (mem_frags addr:$src2), timm:$cc))]>,
               Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;

  let isCodeGenOnly = 1 in {
    let isCommutable = 1 in
    def rr : SIi8<0xC2, MRMSrcReg,
                  (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm,
                  [(set RC:$dst, (OpNode RC:$src1, RC:$src2, timm:$cc))]>,
             Sched<[sched]>, SIMD_EXC;
    def rm : SIi8<0xC2, MRMSrcMem,
                  (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm,
                  [(set RC:$dst, (OpNode RC:$src1,
                                         (ld_frag addr:$src2), timm:$cc))]>,
             Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
  }
}

let ExeDomain = SSEPackedSingle in
defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, ssmem, X86cmps, v4f32, loadf32,
                               "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
                               SchedWriteFCmpSizes.PS.Scl, sse_load_f32>,
                               XS, VEX_4V, VEX_LIG, VEX_WIG;
let ExeDomain = SSEPackedDouble in
defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, sdmem, X86cmps, v2f64, loadf64,
                               "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
                               SchedWriteFCmpSizes.PD.Scl, sse_load_f64>,
                               XD, VEX_4V, VEX_LIG, VEX_WIG;

let Constraints = "$src1 = $dst" in {
  let ExeDomain = SSEPackedSingle in
  defm CMPSS : sse12_cmp_scalar<FR32, f32mem, ssmem, X86cmps, v4f32, loadf32,
                                "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}",
                                SchedWriteFCmpSizes.PS.Scl, sse_load_f32>, XS;
  let ExeDomain = SSEPackedDouble in
  defm CMPSD : sse12_cmp_scalar<FR64, f64mem, sdmem, X86cmps, v2f64, loadf64,
                                "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
                                SchedWriteFCmpSizes.PD.Scl, sse_load_f64>, XD;
}

// sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS
multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDPatternOperator OpNode,
                         ValueType vt, X86MemOperand x86memop,
                         PatFrag ld_frag, string OpcodeStr, Domain d,
                         X86FoldableSchedWrite sched = WriteFComX> {
  let ExeDomain = d in {
  def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
             [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>,
          Sched<[sched]>, SIMD_EXC;
  let mayLoad = 1 in
  def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
             [(set EFLAGS, (OpNode (vt RC:$src1),
                                   (ld_frag addr:$src2)))]>,
          Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
}
}

// sse12_ord_cmp_int - Intrinsic version of sse12_ord_cmp
multiclass sse12_ord_cmp_int<bits<8> opc, RegisterClass RC, SDNode OpNode,
                             ValueType vt, Operand memop,
                             PatFrags mem_frags, string OpcodeStr,
                             Domain d,
                             X86FoldableSchedWrite sched = WriteFComX> {
let ExeDomain = d in {
  def rr_Int: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
                 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
                 [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>,
              Sched<[sched]>, SIMD_EXC;
let mayLoad = 1 in
  def rm_Int: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, memop:$src2),
                 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
                 [(set EFLAGS, (OpNode (vt RC:$src1),
                                       (mem_frags addr:$src2)))]>,
              Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
}
}

let Defs = [EFLAGS] in {
  defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86any_fcmp, f32, f32mem, loadf32,
                                "ucomiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG;
  defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86any_fcmp, f64, f64mem, loadf64,
                                "ucomisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG;
  defm VCOMISS : sse12_ord_cmp<0x2F, FR32, X86strict_fcmps, f32, f32mem, loadf32,
                               "comiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG;
  defm VCOMISD : sse12_ord_cmp<0x2F, FR64, X86strict_fcmps, f64, f64mem, loadf64,
                               "comisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG;

  let isCodeGenOnly = 1 in {
    defm VUCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
                      sse_load_f32, "ucomiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG;
    defm VUCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
                      sse_load_f64, "ucomisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG;

    defm VCOMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
                     sse_load_f32, "comiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG;
    defm VCOMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
                     sse_load_f64, "comisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG;
  }
  defm UCOMISS : sse12_ord_cmp<0x2E, FR32, X86any_fcmp, f32, f32mem, loadf32,
                               "ucomiss", SSEPackedSingle>, PS;
  defm UCOMISD : sse12_ord_cmp<0x2E, FR64, X86any_fcmp, f64, f64mem, loadf64,
                               "ucomisd", SSEPackedDouble>, PD;
  defm COMISS : sse12_ord_cmp<0x2F, FR32, X86strict_fcmps, f32, f32mem, loadf32,
                              "comiss", SSEPackedSingle>, PS;
  defm COMISD : sse12_ord_cmp<0x2F, FR64, X86strict_fcmps, f64, f64mem, loadf64,
                              "comisd", SSEPackedDouble>, PD;

  let isCodeGenOnly = 1 in {
    defm UCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
                     sse_load_f32, "ucomiss", SSEPackedSingle>, PS;
    defm UCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
                     sse_load_f64, "ucomisd", SSEPackedDouble>, PD;

    defm COMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
                    sse_load_f32, "comiss", SSEPackedSingle>, PS;
    defm COMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
                    sse_load_f64, "comisd", SSEPackedDouble>, PD;
  }
} // Defs = [EFLAGS]

// sse12_cmp_packed - sse 1 & 2 compare packed instructions
multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
                            ValueType VT, string asm,
                            X86FoldableSchedWrite sched,
                            Domain d, PatFrag ld_frag> {
  let isCommutable = 1 in
  def rri : PIi8<0xC2, MRMSrcReg,
                 (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm,
                 [(set RC:$dst, (VT (X86any_cmpp RC:$src1, RC:$src2, timm:$cc)))], d>,
            Sched<[sched]>, SIMD_EXC;
  def rmi : PIi8<0xC2, MRMSrcMem,
                 (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm,
                 [(set RC:$dst,
                   (VT (X86any_cmpp RC:$src1, (ld_frag addr:$src2), timm:$cc)))], d>,
            Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
}

defm VCMPPS : sse12_cmp_packed<VR128, f128mem, v4f32,
                "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
                SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, loadv4f32>, PS, VEX_4V, VEX_WIG;
defm VCMPPD : sse12_cmp_packed<VR128, f128mem, v2f64,
                "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
                SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, loadv2f64>, PD, VEX_4V, VEX_WIG;
defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, v8f32,
                "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
                SchedWriteFCmpSizes.PS.YMM, SSEPackedSingle, loadv8f32>, PS, VEX_4V, VEX_L, VEX_WIG;
defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, v4f64,
                "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
                SchedWriteFCmpSizes.PD.YMM, SSEPackedDouble, loadv4f64>, PD, VEX_4V, VEX_L, VEX_WIG;
let Constraints = "$src1 = $dst" in {
  defm CMPPS : sse12_cmp_packed<VR128, f128mem, v4f32,
                 "cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}",
                 SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, memopv4f32>, PS;
  defm CMPPD : sse12_cmp_packed<VR128, f128mem, v2f64,
                 "cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
                 SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, memopv2f64>, PD;
}

def CommutableCMPCC : PatLeaf<(timm), [{
  uint64_t Imm = N->getZExtValue() & 0x7;
  return (Imm == 0x00 || Imm == 0x03 || Imm == 0x04 || Imm == 0x07);
}]>;

// Patterns to select compares with a load in the first operand: the CC values
// matched by CommutableCMPCC (EQ, UNORD, NEQ, ORD) compare symmetrically, so
// the operands can be swapped to let the load be folded.
let Predicates = [HasAVX] in {
  def : Pat<(v4f64 (X86any_cmpp (loadv4f64 addr:$src2), VR256:$src1,
                                CommutableCMPCC:$cc)),
            (VCMPPDYrmi VR256:$src1, addr:$src2, timm:$cc)>;

  def : Pat<(v8f32 (X86any_cmpp (loadv8f32 addr:$src2), VR256:$src1,
                                CommutableCMPCC:$cc)),
            (VCMPPSYrmi VR256:$src1, addr:$src2, timm:$cc)>;

  def : Pat<(v2f64 (X86any_cmpp (loadv2f64 addr:$src2), VR128:$src1,
                                CommutableCMPCC:$cc)),
            (VCMPPDrmi VR128:$src1, addr:$src2, timm:$cc)>;

  def : Pat<(v4f32 (X86any_cmpp (loadv4f32 addr:$src2), VR128:$src1,
                                CommutableCMPCC:$cc)),
            (VCMPPSrmi VR128:$src1, addr:$src2, timm:$cc)>;

  def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1,
                          CommutableCMPCC:$cc)),
            (VCMPSDrm FR64:$src1, addr:$src2, timm:$cc)>;

  def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1,
                          CommutableCMPCC:$cc)),
            (VCMPSSrm FR32:$src1, addr:$src2, timm:$cc)>;
}

let Predicates = [UseSSE2] in {
  def : Pat<(v2f64 (X86any_cmpp (memopv2f64 addr:$src2), VR128:$src1,
                                CommutableCMPCC:$cc)),
            (CMPPDrmi VR128:$src1, addr:$src2, timm:$cc)>;

  def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1,
                          CommutableCMPCC:$cc)),
            (CMPSDrm FR64:$src1, addr:$src2, timm:$cc)>;
}

let Predicates = [UseSSE1] in {
  def : Pat<(v4f32 (X86any_cmpp (memopv4f32 addr:$src2), VR128:$src1,
                                CommutableCMPCC:$cc)),
            (CMPPSrmi VR128:$src1, addr:$src2, timm:$cc)>;

  def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1,
                          CommutableCMPCC:$cc)),
            (CMPSSrm FR32:$src1, addr:$src2, timm:$cc)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Shuffle Instructions
//===----------------------------------------------------------------------===//

/// sse12_shuffle - sse 1 & 2 fp shuffle instructions
multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop,
                         ValueType vt, string asm, PatFrag mem_frag,
                         X86FoldableSchedWrite sched, Domain d,
                         bit IsCommutable = 0> {
  def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst),
                 (ins RC:$src1, x86memop:$src2, u8imm:$src3), asm,
                 [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2),
                                              (i8 timm:$src3))))], d>,
            Sched<[sched.Folded, sched.ReadAfterFold]>;
  let isCommutable = IsCommutable in
  def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst),
                 (ins RC:$src1, RC:$src2, u8imm:$src3), asm,
                 [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2,
                                              (i8 timm:$src3))))], d>,
            Sched<[sched]>;
}

let Predicates = [HasAVX, NoVLX] in {
  defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
                   "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
                   loadv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>,
                   PS, VEX_4V, VEX_WIG;
  defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32,
                    "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
                    loadv8f32, SchedWriteFShuffle.YMM, SSEPackedSingle>,
                    PS, VEX_4V, VEX_L, VEX_WIG;
  defm VSHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
                   "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
                   loadv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble>,
                   PD, VEX_4V, VEX_WIG;
  defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64,
                    "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
                    loadv4f64, SchedWriteFShuffle.YMM, SSEPackedDouble>,
                    PD, VEX_4V, VEX_L, VEX_WIG;
}
let Constraints = "$src1 = $dst" in {
  defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
                  "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                  memopv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
  defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
                  "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                  memopv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Unpack FP Instructions
//===----------------------------------------------------------------------===//

/// sse12_unpack_interleave - sse 1 & 2 fp unpack and interleave
multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt,
                                   PatFrag mem_frag, RegisterClass RC,
                                   X86MemOperand x86memop, string asm,
                                   X86FoldableSchedWrite sched, Domain d,
                                   bit IsCommutable = 0> {
  let isCommutable = IsCommutable in
  def rr : PI<opc, MRMSrcReg,
              (outs RC:$dst), (ins RC:$src1, RC:$src2),
              asm, [(set RC:$dst,
                         (vt (OpNode RC:$src1, RC:$src2)))], d>,
           Sched<[sched]>;
  def rm : PI<opc, MRMSrcMem,
              (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
              asm, [(set RC:$dst,
                         (vt (OpNode RC:$src1,
                                     (mem_frag addr:$src2))))], d>,
           Sched<[sched.Folded, sched.ReadAfterFold]>;
}

let Predicates = [HasAVX, NoVLX] in {
defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, load,
      VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
      SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, load,
      VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
      SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD, VEX_4V, VEX_WIG;
defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, load,
      VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
      SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, load,
      VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
      SchedWriteFShuffle.XMM, SSEPackedDouble>, PD, VEX_4V, VEX_WIG;

defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, load,
      VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
      SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, load,
      VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
      SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, load,
      VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
      SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, load,
      VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
      SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
}// Predicates = [HasAVX, NoVLX]

let Constraints = "$src1 = $dst" in {
  defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memop,
        VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}",
        SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
  defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memop,
        VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}",
        SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD;
  defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memop,
        VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}",
        SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
  defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memop,
        VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}",
        SchedWriteFShuffle.XMM, SSEPackedDouble>, PD;
} // Constraints = "$src1 = $dst"

let Predicates = [HasAVX1Only] in {
  def : Pat<(v8i32 (X86Unpckl VR256:$src1, (loadv8i32 addr:$src2))),
            (VUNPCKLPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)),
            (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (X86Unpckh VR256:$src1, (loadv8i32 addr:$src2))),
            (VUNPCKHPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)),
            (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(v4i64 (X86Unpckl VR256:$src1, (loadv4i64 addr:$src2))),
            (VUNPCKLPDYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v4i64 (X86Unpckl VR256:$src1, VR256:$src2)),
            (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v4i64 (X86Unpckh VR256:$src1, (loadv4i64 addr:$src2))),
            (VUNPCKHPDYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v4i64 (X86Unpckh VR256:$src1, VR256:$src2)),
            (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>;
}

let Predicates = [UseSSE2] in {
  // Use MOVHPD if the load isn't aligned enough for UNPCKLPD.
  def : Pat<(v2f64 (X86Unpckl VR128:$src1,
                              (v2f64 (simple_load addr:$src2)))),
            (MOVHPDrm VR128:$src1, addr:$src2)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Extract Floating-Point Sign mask
//===----------------------------------------------------------------------===//

/// sse12_extr_sign_mask - sse 1 & 2 fp extract sign mask
multiclass sse12_extr_sign_mask<RegisterClass RC, ValueType vt,
                                string asm, Domain d> {
  def rr : PI<0x50, MRMSrcReg, (outs GR32orGR64:$dst), (ins RC:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
              [(set GR32orGR64:$dst, (X86movmsk (vt RC:$src)))], d>,
           Sched<[WriteFMOVMSK]>;
}

let Predicates = [HasAVX] in {
  defm VMOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
                                        SSEPackedSingle>, PS, VEX, VEX_WIG;
  defm VMOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd",
                                        SSEPackedDouble>, PD, VEX, VEX_WIG;
  defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, v8f32, "movmskps",
                                         SSEPackedSingle>, PS, VEX, VEX_L, VEX_WIG;
  defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, v4f64, "movmskpd",
                                         SSEPackedDouble>, PD, VEX, VEX_L, VEX_WIG;

  // Also support integer VTs to avoid an int->fp bitcast in the DAG.
  def : Pat<(X86movmsk (v4i32 VR128:$src)),
            (VMOVMSKPSrr VR128:$src)>;
  def : Pat<(X86movmsk (v2i64 VR128:$src)),
            (VMOVMSKPDrr VR128:$src)>;
  def : Pat<(X86movmsk (v8i32 VR256:$src)),
            (VMOVMSKPSYrr VR256:$src)>;
  def : Pat<(X86movmsk (v4i64 VR256:$src)),
            (VMOVMSKPDYrr VR256:$src)>;
}

defm MOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
                                     SSEPackedSingle>, PS;
defm MOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd",
                                     SSEPackedDouble>, PD;

let Predicates = [UseSSE2] in {
  // Also support integer VTs to avoid an int->fp bitcast in the DAG.
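  // e.g. a (X86movmsk (v2i64 ...)) node selects MOVMSKPDrr directly below
  // rather than bitcasting the operand to v2f64 first.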
  def : Pat<(X86movmsk (v4i32 VR128:$src)),
            (MOVMSKPSrr VR128:$src)>;
  def : Pat<(X86movmsk (v2i64 VR128:$src)),
            (MOVMSKPDrr VR128:$src)>;
}

//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Logical Instructions
//===---------------------------------------------------------------------===//

let ExeDomain = SSEPackedInt in { // SSE integer instructions

/// PDI_binop_rm - Simple SSE2 binary operator.
multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                        ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                        X86MemOperand x86memop, X86FoldableSchedWrite sched,
                        bit IsCommutable, bit Is2Addr> {
  let isCommutable = IsCommutable in
  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
               (ins RC:$src1, RC:$src2),
               !if(Is2Addr,
                   !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                   !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
               [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
               Sched<[sched]>;
  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
               (ins RC:$src1, x86memop:$src2),
               !if(Is2Addr,
                   !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                   !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
               [(set RC:$dst, (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
               Sched<[sched.Folded, sched.ReadAfterFold]>;
}
} // ExeDomain = SSEPackedInt

multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode,
                         ValueType OpVT128, ValueType OpVT256,
                         X86SchedWriteWidths sched, bit IsCommutable,
                         Predicate prd> {
let Predicates = [HasAVX, prd] in
  defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128,
                             VR128, load, i128mem, sched.XMM,
                             IsCommutable, 0>, VEX_4V, VEX_WIG;

let Constraints = "$src1 = $dst" in
  defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128,
                           memop, i128mem, sched.XMM, IsCommutable, 1>;

let Predicates = [HasAVX2, prd] in
  defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode,
                               OpVT256, VR256, load, i256mem, sched.YMM,
                               IsCommutable, 0>, VEX_4V, VEX_L, VEX_WIG;
}

// These are ordered here for pattern ordering requirements with the fp versions

defm PAND  : PDI_binop_all<0xDB, "pand", and, v2i64, v4i64,
                           SchedWriteVecLogic, 1, NoVLX>;
defm POR   : PDI_binop_all<0xEB, "por", or, v2i64, v4i64,
                           SchedWriteVecLogic, 1, NoVLX>;
defm PXOR  : PDI_binop_all<0xEF, "pxor", xor, v2i64, v4i64,
                           SchedWriteVecLogic, 1, NoVLX>;
defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64,
                           SchedWriteVecLogic, 0, NoVLX>;

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Logical Instructions
//===----------------------------------------------------------------------===//

/// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops
///
/// There are no patterns here because isel prefers integer versions for SSE2
/// and later. There are SSE1 v4f32 patterns later.
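/// For example, on SSE2 and later a v4f32 logical op is bitcast to an integer
/// type and selected as PAND/POR/PXOR above; the X86fand/X86for/X86fxor/
/// X86fandn patterns near the end of this section cover the SSE1-only case.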
multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
                                   X86SchedWriteWidths sched> {
  let Predicates = [HasAVX, NoVLX] in {
  defm V#NAME#PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle,
        !strconcat(OpcodeStr, "ps"), f256mem, sched.YMM,
        [], [], 0>, PS, VEX_4V, VEX_L, VEX_WIG;

  defm V#NAME#PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble,
        !strconcat(OpcodeStr, "pd"), f256mem, sched.YMM,
        [], [], 0>, PD, VEX_4V, VEX_L, VEX_WIG;

  defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
        !strconcat(OpcodeStr, "ps"), f128mem, sched.XMM,
        [], [], 0>, PS, VEX_4V, VEX_WIG;

  defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
        !strconcat(OpcodeStr, "pd"), f128mem, sched.XMM,
        [], [], 0>, PD, VEX_4V, VEX_WIG;
  }

  let Constraints = "$src1 = $dst" in {
    defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
          !strconcat(OpcodeStr, "ps"), f128mem, sched.XMM,
          [], []>, PS;

    defm PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
          !strconcat(OpcodeStr, "pd"), f128mem, sched.XMM,
          [], []>, PD;
  }
}

defm AND : sse12_fp_packed_logical<0x54, "and", SchedWriteFLogic>;
defm OR  : sse12_fp_packed_logical<0x56, "or", SchedWriteFLogic>;
defm XOR : sse12_fp_packed_logical<0x57, "xor", SchedWriteFLogic>;
let isCommutable = 0 in
  defm ANDN : sse12_fp_packed_logical<0x55, "andn", SchedWriteFLogic>;

let Predicates = [HasAVX2, NoVLX] in {
  def : Pat<(v32i8 (and VR256:$src1, VR256:$src2)),
            (VPANDYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v16i16 (and VR256:$src1, VR256:$src2)),
            (VPANDYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (and VR256:$src1, VR256:$src2)),
            (VPANDYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(v32i8 (or VR256:$src1, VR256:$src2)),
            (VPORYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v16i16 (or VR256:$src1, VR256:$src2)),
            (VPORYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (or VR256:$src1, VR256:$src2)),
            (VPORYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(v32i8 (xor VR256:$src1, VR256:$src2)),
            (VPXORYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v16i16 (xor VR256:$src1, VR256:$src2)),
            (VPXORYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (xor VR256:$src1, VR256:$src2)),
            (VPXORYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(v32i8 (X86andnp VR256:$src1, VR256:$src2)),
            (VPANDNYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v16i16 (X86andnp VR256:$src1, VR256:$src2)),
            (VPANDNYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (X86andnp VR256:$src1, VR256:$src2)),
            (VPANDNYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(and VR256:$src1, (loadv32i8 addr:$src2)),
            (VPANDYrm VR256:$src1, addr:$src2)>;
  def : Pat<(and VR256:$src1, (loadv16i16 addr:$src2)),
            (VPANDYrm VR256:$src1, addr:$src2)>;
  def : Pat<(and VR256:$src1, (loadv8i32 addr:$src2)),
            (VPANDYrm VR256:$src1, addr:$src2)>;

  def : Pat<(or VR256:$src1, (loadv32i8 addr:$src2)),
            (VPORYrm VR256:$src1, addr:$src2)>;
  def : Pat<(or VR256:$src1, (loadv16i16 addr:$src2)),
            (VPORYrm VR256:$src1, addr:$src2)>;
  def : Pat<(or VR256:$src1, (loadv8i32 addr:$src2)),
            (VPORYrm VR256:$src1, addr:$src2)>;

  def : Pat<(xor VR256:$src1, (loadv32i8 addr:$src2)),
            (VPXORYrm VR256:$src1, addr:$src2)>;
  def : Pat<(xor VR256:$src1, (loadv16i16 addr:$src2)),
            (VPXORYrm VR256:$src1, addr:$src2)>;
  def : Pat<(xor VR256:$src1, (loadv8i32 addr:$src2)),
            (VPXORYrm VR256:$src1, addr:$src2)>;

  def : Pat<(X86andnp VR256:$src1, (loadv32i8 addr:$src2)),
            (VPANDNYrm VR256:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR256:$src1, (loadv16i16 addr:$src2)),
            (VPANDNYrm VR256:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR256:$src1, (loadv8i32 addr:$src2)),
            (VPANDNYrm VR256:$src1, addr:$src2)>;
}

// If only AVX1 is supported, we need to handle integer operations with
// floating point instructions since the integer versions aren't available.
let Predicates = [HasAVX1Only] in {
  def : Pat<(v32i8 (and VR256:$src1, VR256:$src2)),
            (VANDPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v16i16 (and VR256:$src1, VR256:$src2)),
            (VANDPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (and VR256:$src1, VR256:$src2)),
            (VANDPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v4i64 (and VR256:$src1, VR256:$src2)),
            (VANDPSYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(v32i8 (or VR256:$src1, VR256:$src2)),
            (VORPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v16i16 (or VR256:$src1, VR256:$src2)),
            (VORPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (or VR256:$src1, VR256:$src2)),
            (VORPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v4i64 (or VR256:$src1, VR256:$src2)),
            (VORPSYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(v32i8 (xor VR256:$src1, VR256:$src2)),
            (VXORPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v16i16 (xor VR256:$src1, VR256:$src2)),
            (VXORPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (xor VR256:$src1, VR256:$src2)),
            (VXORPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v4i64 (xor VR256:$src1, VR256:$src2)),
            (VXORPSYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(v32i8 (X86andnp VR256:$src1, VR256:$src2)),
            (VANDNPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v16i16 (X86andnp VR256:$src1, VR256:$src2)),
            (VANDNPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (X86andnp VR256:$src1, VR256:$src2)),
            (VANDNPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v4i64 (X86andnp VR256:$src1, VR256:$src2)),
            (VANDNPSYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(and VR256:$src1, (loadv32i8 addr:$src2)),
            (VANDPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(and VR256:$src1, (loadv16i16 addr:$src2)),
            (VANDPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(and VR256:$src1, (loadv8i32 addr:$src2)),
            (VANDPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(and VR256:$src1, (loadv4i64 addr:$src2)),
            (VANDPSYrm VR256:$src1, addr:$src2)>;

  def : Pat<(or VR256:$src1, (loadv32i8 addr:$src2)),
            (VORPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(or VR256:$src1, (loadv16i16 addr:$src2)),
            (VORPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(or VR256:$src1, (loadv8i32 addr:$src2)),
            (VORPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(or VR256:$src1, (loadv4i64 addr:$src2)),
            (VORPSYrm VR256:$src1, addr:$src2)>;

  def : Pat<(xor VR256:$src1, (loadv32i8 addr:$src2)),
            (VXORPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(xor VR256:$src1, (loadv16i16 addr:$src2)),
            (VXORPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(xor VR256:$src1, (loadv8i32 addr:$src2)),
            (VXORPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(xor VR256:$src1, (loadv4i64 addr:$src2)),
            (VXORPSYrm VR256:$src1, addr:$src2)>;

  def : Pat<(X86andnp VR256:$src1, (loadv32i8 addr:$src2)),
            (VANDNPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR256:$src1, (loadv16i16 addr:$src2)),
            (VANDNPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR256:$src1, (loadv8i32 addr:$src2)),
            (VANDNPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR256:$src1, (loadv4i64 addr:$src2)),
            (VANDNPSYrm VR256:$src1, addr:$src2)>;
}

let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v16i8 (and VR128:$src1, VR128:$src2)),
            (VPANDrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v8i16 (and VR128:$src1, VR128:$src2)),
            (VPANDrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (and VR128:$src1, VR128:$src2)),
            (VPANDrr VR128:$src1, VR128:$src2)>;

  def : Pat<(v16i8 (or VR128:$src1, VR128:$src2)),
            (VPORrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v8i16 (or VR128:$src1, VR128:$src2)),
            (VPORrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (or VR128:$src1, VR128:$src2)),
            (VPORrr VR128:$src1, VR128:$src2)>;

  def : Pat<(v16i8 (xor VR128:$src1, VR128:$src2)),
            (VPXORrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v8i16 (xor VR128:$src1, VR128:$src2)),
            (VPXORrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (xor VR128:$src1, VR128:$src2)),
            (VPXORrr VR128:$src1, VR128:$src2)>;

  def : Pat<(v16i8 (X86andnp VR128:$src1, VR128:$src2)),
            (VPANDNrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v8i16 (X86andnp VR128:$src1, VR128:$src2)),
            (VPANDNrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (X86andnp VR128:$src1, VR128:$src2)),
            (VPANDNrr VR128:$src1, VR128:$src2)>;

  def : Pat<(and VR128:$src1, (loadv16i8 addr:$src2)),
            (VPANDrm VR128:$src1, addr:$src2)>;
  def : Pat<(and VR128:$src1, (loadv8i16 addr:$src2)),
            (VPANDrm VR128:$src1, addr:$src2)>;
  def : Pat<(and VR128:$src1, (loadv4i32 addr:$src2)),
            (VPANDrm VR128:$src1, addr:$src2)>;

  def : Pat<(or VR128:$src1, (loadv16i8 addr:$src2)),
            (VPORrm VR128:$src1, addr:$src2)>;
  def : Pat<(or VR128:$src1, (loadv8i16 addr:$src2)),
            (VPORrm VR128:$src1, addr:$src2)>;
  def : Pat<(or VR128:$src1, (loadv4i32 addr:$src2)),
            (VPORrm VR128:$src1, addr:$src2)>;

  def : Pat<(xor VR128:$src1, (loadv16i8 addr:$src2)),
            (VPXORrm VR128:$src1, addr:$src2)>;
  def : Pat<(xor VR128:$src1, (loadv8i16 addr:$src2)),
            (VPXORrm VR128:$src1, addr:$src2)>;
  def : Pat<(xor VR128:$src1, (loadv4i32 addr:$src2)),
            (VPXORrm VR128:$src1, addr:$src2)>;

  def : Pat<(X86andnp VR128:$src1, (loadv16i8 addr:$src2)),
            (VPANDNrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR128:$src1, (loadv8i16 addr:$src2)),
            (VPANDNrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR128:$src1, (loadv4i32 addr:$src2)),
            (VPANDNrm VR128:$src1, addr:$src2)>;
}

let Predicates = [UseSSE2] in {
  def : Pat<(v16i8 (and VR128:$src1, VR128:$src2)),
            (PANDrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v8i16 (and VR128:$src1, VR128:$src2)),
            (PANDrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (and VR128:$src1, VR128:$src2)),
            (PANDrr VR128:$src1, VR128:$src2)>;

  def : Pat<(v16i8 (or VR128:$src1, VR128:$src2)),
            (PORrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v8i16 (or VR128:$src1, VR128:$src2)),
            (PORrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (or VR128:$src1, VR128:$src2)),
            (PORrr VR128:$src1, VR128:$src2)>;

  def : Pat<(v16i8 (xor VR128:$src1, VR128:$src2)),
            (PXORrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v8i16 (xor VR128:$src1, VR128:$src2)),
            (PXORrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (xor VR128:$src1, VR128:$src2)),
            (PXORrr VR128:$src1, VR128:$src2)>;

  def : Pat<(v16i8 (X86andnp VR128:$src1, VR128:$src2)),
            (PANDNrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v8i16 (X86andnp VR128:$src1, VR128:$src2)),
            (PANDNrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (X86andnp VR128:$src1, VR128:$src2)),
            (PANDNrr VR128:$src1, VR128:$src2)>;

  def : Pat<(and VR128:$src1, (memopv16i8 addr:$src2)),
            (PANDrm VR128:$src1, addr:$src2)>;
  def : Pat<(and VR128:$src1, (memopv8i16 addr:$src2)),
            (PANDrm VR128:$src1, addr:$src2)>;
  def : Pat<(and VR128:$src1, (memopv4i32 addr:$src2)),
            (PANDrm VR128:$src1, addr:$src2)>;

  def : Pat<(or VR128:$src1, (memopv16i8 addr:$src2)),
            (PORrm VR128:$src1, addr:$src2)>;
  def : Pat<(or VR128:$src1, (memopv8i16 addr:$src2)),
            (PORrm VR128:$src1, addr:$src2)>;
  def : Pat<(or VR128:$src1, (memopv4i32 addr:$src2)),
            (PORrm VR128:$src1, addr:$src2)>;

  def : Pat<(xor VR128:$src1, (memopv16i8 addr:$src2)),
            (PXORrm VR128:$src1, addr:$src2)>;
  def : Pat<(xor VR128:$src1, (memopv8i16 addr:$src2)),
            (PXORrm VR128:$src1, addr:$src2)>;
  def : Pat<(xor VR128:$src1, (memopv4i32 addr:$src2)),
            (PXORrm VR128:$src1, addr:$src2)>;

  def : Pat<(X86andnp VR128:$src1, (memopv16i8 addr:$src2)),
            (PANDNrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR128:$src1, (memopv8i16 addr:$src2)),
            (PANDNrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR128:$src1, (memopv4i32 addr:$src2)),
            (PANDNrm VR128:$src1, addr:$src2)>;
}

// Patterns for packed operations when we don't have an integer type available.
def : Pat<(v4f32 (X86fand VR128:$src1, VR128:$src2)),
          (ANDPSrr VR128:$src1, VR128:$src2)>;
def : Pat<(v4f32 (X86for VR128:$src1, VR128:$src2)),
          (ORPSrr VR128:$src1, VR128:$src2)>;
def : Pat<(v4f32 (X86fxor VR128:$src1, VR128:$src2)),
          (XORPSrr VR128:$src1, VR128:$src2)>;
def : Pat<(v4f32 (X86fandn VR128:$src1, VR128:$src2)),
          (ANDNPSrr VR128:$src1, VR128:$src2)>;

def : Pat<(X86fand VR128:$src1, (memopv4f32 addr:$src2)),
          (ANDPSrm VR128:$src1, addr:$src2)>;
def : Pat<(X86for VR128:$src1, (memopv4f32 addr:$src2)),
          (ORPSrm VR128:$src1, addr:$src2)>;
def : Pat<(X86fxor VR128:$src1, (memopv4f32 addr:$src2)),
          (XORPSrm VR128:$src1, addr:$src2)>;
def : Pat<(X86fandn VR128:$src1, (memopv4f32 addr:$src2)),
          (ANDNPSrm VR128:$src1, addr:$src2)>;

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Arithmetic Instructions
//===----------------------------------------------------------------------===//

/// basic_sse12_fp_binop_xxx - SSE 1 & 2 binops come in both scalar and
/// vector forms.
///
/// In addition, we also have a special variant of the scalar form here to
/// represent the associated intrinsic operation. This form is unlike the
/// plain scalar form, in that it takes an entire vector (instead of a scalar)
/// and leaves the top elements unmodified (therefore these cannot be commuted).
///
/// These three forms can each be reg+reg or reg+mem.
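/// For example, the SSE1 add yields ADDPS (packed), ADDSS (plain scalar) and
/// ADDSSrr_Int (the intrinsic form, which preserves the upper elements).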
2635/// 2636 2637/// FIXME: once all 256-bit intrinsics are matched, cleanup and refactor those 2638/// classes below 2639multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr, 2640 SDPatternOperator OpNode, X86SchedWriteSizes sched> { 2641let Uses = [MXCSR], mayRaiseFPException = 1 in { 2642 let Predicates = [HasAVX, NoVLX] in { 2643 defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, 2644 VR128, v4f32, f128mem, loadv4f32, 2645 SSEPackedSingle, sched.PS.XMM, 0>, PS, VEX_4V, VEX_WIG; 2646 defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, 2647 VR128, v2f64, f128mem, loadv2f64, 2648 SSEPackedDouble, sched.PD.XMM, 0>, PD, VEX_4V, VEX_WIG; 2649 2650 defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), 2651 OpNode, VR256, v8f32, f256mem, loadv8f32, 2652 SSEPackedSingle, sched.PS.YMM, 0>, PS, VEX_4V, VEX_L, VEX_WIG; 2653 defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), 2654 OpNode, VR256, v4f64, f256mem, loadv4f64, 2655 SSEPackedDouble, sched.PD.YMM, 0>, PD, VEX_4V, VEX_L, VEX_WIG; 2656 } 2657 2658 let Constraints = "$src1 = $dst" in { 2659 defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128, 2660 v4f32, f128mem, memopv4f32, SSEPackedSingle, 2661 sched.PS.XMM>, PS; 2662 defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128, 2663 v2f64, f128mem, memopv2f64, SSEPackedDouble, 2664 sched.PD.XMM>, PD; 2665 } 2666} 2667} 2668 2669multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode, 2670 X86SchedWriteSizes sched> { 2671let Uses = [MXCSR], mayRaiseFPException = 1 in { 2672 defm V#NAME#SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"), 2673 OpNode, FR32, f32mem, SSEPackedSingle, sched.PS.Scl, 0>, 2674 XS, VEX_4V, VEX_LIG, VEX_WIG; 2675 defm V#NAME#SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"), 2676 OpNode, FR64, f64mem, SSEPackedDouble, sched.PD.Scl, 0>, 2677 XD, VEX_4V, VEX_LIG, VEX_WIG; 2678 2679 let Constraints = "$src1 = $dst" in { 2680 defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"), 2681 OpNode, FR32, f32mem, SSEPackedSingle, 2682 sched.PS.Scl>, XS; 2683 defm SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"), 2684 OpNode, FR64, f64mem, SSEPackedDouble, 2685 sched.PD.Scl>, XD; 2686 } 2687} 2688} 2689 2690multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr, 2691 SDPatternOperator OpNode, 2692 X86SchedWriteSizes sched> { 2693let Uses = [MXCSR], mayRaiseFPException = 1 in { 2694 defm V#NAME#SS : sse12_fp_scalar_int<opc, OpNode, VR128, v4f32, 2695 !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32, 2696 SSEPackedSingle, sched.PS.Scl, 0>, XS, VEX_4V, VEX_LIG, VEX_WIG; 2697 defm V#NAME#SD : sse12_fp_scalar_int<opc, OpNode, VR128, v2f64, 2698 !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64, 2699 SSEPackedDouble, sched.PD.Scl, 0>, XD, VEX_4V, VEX_LIG, VEX_WIG; 2700 2701 let Constraints = "$src1 = $dst" in { 2702 defm SS : sse12_fp_scalar_int<opc, OpNode, VR128, v4f32, 2703 !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32, 2704 SSEPackedSingle, sched.PS.Scl>, XS; 2705 defm SD : sse12_fp_scalar_int<opc, OpNode, VR128, v2f64, 2706 !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64, 2707 SSEPackedDouble, sched.PD.Scl>, XD; 2708 } 2709} 2710} 2711 2712// Binary Arithmetic instructions 2713defm ADD : basic_sse12_fp_binop_p<0x58, "add", any_fadd, SchedWriteFAddSizes>, 2714 basic_sse12_fp_binop_s<0x58, "add", any_fadd, SchedWriteFAddSizes>, 2715 basic_sse12_fp_binop_s_int<0x58, "add", null_frag, 
                                       SchedWriteFAddSizes>;
defm MUL : basic_sse12_fp_binop_p<0x59, "mul", any_fmul, SchedWriteFMulSizes>,
           basic_sse12_fp_binop_s<0x59, "mul", any_fmul, SchedWriteFMulSizes>,
           basic_sse12_fp_binop_s_int<0x59, "mul", null_frag,
                                      SchedWriteFMulSizes>;
let isCommutable = 0 in {
  defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", any_fsub, SchedWriteFAddSizes>,
             basic_sse12_fp_binop_s<0x5C, "sub", any_fsub, SchedWriteFAddSizes>,
             basic_sse12_fp_binop_s_int<0x5C, "sub", null_frag,
                                        SchedWriteFAddSizes>;
  defm DIV : basic_sse12_fp_binop_p<0x5E, "div", any_fdiv, SchedWriteFDivSizes>,
             basic_sse12_fp_binop_s<0x5E, "div", any_fdiv, SchedWriteFDivSizes>,
             basic_sse12_fp_binop_s_int<0x5E, "div", null_frag,
                                        SchedWriteFDivSizes>;
  defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SchedWriteFCmpSizes>,
             basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SchedWriteFCmpSizes>,
             basic_sse12_fp_binop_s_int<0x5F, "max", X86fmaxs,
                                        SchedWriteFCmpSizes>;
  defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SchedWriteFCmpSizes>,
             basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SchedWriteFCmpSizes>,
             basic_sse12_fp_binop_s_int<0x5D, "min", X86fmins,
                                        SchedWriteFCmpSizes>;
}

let isCodeGenOnly = 1 in {
  defm MAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SchedWriteFCmpSizes>,
             basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SchedWriteFCmpSizes>;
  defm MINC: basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SchedWriteFCmpSizes>,
             basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SchedWriteFCmpSizes>;
}

// Patterns used to select SSE scalar fp arithmetic instructions from either:
//
// (1) a scalar fp operation followed by a blend
//
//     The effect is that the backend no longer emits unnecessary vector
//     insert instructions immediately after SSE scalar fp instructions
//     like addss or mulss.
//
//     For example, given the following code:
//       __m128 foo(__m128 A, __m128 B) {
//         A[0] += B[0];
//         return A;
//       }
//
//     Previously we generated:
//       addss %xmm0, %xmm1
//       movss %xmm1, %xmm0
//
//     We now generate:
//       addss %xmm1, %xmm0
//
// (2) a vector packed single/double fp operation followed by a vector insert
//
//     The effect is that the backend converts the packed fp instruction
//     followed by a vector insert into a single SSE scalar fp instruction.
//
//     For example, given the following code:
//       __m128 foo(__m128 A, __m128 B) {
//         __m128 C = A + B;
//         return (__m128) {C[0], A[1], A[2], A[3]};
//       }
//
//     Previously we generated:
//       addps %xmm0, %xmm1
//       movss %xmm1, %xmm0
//
//     We now generate:
//       addss %xmm1, %xmm0

// TODO: Some canonicalization in lowering would simplify the number of
// patterns we have to try to match.
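// As an illustrative sketch of case (1), for addss the DAG matched by
// scalar_math_patterns below has the shape:
//   (v4f32 (X86Movss VR128:$dst,
//            (scalar_to_vector
//              (fadd (extractelt VR128:$dst, 0), FR32:$src))))
// and is rewritten to a single ADDSSrr_Int.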
2783multiclass scalar_math_patterns<SDPatternOperator Op, string OpcPrefix, SDNode Move, 2784 ValueType VT, ValueType EltTy, 2785 RegisterClass RC, PatFrag ld_frag, 2786 Predicate BasePredicate> { 2787 let Predicates = [BasePredicate] in { 2788 // extracted scalar math op with insert via movss/movsd 2789 def : Pat<(VT (Move (VT VR128:$dst), 2790 (VT (scalar_to_vector 2791 (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))), 2792 RC:$src))))), 2793 (!cast<Instruction>(OpcPrefix#rr_Int) VT:$dst, 2794 (VT (COPY_TO_REGCLASS RC:$src, VR128)))>; 2795 def : Pat<(VT (Move (VT VR128:$dst), 2796 (VT (scalar_to_vector 2797 (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))), 2798 (ld_frag addr:$src)))))), 2799 (!cast<Instruction>(OpcPrefix#rm_Int) VT:$dst, addr:$src)>; 2800 } 2801 2802 // Repeat for AVX versions of the instructions. 2803 let Predicates = [UseAVX] in { 2804 // extracted scalar math op with insert via movss/movsd 2805 def : Pat<(VT (Move (VT VR128:$dst), 2806 (VT (scalar_to_vector 2807 (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))), 2808 RC:$src))))), 2809 (!cast<Instruction>("V"#OpcPrefix#rr_Int) VT:$dst, 2810 (VT (COPY_TO_REGCLASS RC:$src, VR128)))>; 2811 def : Pat<(VT (Move (VT VR128:$dst), 2812 (VT (scalar_to_vector 2813 (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))), 2814 (ld_frag addr:$src)))))), 2815 (!cast<Instruction>("V"#OpcPrefix#rm_Int) VT:$dst, addr:$src)>; 2816 } 2817} 2818 2819defm : scalar_math_patterns<any_fadd, "ADDSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>; 2820defm : scalar_math_patterns<any_fsub, "SUBSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>; 2821defm : scalar_math_patterns<any_fmul, "MULSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>; 2822defm : scalar_math_patterns<any_fdiv, "DIVSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>; 2823 2824defm : scalar_math_patterns<any_fadd, "ADDSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>; 2825defm : scalar_math_patterns<any_fsub, "SUBSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>; 2826defm : scalar_math_patterns<any_fmul, "MULSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>; 2827defm : scalar_math_patterns<any_fdiv, "DIVSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>; 2828 2829/// Unop Arithmetic 2830/// In addition, we also have a special variant of the scalar form here to 2831/// represent the associated intrinsic operation. This form is unlike the 2832/// plain scalar form, in that it takes an entire vector (instead of a 2833/// scalar) and leaves the top elements undefined. 2834/// 2835/// And, we have a special variant form for a full-vector intrinsic form. 2836 2837/// sse_fp_unop_s - SSE1 unops in scalar form 2838/// For the non-AVX defs, we need $src1 to be tied to $dst because 2839/// the HW instructions are 2 operand / destructive. 
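/// E.g. "rcpss %xmm1, %xmm0" writes only the low f32 lane of %xmm0 and
/// passes the upper lanes of %xmm0 through unchanged, hence the tied
/// "$src1 = $dst" constraint on the _Int defs below.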
2840multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC, 2841 X86MemOperand x86memop, Operand intmemop, 2842 SDPatternOperator OpNode, Domain d, 2843 X86FoldableSchedWrite sched, Predicate target> { 2844 let isCodeGenOnly = 1, hasSideEffects = 0 in { 2845 def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1), 2846 !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"), 2847 [(set RC:$dst, (OpNode RC:$src1))], d>, Sched<[sched]>, 2848 Requires<[target]>; 2849 let mayLoad = 1 in 2850 def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1), 2851 !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"), 2852 [(set RC:$dst, (OpNode (load addr:$src1)))], d>, 2853 Sched<[sched.Folded]>, 2854 Requires<[target, OptForSize]>; 2855 } 2856 2857 let hasSideEffects = 0, Constraints = "$src1 = $dst", ExeDomain = d in { 2858 def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), 2859 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>, 2860 Sched<[sched]>; 2861 let mayLoad = 1 in 2862 def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, intmemop:$src2), 2863 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>, 2864 Sched<[sched.Folded, sched.ReadAfterFold]>; 2865 } 2866 2867} 2868 2869multiclass sse_fp_unop_s_intr<ValueType vt, PatFrags mem_frags, 2870 Intrinsic Intr, Predicate target> { 2871 let Predicates = [target] in { 2872 // These are unary operations, but they are modeled as having 2 source operands 2873 // because the high elements of the destination are unchanged in SSE. 2874 def : Pat<(Intr VR128:$src), 2875 (!cast<Instruction>(NAME#r_Int) VR128:$src, VR128:$src)>; 2876 } 2877 // We don't want to fold scalar loads into these instructions unless 2878 // optimizing for size. This is because the folded instruction will have a 2879 // partial register update, while the unfolded sequence will not, e.g. 2880 // movss mem, %xmm0 2881 // rcpss %xmm0, %xmm0 2882 // which has a clobber before the rcp, vs. 
2883 // rcpss mem, %xmm0 2884 let Predicates = [target, OptForSize] in { 2885 def : Pat<(Intr (mem_frags addr:$src2)), 2886 (!cast<Instruction>(NAME#m_Int) 2887 (vt (IMPLICIT_DEF)), addr:$src2)>; 2888 } 2889} 2890 2891multiclass avx_fp_unop_s_intr<ValueType vt, PatFrags mem_frags, 2892 Intrinsic Intr, Predicate target> { 2893 let Predicates = [target] in { 2894 def : Pat<(Intr VR128:$src), 2895 (!cast<Instruction>(NAME#r_Int) VR128:$src, 2896 VR128:$src)>; 2897 } 2898 let Predicates = [target, OptForSize] in { 2899 def : Pat<(Intr (mem_frags addr:$src2)), 2900 (!cast<Instruction>(NAME#m_Int) 2901 (vt (IMPLICIT_DEF)), addr:$src2)>; 2902 } 2903} 2904 2905multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC, 2906 ValueType ScalarVT, X86MemOperand x86memop, 2907 Operand intmemop, SDPatternOperator OpNode, Domain d, 2908 X86FoldableSchedWrite sched, Predicate target> { 2909 let isCodeGenOnly = 1, hasSideEffects = 0 in { 2910 def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), 2911 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 2912 [], d>, Sched<[sched]>; 2913 let mayLoad = 1 in 2914 def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), 2915 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 2916 [], d>, Sched<[sched.Folded, sched.ReadAfterFold]>; 2917 } 2918 let hasSideEffects = 0, ExeDomain = d in { 2919 def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst), 2920 (ins VR128:$src1, VR128:$src2), 2921 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 2922 []>, Sched<[sched]>; 2923 let mayLoad = 1 in 2924 def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), 2925 (ins VR128:$src1, intmemop:$src2), 2926 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 2927 []>, Sched<[sched.Folded, sched.ReadAfterFold]>; 2928 } 2929 2930 // We don't want to fold scalar loads into these instructions unless 2931 // optimizing for size. This is because the folded instruction will have a 2932 // partial register update, while the unfolded sequence will not, e.g. 2933 // vmovss mem, %xmm0 2934 // vrcpss %xmm0, %xmm0, %xmm0 2935 // which has a clobber before the rcp, vs. 2936 // vrcpss mem, %xmm0, %xmm0 2937 // TODO: In theory, we could fold the load, and avoid the stall caused by 2938 // the partial register store, either in BreakFalseDeps or with smarter RA. 2939 let Predicates = [target] in { 2940 def : Pat<(OpNode RC:$src), (!cast<Instruction>(NAME#r) 2941 (ScalarVT (IMPLICIT_DEF)), RC:$src)>; 2942 } 2943 let Predicates = [target, OptForSize] in { 2944 def : Pat<(ScalarVT (OpNode (load addr:$src))), 2945 (!cast<Instruction>(NAME#m) (ScalarVT (IMPLICIT_DEF)), 2946 addr:$src)>; 2947 } 2948} 2949 2950/// sse1_fp_unop_p - SSE1 unops in packed form. 
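/// Instantiated by e.g. the RSQRT/RCP defms further down; for NAME = RSQRT
/// this produces (illustratively) VRSQRTPSr/m and VRSQRTPSYr/m under the
/// AVX predicates, plus RSQRTPSr/m for plain SSE1.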
2951multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode, 2952 X86SchedWriteWidths sched, list<Predicate> prds> { 2953let Predicates = prds in { 2954 def V#NAME#PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 2955 !strconcat("v", OpcodeStr, 2956 "ps\t{$src, $dst|$dst, $src}"), 2957 [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>, 2958 VEX, Sched<[sched.XMM]>, VEX_WIG; 2959 def V#NAME#PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 2960 !strconcat("v", OpcodeStr, 2961 "ps\t{$src, $dst|$dst, $src}"), 2962 [(set VR128:$dst, (OpNode (loadv4f32 addr:$src)))]>, 2963 VEX, Sched<[sched.XMM.Folded]>, VEX_WIG; 2964 def V#NAME#PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), 2965 !strconcat("v", OpcodeStr, 2966 "ps\t{$src, $dst|$dst, $src}"), 2967 [(set VR256:$dst, (v8f32 (OpNode VR256:$src)))]>, 2968 VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG; 2969 def V#NAME#PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), 2970 !strconcat("v", OpcodeStr, 2971 "ps\t{$src, $dst|$dst, $src}"), 2972 [(set VR256:$dst, (OpNode (loadv8f32 addr:$src)))]>, 2973 VEX, VEX_L, Sched<[sched.YMM.Folded]>, VEX_WIG; 2974} 2975 2976 def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 2977 !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), 2978 [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>, 2979 Sched<[sched.XMM]>; 2980 def PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 2981 !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), 2982 [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))]>, 2983 Sched<[sched.XMM.Folded]>; 2984} 2985 2986/// sse2_fp_unop_p - SSE2 unops in vector forms. 2987multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr, 2988 SDPatternOperator OpNode, X86SchedWriteWidths sched> { 2989let Predicates = [HasAVX, NoVLX] in { 2990 def V#NAME#PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 2991 !strconcat("v", OpcodeStr, 2992 "pd\t{$src, $dst|$dst, $src}"), 2993 [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>, 2994 VEX, Sched<[sched.XMM]>, VEX_WIG; 2995 def V#NAME#PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 2996 !strconcat("v", OpcodeStr, 2997 "pd\t{$src, $dst|$dst, $src}"), 2998 [(set VR128:$dst, (OpNode (loadv2f64 addr:$src)))]>, 2999 VEX, Sched<[sched.XMM.Folded]>, VEX_WIG; 3000 def V#NAME#PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), 3001 !strconcat("v", OpcodeStr, 3002 "pd\t{$src, $dst|$dst, $src}"), 3003 [(set VR256:$dst, (v4f64 (OpNode VR256:$src)))]>, 3004 VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG; 3005 def V#NAME#PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), 3006 !strconcat("v", OpcodeStr, 3007 "pd\t{$src, $dst|$dst, $src}"), 3008 [(set VR256:$dst, (OpNode (loadv4f64 addr:$src)))]>, 3009 VEX, VEX_L, Sched<[sched.YMM.Folded]>, VEX_WIG; 3010} 3011 3012 def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 3013 !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), 3014 [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>, 3015 Sched<[sched.XMM]>; 3016 def PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 3017 !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), 3018 [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))]>, 3019 Sched<[sched.XMM.Folded]>; 3020} 3021 3022multiclass sse1_fp_unop_s_intr<string OpcodeStr, Predicate AVXTarget> { 3023 defm SS : sse_fp_unop_s_intr<v4f32, sse_load_f32, 3024 !cast<Intrinsic>("int_x86_sse_"#OpcodeStr#_ss), 3025 UseSSE1>, XS; 3026 defm V#NAME#SS : 
avx_fp_unop_s_intr<v4f32, sse_load_f32, 3027 !cast<Intrinsic>("int_x86_sse_"#OpcodeStr#_ss), 3028 AVXTarget>, 3029 XS, VEX_4V, VEX_LIG, VEX_WIG, NotMemoryFoldable; 3030} 3031 3032multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode, 3033 X86SchedWriteWidths sched, Predicate AVXTarget> { 3034 defm SS : sse_fp_unop_s<opc, OpcodeStr#ss, FR32, f32mem, 3035 ssmem, OpNode, SSEPackedSingle, sched.Scl, UseSSE1>, XS; 3036 defm V#NAME#SS : avx_fp_unop_s<opc, "v"#OpcodeStr#ss, FR32, f32, 3037 f32mem, ssmem, OpNode, SSEPackedSingle, sched.Scl, AVXTarget>, 3038 XS, VEX_4V, VEX_LIG, VEX_WIG; 3039} 3040 3041multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode, 3042 X86SchedWriteWidths sched, Predicate AVXTarget> { 3043 defm SD : sse_fp_unop_s<opc, OpcodeStr#sd, FR64, f64mem, 3044 sdmem, OpNode, SSEPackedDouble, sched.Scl, UseSSE2>, XD; 3045 defm V#NAME#SD : avx_fp_unop_s<opc, "v"#OpcodeStr#sd, FR64, f64, 3046 f64mem, sdmem, OpNode, SSEPackedDouble, sched.Scl, AVXTarget>, 3047 XD, VEX_4V, VEX_LIG, VEX_WIG; 3048} 3049 3050// Square root. 3051defm SQRT : sse1_fp_unop_s<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt, UseAVX>, 3052 sse1_fp_unop_p<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt, [HasAVX, NoVLX]>, 3053 sse2_fp_unop_s<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt64, UseAVX>, 3054 sse2_fp_unop_p<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt64>, SIMD_EXC; 3055 3056// Reciprocal approximations. Note that these typically require refinement 3057// in order to obtain suitable precision. 3058defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, HasAVX>, 3059 sse1_fp_unop_s_intr<"rsqrt", HasAVX>, 3060 sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, [HasAVX]>; 3061defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, SchedWriteFRcp, HasAVX>, 3062 sse1_fp_unop_s_intr<"rcp", HasAVX>, 3063 sse1_fp_unop_p<0x53, "rcp", X86frcp, SchedWriteFRcp, [HasAVX]>; 3064 3065// There is no f64 version of the reciprocal approximation instructions. 3066 3067multiclass scalar_unary_math_patterns<SDPatternOperator OpNode, string OpcPrefix, SDNode Move, 3068 ValueType VT, Predicate BasePredicate> { 3069 let Predicates = [BasePredicate] in { 3070 def : Pat<(VT (Move VT:$dst, (scalar_to_vector 3071 (OpNode (extractelt VT:$src, 0))))), 3072 (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src)>; 3073 } 3074 3075 // Repeat for AVX versions of the instructions. 3076 let Predicates = [UseAVX] in { 3077 def : Pat<(VT (Move VT:$dst, (scalar_to_vector 3078 (OpNode (extractelt VT:$src, 0))))), 3079 (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>; 3080 } 3081} 3082 3083defm : scalar_unary_math_patterns<any_fsqrt, "SQRTSS", X86Movss, v4f32, UseSSE1>; 3084defm : scalar_unary_math_patterns<any_fsqrt, "SQRTSD", X86Movsd, v2f64, UseSSE2>; 3085 3086multiclass scalar_unary_math_intr_patterns<Intrinsic Intr, string OpcPrefix, 3087 SDNode Move, ValueType VT, 3088 Predicate BasePredicate> { 3089 let Predicates = [BasePredicate] in { 3090 def : Pat<(VT (Move VT:$dst, (Intr VT:$src))), 3091 (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src)>; 3092 } 3093 3094 // Repeat for AVX versions of the instructions. 
3095 let Predicates = [HasAVX] in { 3096 def : Pat<(VT (Move VT:$dst, (Intr VT:$src))), 3097 (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>; 3098 } 3099} 3100 3101defm : scalar_unary_math_intr_patterns<int_x86_sse_rcp_ss, "RCPSS", X86Movss, 3102 v4f32, UseSSE1>; 3103defm : scalar_unary_math_intr_patterns<int_x86_sse_rsqrt_ss, "RSQRTSS", X86Movss, 3104 v4f32, UseSSE1>; 3105 3106 3107//===----------------------------------------------------------------------===// 3108// SSE 1 & 2 - Non-temporal stores 3109//===----------------------------------------------------------------------===// 3110 3111let AddedComplexity = 400 in { // Prefer non-temporal versions 3112let Predicates = [HasAVX, NoVLX] in { 3113let SchedRW = [SchedWriteFMoveLSNT.XMM.MR] in { 3114def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs), 3115 (ins f128mem:$dst, VR128:$src), 3116 "movntps\t{$src, $dst|$dst, $src}", 3117 [(alignednontemporalstore (v4f32 VR128:$src), 3118 addr:$dst)]>, VEX, VEX_WIG; 3119def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs), 3120 (ins f128mem:$dst, VR128:$src), 3121 "movntpd\t{$src, $dst|$dst, $src}", 3122 [(alignednontemporalstore (v2f64 VR128:$src), 3123 addr:$dst)]>, VEX, VEX_WIG; 3124} // SchedRW 3125 3126let SchedRW = [SchedWriteFMoveLSNT.YMM.MR] in { 3127def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs), 3128 (ins f256mem:$dst, VR256:$src), 3129 "movntps\t{$src, $dst|$dst, $src}", 3130 [(alignednontemporalstore (v8f32 VR256:$src), 3131 addr:$dst)]>, VEX, VEX_L, VEX_WIG; 3132def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs), 3133 (ins f256mem:$dst, VR256:$src), 3134 "movntpd\t{$src, $dst|$dst, $src}", 3135 [(alignednontemporalstore (v4f64 VR256:$src), 3136 addr:$dst)]>, VEX, VEX_L, VEX_WIG; 3137} // SchedRW 3138 3139let ExeDomain = SSEPackedInt in { 3140def VMOVNTDQmr : VPDI<0xE7, MRMDestMem, (outs), 3141 (ins i128mem:$dst, VR128:$src), 3142 "movntdq\t{$src, $dst|$dst, $src}", 3143 [(alignednontemporalstore (v2i64 VR128:$src), 3144 addr:$dst)]>, VEX, VEX_WIG, 3145 Sched<[SchedWriteVecMoveLSNT.XMM.MR]>; 3146def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs), 3147 (ins i256mem:$dst, VR256:$src), 3148 "movntdq\t{$src, $dst|$dst, $src}", 3149 [(alignednontemporalstore (v4i64 VR256:$src), 3150 addr:$dst)]>, VEX, VEX_L, VEX_WIG, 3151 Sched<[SchedWriteVecMoveLSNT.YMM.MR]>; 3152} // ExeDomain 3153} // Predicates 3154 3155let SchedRW = [SchedWriteFMoveLSNT.XMM.MR] in { 3156def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), 3157 "movntps\t{$src, $dst|$dst, $src}", 3158 [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)]>; 3159def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), 3160 "movntpd\t{$src, $dst|$dst, $src}", 3161 [(alignednontemporalstore(v2f64 VR128:$src), addr:$dst)]>; 3162} // SchedRW 3163 3164let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecMoveLSNT.XMM.MR] in 3165def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), 3166 "movntdq\t{$src, $dst|$dst, $src}", 3167 [(alignednontemporalstore (v2i64 VR128:$src), addr:$dst)]>; 3168 3169let SchedRW = [WriteStoreNT] in { 3170// There is no AVX form for instructions below this point 3171def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), 3172 "movnti{l}\t{$src, $dst|$dst, $src}", 3173 [(nontemporalstore (i32 GR32:$src), addr:$dst)]>, 3174 PS, Requires<[HasSSE2]>; 3175def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), 3176 "movnti{q}\t{$src, $dst|$dst, $src}", 3177 [(nontemporalstore (i64 GR64:$src), addr:$dst)]>, 3178 
PS, Requires<[HasSSE2]>; 3179} // SchedRW = [WriteStoreNT] 3180 3181let Predicates = [HasAVX, NoVLX] in { 3182 def : Pat<(alignednontemporalstore (v8i32 VR256:$src), addr:$dst), 3183 (VMOVNTDQYmr addr:$dst, VR256:$src)>; 3184 def : Pat<(alignednontemporalstore (v16i16 VR256:$src), addr:$dst), 3185 (VMOVNTDQYmr addr:$dst, VR256:$src)>; 3186 def : Pat<(alignednontemporalstore (v16f16 VR256:$src), addr:$dst), 3187 (VMOVNTDQYmr addr:$dst, VR256:$src)>; 3188 def : Pat<(alignednontemporalstore (v32i8 VR256:$src), addr:$dst), 3189 (VMOVNTDQYmr addr:$dst, VR256:$src)>; 3190 3191 def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst), 3192 (VMOVNTDQmr addr:$dst, VR128:$src)>; 3193 def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst), 3194 (VMOVNTDQmr addr:$dst, VR128:$src)>; 3195 def : Pat<(alignednontemporalstore (v8f16 VR128:$src), addr:$dst), 3196 (VMOVNTDQmr addr:$dst, VR128:$src)>; 3197 def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst), 3198 (VMOVNTDQmr addr:$dst, VR128:$src)>; 3199} 3200 3201let Predicates = [UseSSE2] in { 3202 def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst), 3203 (MOVNTDQmr addr:$dst, VR128:$src)>; 3204 def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst), 3205 (MOVNTDQmr addr:$dst, VR128:$src)>; 3206 def : Pat<(alignednontemporalstore (v8f16 VR128:$src), addr:$dst), 3207 (MOVNTDQmr addr:$dst, VR128:$src)>; 3208 def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst), 3209 (MOVNTDQmr addr:$dst, VR128:$src)>; 3210} 3211 3212} // AddedComplexity 3213 3214//===----------------------------------------------------------------------===// 3215// SSE 1 & 2 - Prefetch and memory fence 3216//===----------------------------------------------------------------------===// 3217 3218// Prefetch intrinsic. 3219let Predicates = [HasSSEPrefetch], SchedRW = [WriteLoad] in { 3220def PREFETCHT0 : I<0x18, MRM1m, (outs), (ins i8mem:$src), 3221 "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3), (i32 1))]>, TB; 3222def PREFETCHT1 : I<0x18, MRM2m, (outs), (ins i8mem:$src), 3223 "prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2), (i32 1))]>, TB; 3224def PREFETCHT2 : I<0x18, MRM3m, (outs), (ins i8mem:$src), 3225 "prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1), (i32 1))]>, TB; 3226def PREFETCHNTA : I<0x18, MRM0m, (outs), (ins i8mem:$src), 3227 "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0), (i32 1))]>, TB; 3228} 3229 3230// FIXME: How should flush instruction be modeled? 3231let SchedRW = [WriteLoad] in { 3232// Flush cache 3233def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src), 3234 "clflush\t$src", [(int_x86_sse2_clflush addr:$src)]>, 3235 PS, Requires<[HasCLFLUSH]>; 3236} 3237 3238let SchedRW = [WriteNop] in { 3239// Pause. This "instruction" is encoded as "rep; nop", so even though it 3240// was introduced with SSE2, it's backward compatible. 3241def PAUSE : I<0x90, RawFrm, (outs), (ins), 3242 "pause", [(int_x86_sse2_pause)]>, OBXS; 3243} 3244 3245let SchedRW = [WriteFence] in { 3246// Load, store, and memory fence 3247// TODO: As with mfence, we may want to ease the availability of sfence/lfence 3248// to include any 64-bit target. 
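// For illustration (assuming the usual clang lowering): _mm_sfence(),
// _mm_lfence() and _mm_mfence() reach these defs via the int_x86_sse_sfence,
// int_x86_sse2_lfence and int_x86_sse2_mfence intrinsics matched below.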
3249def SFENCE : I<0xAE, MRM7X, (outs), (ins), "sfence", [(int_x86_sse_sfence)]>, 3250 PS, Requires<[HasSSE1]>; 3251def LFENCE : I<0xAE, MRM5X, (outs), (ins), "lfence", [(int_x86_sse2_lfence)]>, 3252 PS, Requires<[HasSSE2]>; 3253def MFENCE : I<0xAE, MRM6X, (outs), (ins), "mfence", [(int_x86_sse2_mfence)]>, 3254 PS, Requires<[HasMFence]>; 3255} // SchedRW 3256 3257def : Pat<(X86MFence), (MFENCE)>; 3258 3259//===----------------------------------------------------------------------===// 3260// SSE 1 & 2 - Load/Store XCSR register 3261//===----------------------------------------------------------------------===// 3262 3263let mayLoad=1, hasSideEffects=1, Defs=[MXCSR] in 3264def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src), 3265 "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>, 3266 VEX, Sched<[WriteLDMXCSR]>, VEX_WIG; 3267let mayStore=1, hasSideEffects=1, Uses=[MXCSR] in 3268def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst), 3269 "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>, 3270 VEX, Sched<[WriteSTMXCSR]>, VEX_WIG; 3271 3272let mayLoad=1, hasSideEffects=1, Defs=[MXCSR] in 3273def LDMXCSR : I<0xAE, MRM2m, (outs), (ins i32mem:$src), 3274 "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>, 3275 PS, Sched<[WriteLDMXCSR]>; 3276let mayStore=1, hasSideEffects=1, Uses=[MXCSR] in 3277def STMXCSR : I<0xAE, MRM3m, (outs), (ins i32mem:$dst), 3278 "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>, 3279 PS, Sched<[WriteSTMXCSR]>; 3280 3281//===---------------------------------------------------------------------===// 3282// SSE2 - Move Aligned/Unaligned Packed Integer Instructions 3283//===---------------------------------------------------------------------===// 3284 3285let ExeDomain = SSEPackedInt in { // SSE integer instructions 3286 3287let hasSideEffects = 0 in { 3288def VMOVDQArr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 3289 "movdqa\t{$src, $dst|$dst, $src}", []>, 3290 Sched<[SchedWriteVecMoveLS.XMM.RR]>, VEX, VEX_WIG; 3291def VMOVDQUrr : VSSI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 3292 "movdqu\t{$src, $dst|$dst, $src}", []>, 3293 Sched<[SchedWriteVecMoveLS.XMM.RR]>, VEX, VEX_WIG; 3294def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), 3295 "movdqa\t{$src, $dst|$dst, $src}", []>, 3296 Sched<[SchedWriteVecMoveLS.YMM.RR]>, VEX, VEX_L, VEX_WIG; 3297def VMOVDQUYrr : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), 3298 "movdqu\t{$src, $dst|$dst, $src}", []>, 3299 Sched<[SchedWriteVecMoveLS.YMM.RR]>, VEX, VEX_L, VEX_WIG; 3300} 3301 3302// For Disassembler 3303let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { 3304def VMOVDQArr_REV : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), 3305 "movdqa\t{$src, $dst|$dst, $src}", []>, 3306 Sched<[SchedWriteVecMoveLS.XMM.RR]>, 3307 VEX, VEX_WIG, FoldGenData<"VMOVDQArr">; 3308def VMOVDQAYrr_REV : VPDI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src), 3309 "movdqa\t{$src, $dst|$dst, $src}", []>, 3310 Sched<[SchedWriteVecMoveLS.YMM.RR]>, 3311 VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVDQAYrr">; 3312def VMOVDQUrr_REV : VSSI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), 3313 "movdqu\t{$src, $dst|$dst, $src}", []>, 3314 Sched<[SchedWriteVecMoveLS.XMM.RR]>, 3315 VEX, VEX_WIG, FoldGenData<"VMOVDQUrr">; 3316def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src), 3317 "movdqu\t{$src, $dst|$dst, $src}", []>, 3318 Sched<[SchedWriteVecMoveLS.YMM.RR]>, 3319 VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVDQUYrr">; 3320} 3321 
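// A minimal illustration (assuming the usual clang intrinsic lowering):
// _mm_load_si128 implies a 16-byte-aligned load and selects VMOVDQArm below,
// while the unaligned _mm_loadu_si128 selects VMOVDQUrm; the corresponding
// stores map to VMOVDQAmr and VMOVDQUmr in the same way.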
3322let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1, 3323 hasSideEffects = 0, Predicates = [HasAVX,NoVLX] in { 3324def VMOVDQArm : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), 3325 "movdqa\t{$src, $dst|$dst, $src}", 3326 [(set VR128:$dst, (alignedloadv2i64 addr:$src))]>, 3327 Sched<[SchedWriteVecMoveLS.XMM.RM]>, VEX, VEX_WIG; 3328def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), 3329 "movdqa\t{$src, $dst|$dst, $src}", []>, 3330 Sched<[SchedWriteVecMoveLS.YMM.RM]>, 3331 VEX, VEX_L, VEX_WIG; 3332def VMOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), 3333 "vmovdqu\t{$src, $dst|$dst, $src}", 3334 [(set VR128:$dst, (loadv2i64 addr:$src))]>, 3335 Sched<[SchedWriteVecMoveLS.XMM.RM]>, 3336 XS, VEX, VEX_WIG; 3337def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), 3338 "vmovdqu\t{$src, $dst|$dst, $src}", []>, 3339 Sched<[SchedWriteVecMoveLS.YMM.RM]>, 3340 XS, VEX, VEX_L, VEX_WIG; 3341} 3342 3343let mayStore = 1, hasSideEffects = 0, Predicates = [HasAVX,NoVLX] in { 3344def VMOVDQAmr : VPDI<0x7F, MRMDestMem, (outs), 3345 (ins i128mem:$dst, VR128:$src), 3346 "movdqa\t{$src, $dst|$dst, $src}", 3347 [(alignedstore (v2i64 VR128:$src), addr:$dst)]>, 3348 Sched<[SchedWriteVecMoveLS.XMM.MR]>, VEX, VEX_WIG; 3349def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs), 3350 (ins i256mem:$dst, VR256:$src), 3351 "movdqa\t{$src, $dst|$dst, $src}", []>, 3352 Sched<[SchedWriteVecMoveLS.YMM.MR]>, VEX, VEX_L, VEX_WIG; 3353def VMOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), 3354 "vmovdqu\t{$src, $dst|$dst, $src}", 3355 [(store (v2i64 VR128:$src), addr:$dst)]>, 3356 Sched<[SchedWriteVecMoveLS.XMM.MR]>, XS, VEX, VEX_WIG; 3357def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src), 3358 "vmovdqu\t{$src, $dst|$dst, $src}",[]>, 3359 Sched<[SchedWriteVecMoveLS.YMM.MR]>, XS, VEX, VEX_L, VEX_WIG; 3360} 3361 3362let SchedRW = [SchedWriteVecMoveLS.XMM.RR] in { 3363let hasSideEffects = 0 in { 3364def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 3365 "movdqa\t{$src, $dst|$dst, $src}", []>; 3366 3367def MOVDQUrr : I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 3368 "movdqu\t{$src, $dst|$dst, $src}", []>, 3369 XS, Requires<[UseSSE2]>; 3370} 3371 3372// For Disassembler 3373let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { 3374def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), 3375 "movdqa\t{$src, $dst|$dst, $src}", []>, 3376 FoldGenData<"MOVDQArr">; 3377 3378def MOVDQUrr_REV : I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), 3379 "movdqu\t{$src, $dst|$dst, $src}", []>, 3380 XS, Requires<[UseSSE2]>, FoldGenData<"MOVDQUrr">; 3381} 3382} // SchedRW 3383 3384let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1, 3385 hasSideEffects = 0, SchedRW = [SchedWriteVecMoveLS.XMM.RM] in { 3386def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), 3387 "movdqa\t{$src, $dst|$dst, $src}", 3388 [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/]>; 3389def MOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), 3390 "movdqu\t{$src, $dst|$dst, $src}", 3391 [/*(set VR128:$dst, (loadv2i64 addr:$src))*/]>, 3392 XS, Requires<[UseSSE2]>; 3393} 3394 3395let mayStore = 1, hasSideEffects = 0, 3396 SchedRW = [SchedWriteVecMoveLS.XMM.MR] in { 3397def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), 3398 "movdqa\t{$src, $dst|$dst, $src}", 3399 [/*(alignedstore (v2i64 VR128:$src), 
addr:$dst)*/]>; 3400def MOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), 3401 "movdqu\t{$src, $dst|$dst, $src}", 3402 [/*(store (v2i64 VR128:$src), addr:$dst)*/]>, 3403 XS, Requires<[UseSSE2]>; 3404} 3405 3406} // ExeDomain = SSEPackedInt 3407 3408// Reversed version with ".s" suffix for GAS compatibility. 3409def : InstAlias<"vmovdqa.s\t{$src, $dst|$dst, $src}", 3410 (VMOVDQArr_REV VR128:$dst, VR128:$src), 0>; 3411def : InstAlias<"vmovdqa.s\t{$src, $dst|$dst, $src}", 3412 (VMOVDQAYrr_REV VR256:$dst, VR256:$src), 0>; 3413def : InstAlias<"vmovdqu.s\t{$src, $dst|$dst, $src}", 3414 (VMOVDQUrr_REV VR128:$dst, VR128:$src), 0>; 3415def : InstAlias<"vmovdqu.s\t{$src, $dst|$dst, $src}", 3416 (VMOVDQUYrr_REV VR256:$dst, VR256:$src), 0>; 3417 3418// Reversed version with ".s" suffix for GAS compatibility. 3419def : InstAlias<"movdqa.s\t{$src, $dst|$dst, $src}", 3420 (MOVDQArr_REV VR128:$dst, VR128:$src), 0>; 3421def : InstAlias<"movdqu.s\t{$src, $dst|$dst, $src}", 3422 (MOVDQUrr_REV VR128:$dst, VR128:$src), 0>; 3423 3424let Predicates = [HasAVX, NoVLX] in { 3425 // Additional patterns for other integer sizes. 3426 def : Pat<(alignedloadv4i32 addr:$src), 3427 (VMOVDQArm addr:$src)>; 3428 def : Pat<(alignedloadv8i16 addr:$src), 3429 (VMOVDQArm addr:$src)>; 3430 def : Pat<(alignedloadv8f16 addr:$src), 3431 (VMOVDQArm addr:$src)>; 3432 def : Pat<(alignedloadv16i8 addr:$src), 3433 (VMOVDQArm addr:$src)>; 3434 def : Pat<(loadv4i32 addr:$src), 3435 (VMOVDQUrm addr:$src)>; 3436 def : Pat<(loadv8i16 addr:$src), 3437 (VMOVDQUrm addr:$src)>; 3438 def : Pat<(loadv8f16 addr:$src), 3439 (VMOVDQUrm addr:$src)>; 3440 def : Pat<(loadv16i8 addr:$src), 3441 (VMOVDQUrm addr:$src)>; 3442 3443 def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst), 3444 (VMOVDQAmr addr:$dst, VR128:$src)>; 3445 def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst), 3446 (VMOVDQAmr addr:$dst, VR128:$src)>; 3447 def : Pat<(alignedstore (v8f16 VR128:$src), addr:$dst), 3448 (VMOVDQAmr addr:$dst, VR128:$src)>; 3449 def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst), 3450 (VMOVDQAmr addr:$dst, VR128:$src)>; 3451 def : Pat<(store (v4i32 VR128:$src), addr:$dst), 3452 (VMOVDQUmr addr:$dst, VR128:$src)>; 3453 def : Pat<(store (v8i16 VR128:$src), addr:$dst), 3454 (VMOVDQUmr addr:$dst, VR128:$src)>; 3455 def : Pat<(store (v8f16 VR128:$src), addr:$dst), 3456 (VMOVDQUmr addr:$dst, VR128:$src)>; 3457 def : Pat<(store (v16i8 VR128:$src), addr:$dst), 3458 (VMOVDQUmr addr:$dst, VR128:$src)>; 3459} 3460 3461//===---------------------------------------------------------------------===// 3462// SSE2 - Packed Integer Arithmetic Instructions 3463//===---------------------------------------------------------------------===// 3464 3465let ExeDomain = SSEPackedInt in { // SSE integer instructions 3466 3467/// PDI_binop_rm2 - Simple SSE2 binary operator with different src and dst types 3468multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode, 3469 ValueType DstVT, ValueType SrcVT, RegisterClass RC, 3470 PatFrag memop_frag, X86MemOperand x86memop, 3471 X86FoldableSchedWrite sched, bit Is2Addr = 1> { 3472 let isCommutable = 1 in 3473 def rr : PDI<opc, MRMSrcReg, (outs RC:$dst), 3474 (ins RC:$src1, RC:$src2), 3475 !if(Is2Addr, 3476 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3477 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3478 [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>, 3479 Sched<[sched]>; 3480 def rm : PDI<opc, MRMSrcMem, (outs RC:$dst), 3481 (ins RC:$src1, 
x86memop:$src2), 3482 !if(Is2Addr, 3483 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3484 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3485 [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), 3486 (memop_frag addr:$src2))))]>, 3487 Sched<[sched.Folded, sched.ReadAfterFold]>; 3488} 3489} // ExeDomain = SSEPackedInt 3490 3491defm PADDB : PDI_binop_all<0xFC, "paddb", add, v16i8, v32i8, 3492 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3493defm PADDW : PDI_binop_all<0xFD, "paddw", add, v8i16, v16i16, 3494 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3495defm PADDD : PDI_binop_all<0xFE, "paddd", add, v4i32, v8i32, 3496 SchedWriteVecALU, 1, NoVLX>; 3497defm PADDQ : PDI_binop_all<0xD4, "paddq", add, v2i64, v4i64, 3498 SchedWriteVecALU, 1, NoVLX>; 3499defm PADDSB : PDI_binop_all<0xEC, "paddsb", saddsat, v16i8, v32i8, 3500 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3501defm PADDSW : PDI_binop_all<0xED, "paddsw", saddsat, v8i16, v16i16, 3502 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3503defm PADDUSB : PDI_binop_all<0xDC, "paddusb", uaddsat, v16i8, v32i8, 3504 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3505defm PADDUSW : PDI_binop_all<0xDD, "paddusw", uaddsat, v8i16, v16i16, 3506 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3507defm PMULLW : PDI_binop_all<0xD5, "pmullw", mul, v8i16, v16i16, 3508 SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>; 3509defm PMULHUW : PDI_binop_all<0xE4, "pmulhuw", mulhu, v8i16, v16i16, 3510 SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>; 3511defm PMULHW : PDI_binop_all<0xE5, "pmulhw", mulhs, v8i16, v16i16, 3512 SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>; 3513defm PSUBB : PDI_binop_all<0xF8, "psubb", sub, v16i8, v32i8, 3514 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>; 3515defm PSUBW : PDI_binop_all<0xF9, "psubw", sub, v8i16, v16i16, 3516 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>; 3517defm PSUBD : PDI_binop_all<0xFA, "psubd", sub, v4i32, v8i32, 3518 SchedWriteVecALU, 0, NoVLX>; 3519defm PSUBQ : PDI_binop_all<0xFB, "psubq", sub, v2i64, v4i64, 3520 SchedWriteVecALU, 0, NoVLX>; 3521defm PSUBSB : PDI_binop_all<0xE8, "psubsb", ssubsat, v16i8, v32i8, 3522 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>; 3523defm PSUBSW : PDI_binop_all<0xE9, "psubsw", ssubsat, v8i16, v16i16, 3524 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>; 3525defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", usubsat, v16i8, v32i8, 3526 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>; 3527defm PSUBUSW : PDI_binop_all<0xD9, "psubusw", usubsat, v8i16, v16i16, 3528 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>; 3529defm PMINUB : PDI_binop_all<0xDA, "pminub", umin, v16i8, v32i8, 3530 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3531defm PMINSW : PDI_binop_all<0xEA, "pminsw", smin, v8i16, v16i16, 3532 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3533defm PMAXUB : PDI_binop_all<0xDE, "pmaxub", umax, v16i8, v32i8, 3534 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3535defm PMAXSW : PDI_binop_all<0xEE, "pmaxsw", smax, v8i16, v16i16, 3536 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3537defm PAVGB : PDI_binop_all<0xE0, "pavgb", avgceilu, v16i8, v32i8, 3538 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3539defm PAVGW : PDI_binop_all<0xE3, "pavgw", avgceilu, v8i16, v16i16, 3540 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3541defm PMULUDQ : PDI_binop_all<0xF4, "pmuludq", X86pmuludq, v2i64, v4i64, 3542 SchedWriteVecIMul, 1, NoVLX>; 3543 3544let Predicates = [HasAVX, NoVLX_Or_NoBWI] in 3545defm VPMADDWD : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v4i32, v8i16, VR128, 3546 load, i128mem, SchedWriteVecIMul.XMM, 0>, 3547 VEX_4V, VEX_WIG; 3548 3549let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in 3550defm VPMADDWDY : PDI_binop_rm2<0xF5, "vpmaddwd", 
X86vpmaddwd, v8i32, v16i16, 3551 VR256, load, i256mem, SchedWriteVecIMul.YMM, 3552 0>, VEX_4V, VEX_L, VEX_WIG; 3553let Constraints = "$src1 = $dst" in 3554defm PMADDWD : PDI_binop_rm2<0xF5, "pmaddwd", X86vpmaddwd, v4i32, v8i16, VR128, 3555 memop, i128mem, SchedWriteVecIMul.XMM>; 3556 3557let Predicates = [HasAVX, NoVLX_Or_NoBWI] in 3558defm VPSADBW : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v2i64, v16i8, VR128, 3559 load, i128mem, SchedWritePSADBW.XMM, 0>, 3560 VEX_4V, VEX_WIG; 3561let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in 3562defm VPSADBWY : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v4i64, v32i8, VR256, 3563 load, i256mem, SchedWritePSADBW.YMM, 0>, 3564 VEX_4V, VEX_L, VEX_WIG; 3565let Constraints = "$src1 = $dst" in 3566defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128, 3567 memop, i128mem, SchedWritePSADBW.XMM>; 3568 3569//===---------------------------------------------------------------------===// 3570// SSE2 - Packed Integer Logical Instructions 3571//===---------------------------------------------------------------------===// 3572 3573multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm, 3574 string OpcodeStr, SDNode OpNode, 3575 SDNode OpNode2, RegisterClass RC, 3576 X86FoldableSchedWrite sched, 3577 X86FoldableSchedWrite schedImm, 3578 ValueType DstVT, ValueType SrcVT, 3579 PatFrag ld_frag, bit Is2Addr = 1> { 3580 // src2 is always 128-bit 3581 def rr : PDI<opc, MRMSrcReg, (outs RC:$dst), 3582 (ins RC:$src1, VR128:$src2), 3583 !if(Is2Addr, 3584 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3585 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3586 [(set RC:$dst, (DstVT (OpNode RC:$src1, (SrcVT VR128:$src2))))]>, 3587 Sched<[sched]>; 3588 def rm : PDI<opc, MRMSrcMem, (outs RC:$dst), 3589 (ins RC:$src1, i128mem:$src2), 3590 !if(Is2Addr, 3591 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3592 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3593 [(set RC:$dst, (DstVT (OpNode RC:$src1, 3594 (SrcVT (ld_frag addr:$src2)))))]>, 3595 Sched<[sched.Folded, sched.ReadAfterFold]>; 3596 def ri : PDIi8<opc2, ImmForm, (outs RC:$dst), 3597 (ins RC:$src1, u8imm:$src2), 3598 !if(Is2Addr, 3599 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3600 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3601 [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i8 timm:$src2))))]>, 3602 Sched<[schedImm]>; 3603} 3604 3605multiclass PDI_binop_rmi_all<bits<8> opc, bits<8> opc2, Format ImmForm, 3606 string OpcodeStr, SDNode OpNode, 3607 SDNode OpNode2, ValueType DstVT128, 3608 ValueType DstVT256, ValueType SrcVT, 3609 X86SchedWriteWidths sched, 3610 X86SchedWriteWidths schedImm, Predicate prd> { 3611let Predicates = [HasAVX, prd] in 3612 defm V#NAME : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr), 3613 OpNode, OpNode2, VR128, sched.XMM, schedImm.XMM, 3614 DstVT128, SrcVT, load, 0>, VEX_4V, VEX_WIG; 3615let Predicates = [HasAVX2, prd] in 3616 defm V#NAME#Y : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr), 3617 OpNode, OpNode2, VR256, sched.YMM, schedImm.YMM, 3618 DstVT256, SrcVT, load, 0>, VEX_4V, VEX_L, 3619 VEX_WIG; 3620let Constraints = "$src1 = $dst" in 3621 defm NAME : PDI_binop_rmi<opc, opc2, ImmForm, OpcodeStr, OpNode, OpNode2, 3622 VR128, sched.XMM, schedImm.XMM, DstVT128, SrcVT, 3623 memop>; 3624} 3625 3626multiclass PDI_binop_ri<bits<8> opc, Format ImmForm, string OpcodeStr, 3627 SDNode OpNode, RegisterClass RC, ValueType VT, 3628 X86FoldableSchedWrite sched, bit 
Is2Addr = 1> { 3629 def ri : PDIi8<opc, ImmForm, (outs RC:$dst), (ins RC:$src1, u8imm:$src2), 3630 !if(Is2Addr, 3631 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3632 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3633 [(set RC:$dst, (VT (OpNode RC:$src1, (i8 timm:$src2))))]>, 3634 Sched<[sched]>; 3635} 3636 3637multiclass PDI_binop_ri_all<bits<8> opc, Format ImmForm, string OpcodeStr, 3638 SDNode OpNode, X86SchedWriteWidths sched> { 3639let Predicates = [HasAVX, NoVLX_Or_NoBWI] in 3640 defm V#NAME : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode, 3641 VR128, v16i8, sched.XMM, 0>, VEX_4V, VEX_WIG; 3642let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in 3643 defm V#NAME#Y : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode, 3644 VR256, v32i8, sched.YMM, 0>, 3645 VEX_4V, VEX_L, VEX_WIG; 3646let Constraints = "$src1 = $dst" in 3647 defm NAME : PDI_binop_ri<opc, ImmForm, OpcodeStr, OpNode, VR128, v16i8, 3648 sched.XMM>; 3649} 3650 3651let ExeDomain = SSEPackedInt in { 3652 defm PSLLW : PDI_binop_rmi_all<0xF1, 0x71, MRM6r, "psllw", X86vshl, X86vshli, 3653 v8i16, v16i16, v8i16, SchedWriteVecShift, 3654 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>; 3655 defm PSLLD : PDI_binop_rmi_all<0xF2, 0x72, MRM6r, "pslld", X86vshl, X86vshli, 3656 v4i32, v8i32, v4i32, SchedWriteVecShift, 3657 SchedWriteVecShiftImm, NoVLX>; 3658 defm PSLLQ : PDI_binop_rmi_all<0xF3, 0x73, MRM6r, "psllq", X86vshl, X86vshli, 3659 v2i64, v4i64, v2i64, SchedWriteVecShift, 3660 SchedWriteVecShiftImm, NoVLX>; 3661 3662 defm PSRLW : PDI_binop_rmi_all<0xD1, 0x71, MRM2r, "psrlw", X86vsrl, X86vsrli, 3663 v8i16, v16i16, v8i16, SchedWriteVecShift, 3664 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>; 3665 defm PSRLD : PDI_binop_rmi_all<0xD2, 0x72, MRM2r, "psrld", X86vsrl, X86vsrli, 3666 v4i32, v8i32, v4i32, SchedWriteVecShift, 3667 SchedWriteVecShiftImm, NoVLX>; 3668 defm PSRLQ : PDI_binop_rmi_all<0xD3, 0x73, MRM2r, "psrlq", X86vsrl, X86vsrli, 3669 v2i64, v4i64, v2i64, SchedWriteVecShift, 3670 SchedWriteVecShiftImm, NoVLX>; 3671 3672 defm PSRAW : PDI_binop_rmi_all<0xE1, 0x71, MRM4r, "psraw", X86vsra, X86vsrai, 3673 v8i16, v16i16, v8i16, SchedWriteVecShift, 3674 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>; 3675 defm PSRAD : PDI_binop_rmi_all<0xE2, 0x72, MRM4r, "psrad", X86vsra, X86vsrai, 3676 v4i32, v8i32, v4i32, SchedWriteVecShift, 3677 SchedWriteVecShiftImm, NoVLX>; 3678 3679 defm PSLLDQ : PDI_binop_ri_all<0x73, MRM7r, "pslldq", X86vshldq, 3680 SchedWriteShuffle>; 3681 defm PSRLDQ : PDI_binop_ri_all<0x73, MRM3r, "psrldq", X86vshrdq, 3682 SchedWriteShuffle>; 3683} // ExeDomain = SSEPackedInt 3684 3685//===---------------------------------------------------------------------===// 3686// SSE2 - Packed Integer Comparison Instructions 3687//===---------------------------------------------------------------------===// 3688 3689defm PCMPEQB : PDI_binop_all<0x74, "pcmpeqb", X86pcmpeq, v16i8, v32i8, 3690 SchedWriteVecALU, 1, TruePredicate>; 3691defm PCMPEQW : PDI_binop_all<0x75, "pcmpeqw", X86pcmpeq, v8i16, v16i16, 3692 SchedWriteVecALU, 1, TruePredicate>; 3693defm PCMPEQD : PDI_binop_all<0x76, "pcmpeqd", X86pcmpeq, v4i32, v8i32, 3694 SchedWriteVecALU, 1, TruePredicate>; 3695defm PCMPGTB : PDI_binop_all<0x64, "pcmpgtb", X86pcmpgt, v16i8, v32i8, 3696 SchedWriteVecALU, 0, TruePredicate>; 3697defm PCMPGTW : PDI_binop_all<0x65, "pcmpgtw", X86pcmpgt, v8i16, v16i16, 3698 SchedWriteVecALU, 0, TruePredicate>; 3699defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32, 3700 SchedWriteVecALU, 0, TruePredicate>; 
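// Illustrative semantics of the comparisons selected above, for dword lanes:
//   pcmpeqd %xmm1, %xmm0  ; xmm0[i] = (xmm0[i] == xmm1[i]) ? 0xFFFFFFFF : 0
//   pcmpgtd %xmm1, %xmm0  ; xmm0[i] = (xmm0[i] >s xmm1[i]) ? 0xFFFFFFFF : 0
// The compares are signed; SSE2 has no unsigned pcmpgt variant.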
3701 3702//===---------------------------------------------------------------------===// 3703// SSE2 - Packed Integer Shuffle Instructions 3704//===---------------------------------------------------------------------===// 3705 3706let ExeDomain = SSEPackedInt in { 3707multiclass sse2_pshuffle<string OpcodeStr, ValueType vt128, ValueType vt256, 3708 SDNode OpNode, X86SchedWriteWidths sched, 3709 Predicate prd> { 3710let Predicates = [HasAVX, prd] in { 3711 def V#NAME#ri : Ii8<0x70, MRMSrcReg, (outs VR128:$dst), 3712 (ins VR128:$src1, u8imm:$src2), 3713 !strconcat("v", OpcodeStr, 3714 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 3715 [(set VR128:$dst, 3716 (vt128 (OpNode VR128:$src1, (i8 timm:$src2))))]>, 3717 VEX, Sched<[sched.XMM]>, VEX_WIG; 3718 def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst), 3719 (ins i128mem:$src1, u8imm:$src2), 3720 !strconcat("v", OpcodeStr, 3721 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 3722 [(set VR128:$dst, 3723 (vt128 (OpNode (load addr:$src1), 3724 (i8 timm:$src2))))]>, VEX, 3725 Sched<[sched.XMM.Folded]>, VEX_WIG; 3726} 3727 3728let Predicates = [HasAVX2, prd] in { 3729 def V#NAME#Yri : Ii8<0x70, MRMSrcReg, (outs VR256:$dst), 3730 (ins VR256:$src1, u8imm:$src2), 3731 !strconcat("v", OpcodeStr, 3732 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 3733 [(set VR256:$dst, 3734 (vt256 (OpNode VR256:$src1, (i8 timm:$src2))))]>, 3735 VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG; 3736 def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst), 3737 (ins i256mem:$src1, u8imm:$src2), 3738 !strconcat("v", OpcodeStr, 3739 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 3740 [(set VR256:$dst, 3741 (vt256 (OpNode (load addr:$src1), 3742 (i8 timm:$src2))))]>, VEX, VEX_L, 3743 Sched<[sched.YMM.Folded]>, VEX_WIG; 3744} 3745 3746let Predicates = [UseSSE2] in { 3747 def ri : Ii8<0x70, MRMSrcReg, 3748 (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2), 3749 !strconcat(OpcodeStr, 3750 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 3751 [(set VR128:$dst, 3752 (vt128 (OpNode VR128:$src1, (i8 timm:$src2))))]>, 3753 Sched<[sched.XMM]>; 3754 def mi : Ii8<0x70, MRMSrcMem, 3755 (outs VR128:$dst), (ins i128mem:$src1, u8imm:$src2), 3756 !strconcat(OpcodeStr, 3757 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 3758 [(set VR128:$dst, 3759 (vt128 (OpNode (memop addr:$src1), 3760 (i8 timm:$src2))))]>, 3761 Sched<[sched.XMM.Folded]>; 3762} 3763} 3764} // ExeDomain = SSEPackedInt 3765 3766defm PSHUFD : sse2_pshuffle<"pshufd", v4i32, v8i32, X86PShufd, 3767 SchedWriteShuffle, NoVLX>, PD; 3768defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw, 3769 SchedWriteShuffle, NoVLX_Or_NoBWI>, XS; 3770defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw, 3771 SchedWriteShuffle, NoVLX_Or_NoBWI>, XD; 3772 3773//===---------------------------------------------------------------------===// 3774// Packed Integer Pack Instructions (SSE & AVX) 3775//===---------------------------------------------------------------------===// 3776 3777let ExeDomain = SSEPackedInt in { 3778multiclass sse2_pack<bits<8> opc, string OpcodeStr, ValueType OutVT, 3779 ValueType ArgVT, SDNode OpNode, RegisterClass RC, 3780 X86MemOperand x86memop, X86FoldableSchedWrite sched, 3781 PatFrag ld_frag, bit Is2Addr = 1> { 3782 def rr : PDI<opc, MRMSrcReg, 3783 (outs RC:$dst), (ins RC:$src1, RC:$src2), 3784 !if(Is2Addr, 3785 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3786 !strconcat(OpcodeStr, 3787 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3788 [(set RC:$dst, 3789 (OutVT (OpNode (ArgVT RC:$src1), 
RC:$src2)))]>, 3790 Sched<[sched]>; 3791 def rm : PDI<opc, MRMSrcMem, 3792 (outs RC:$dst), (ins RC:$src1, x86memop:$src2), 3793 !if(Is2Addr, 3794 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3795 !strconcat(OpcodeStr, 3796 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3797 [(set RC:$dst, 3798 (OutVT (OpNode (ArgVT RC:$src1), 3799 (ld_frag addr:$src2))))]>, 3800 Sched<[sched.Folded, sched.ReadAfterFold]>; 3801} 3802 3803multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT, 3804 ValueType ArgVT, SDNode OpNode, RegisterClass RC, 3805 X86MemOperand x86memop, X86FoldableSchedWrite sched, 3806 PatFrag ld_frag, bit Is2Addr = 1> { 3807 def rr : SS48I<opc, MRMSrcReg, 3808 (outs RC:$dst), (ins RC:$src1, RC:$src2), 3809 !if(Is2Addr, 3810 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3811 !strconcat(OpcodeStr, 3812 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3813 [(set RC:$dst, 3814 (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))]>, 3815 Sched<[sched]>; 3816 def rm : SS48I<opc, MRMSrcMem, 3817 (outs RC:$dst), (ins RC:$src1, x86memop:$src2), 3818 !if(Is2Addr, 3819 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3820 !strconcat(OpcodeStr, 3821 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3822 [(set RC:$dst, 3823 (OutVT (OpNode (ArgVT RC:$src1), 3824 (ld_frag addr:$src2))))]>, 3825 Sched<[sched.Folded, sched.ReadAfterFold]>; 3826} 3827 3828let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { 3829 defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss, VR128, 3830 i128mem, SchedWriteShuffle.XMM, load, 0>, 3831 VEX_4V, VEX_WIG; 3832 defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss, VR128, 3833 i128mem, SchedWriteShuffle.XMM, load, 0>, 3834 VEX_4V, VEX_WIG; 3835 3836 defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus, VR128, 3837 i128mem, SchedWriteShuffle.XMM, load, 0>, 3838 VEX_4V, VEX_WIG; 3839 defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus, VR128, 3840 i128mem, SchedWriteShuffle.XMM, load, 0>, 3841 VEX_4V, VEX_WIG; 3842} 3843 3844let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { 3845 defm VPACKSSWBY : sse2_pack<0x63, "vpacksswb", v32i8, v16i16, X86Packss, VR256, 3846 i256mem, SchedWriteShuffle.YMM, load, 0>, 3847 VEX_4V, VEX_L, VEX_WIG; 3848 defm VPACKSSDWY : sse2_pack<0x6B, "vpackssdw", v16i16, v8i32, X86Packss, VR256, 3849 i256mem, SchedWriteShuffle.YMM, load, 0>, 3850 VEX_4V, VEX_L, VEX_WIG; 3851 3852 defm VPACKUSWBY : sse2_pack<0x67, "vpackuswb", v32i8, v16i16, X86Packus, VR256, 3853 i256mem, SchedWriteShuffle.YMM, load, 0>, 3854 VEX_4V, VEX_L, VEX_WIG; 3855 defm VPACKUSDWY : sse4_pack<0x2B, "vpackusdw", v16i16, v8i32, X86Packus, VR256, 3856 i256mem, SchedWriteShuffle.YMM, load, 0>, 3857 VEX_4V, VEX_L, VEX_WIG; 3858} 3859 3860let Constraints = "$src1 = $dst" in { 3861 defm PACKSSWB : sse2_pack<0x63, "packsswb", v16i8, v8i16, X86Packss, VR128, 3862 i128mem, SchedWriteShuffle.XMM, memop>; 3863 defm PACKSSDW : sse2_pack<0x6B, "packssdw", v8i16, v4i32, X86Packss, VR128, 3864 i128mem, SchedWriteShuffle.XMM, memop>; 3865 3866 defm PACKUSWB : sse2_pack<0x67, "packuswb", v16i8, v8i16, X86Packus, VR128, 3867 i128mem, SchedWriteShuffle.XMM, memop>; 3868 3869 defm PACKUSDW : sse4_pack<0x2B, "packusdw", v8i16, v4i32, X86Packus, VR128, 3870 i128mem, SchedWriteShuffle.XMM, memop>; 3871} 3872} // ExeDomain = SSEPackedInt 3873 3874//===---------------------------------------------------------------------===// 3875// SSE2 - Packed Integer Unpack Instructions 
3876//===---------------------------------------------------------------------===// 3877 3878let ExeDomain = SSEPackedInt in { 3879multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt, 3880 SDNode OpNode, RegisterClass RC, X86MemOperand x86memop, 3881 X86FoldableSchedWrite sched, PatFrag ld_frag, 3882 bit Is2Addr = 1> { 3883 def rr : PDI<opc, MRMSrcReg, 3884 (outs RC:$dst), (ins RC:$src1, RC:$src2), 3885 !if(Is2Addr, 3886 !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"), 3887 !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3888 [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>, 3889 Sched<[sched]>; 3890 def rm : PDI<opc, MRMSrcMem, 3891 (outs RC:$dst), (ins RC:$src1, x86memop:$src2), 3892 !if(Is2Addr, 3893 !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"), 3894 !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3895 [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>, 3896 Sched<[sched.Folded, sched.ReadAfterFold]>; 3897} 3898 3899let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { 3900 defm VPUNPCKLBW : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl, VR128, 3901 i128mem, SchedWriteShuffle.XMM, load, 0>, 3902 VEX_4V, VEX_WIG; 3903 defm VPUNPCKLWD : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl, VR128, 3904 i128mem, SchedWriteShuffle.XMM, load, 0>, 3905 VEX_4V, VEX_WIG; 3906 defm VPUNPCKHBW : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh, VR128, 3907 i128mem, SchedWriteShuffle.XMM, load, 0>, 3908 VEX_4V, VEX_WIG; 3909 defm VPUNPCKHWD : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh, VR128, 3910 i128mem, SchedWriteShuffle.XMM, load, 0>, 3911 VEX_4V, VEX_WIG; 3912} 3913 3914let Predicates = [HasAVX, NoVLX] in { 3915 defm VPUNPCKLDQ : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl, VR128, 3916 i128mem, SchedWriteShuffle.XMM, load, 0>, 3917 VEX_4V, VEX_WIG; 3918 defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl, VR128, 3919 i128mem, SchedWriteShuffle.XMM, load, 0>, 3920 VEX_4V, VEX_WIG; 3921 defm VPUNPCKHDQ : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh, VR128, 3922 i128mem, SchedWriteShuffle.XMM, load, 0>, 3923 VEX_4V, VEX_WIG; 3924 defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh, VR128, 3925 i128mem, SchedWriteShuffle.XMM, load, 0>, 3926 VEX_4V, VEX_WIG; 3927} 3928 3929let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { 3930 defm VPUNPCKLBWY : sse2_unpack<0x60, "vpunpcklbw", v32i8, X86Unpckl, VR256, 3931 i256mem, SchedWriteShuffle.YMM, load, 0>, 3932 VEX_4V, VEX_L, VEX_WIG; 3933 defm VPUNPCKLWDY : sse2_unpack<0x61, "vpunpcklwd", v16i16, X86Unpckl, VR256, 3934 i256mem, SchedWriteShuffle.YMM, load, 0>, 3935 VEX_4V, VEX_L, VEX_WIG; 3936 defm VPUNPCKHBWY : sse2_unpack<0x68, "vpunpckhbw", v32i8, X86Unpckh, VR256, 3937 i256mem, SchedWriteShuffle.YMM, load, 0>, 3938 VEX_4V, VEX_L, VEX_WIG; 3939 defm VPUNPCKHWDY : sse2_unpack<0x69, "vpunpckhwd", v16i16, X86Unpckh, VR256, 3940 i256mem, SchedWriteShuffle.YMM, load, 0>, 3941 VEX_4V, VEX_L, VEX_WIG; 3942} 3943 3944let Predicates = [HasAVX2, NoVLX] in { 3945 defm VPUNPCKLDQY : sse2_unpack<0x62, "vpunpckldq", v8i32, X86Unpckl, VR256, 3946 i256mem, SchedWriteShuffle.YMM, load, 0>, 3947 VEX_4V, VEX_L, VEX_WIG; 3948 defm VPUNPCKLQDQY : sse2_unpack<0x6C, "vpunpcklqdq", v4i64, X86Unpckl, VR256, 3949 i256mem, SchedWriteShuffle.YMM, load, 0>, 3950 VEX_4V, VEX_L, VEX_WIG; 3951 defm VPUNPCKHDQY : sse2_unpack<0x6A, "vpunpckhdq", v8i32, X86Unpckh, VR256, 3952 i256mem, SchedWriteShuffle.YMM, load, 0>, 3953 VEX_4V, VEX_L, VEX_WIG; 3954 defm VPUNPCKHQDQY : 
let Constraints = "$src1 = $dst" in {
  defm PUNPCKLBW  : sse2_unpack<0x60, "punpcklbw", v16i8, X86Unpckl, VR128,
                                i128mem, SchedWriteShuffle.XMM, memop>;
  defm PUNPCKLWD  : sse2_unpack<0x61, "punpcklwd", v8i16, X86Unpckl, VR128,
                                i128mem, SchedWriteShuffle.XMM, memop>;
  defm PUNPCKLDQ  : sse2_unpack<0x62, "punpckldq", v4i32, X86Unpckl, VR128,
                                i128mem, SchedWriteShuffle.XMM, memop>;
  defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Unpckl, VR128,
                                i128mem, SchedWriteShuffle.XMM, memop>;

  defm PUNPCKHBW  : sse2_unpack<0x68, "punpckhbw", v16i8, X86Unpckh, VR128,
                                i128mem, SchedWriteShuffle.XMM, memop>;
  defm PUNPCKHWD  : sse2_unpack<0x69, "punpckhwd", v8i16, X86Unpckh, VR128,
                                i128mem, SchedWriteShuffle.XMM, memop>;
  defm PUNPCKHDQ  : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Unpckh, VR128,
                                i128mem, SchedWriteShuffle.XMM, memop>;
  defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Unpckh, VR128,
                                i128mem, SchedWriteShuffle.XMM, memop>;
}
} // ExeDomain = SSEPackedInt

//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Extract and Insert
//===---------------------------------------------------------------------===//

let ExeDomain = SSEPackedInt in {
multiclass sse2_pinsrw<bit Is2Addr = 1> {
  def rr : Ii8<0xC4, MRMSrcReg,
      (outs VR128:$dst), (ins VR128:$src1, GR32orGR64:$src2, u8imm:$src3),
      !if(Is2Addr,
          "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
          "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
      [(set VR128:$dst,
        (X86pinsrw VR128:$src1, GR32orGR64:$src2, timm:$src3))]>,
      Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
  def rm : Ii8<0xC4, MRMSrcMem,
      (outs VR128:$dst), (ins VR128:$src1, i16mem:$src2, u8imm:$src3),
      !if(Is2Addr,
          "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
          "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
      [(set VR128:$dst,
        (X86pinsrw VR128:$src1, (extloadi16 addr:$src2), timm:$src3))]>,
      Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
}

// Extract
let Predicates = [HasAVX, NoBWI] in
def VPEXTRWrr : Ii8<0xC5, MRMSrcReg,
                    (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
                    "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
                                                      timm:$src2))]>,
                PD, VEX, VEX_WIG, Sched<[WriteVecExtract]>;
def PEXTRWrr : PDIi8<0xC5, MRMSrcReg,
                     (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
                     "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
                                                       timm:$src2))]>,
               Sched<[WriteVecExtract]>;

// Insert
let Predicates = [HasAVX, NoBWI] in
defm VPINSRW : sse2_pinsrw<0>, PD, VEX_4V, VEX_WIG;

let Predicates = [UseSSE2], Constraints = "$src1 = $dst" in
defm PINSRW : sse2_pinsrw, PD;

} // ExeDomain = SSEPackedInt
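// Informal illustration (editorial note, not normative): "pinsrw $2, %eax,
// %xmm0" replaces word element 2 of xmm0 with the low 16 bits of eax, while
// "pextrw $2, %xmm0, %eax" zero-extends word element 2 of xmm0 into eax.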
// Always select FP16 instructions if available.
let Predicates = [UseSSE2], AddedComplexity = -10 in {
  def : Pat<(f16 (load addr:$src)),
            (COPY_TO_REGCLASS (PINSRWrm (v8i16 (IMPLICIT_DEF)), addr:$src, 0),
                              FR16)>;
  def : Pat<(store f16:$src, addr:$dst),
            (MOV16mr addr:$dst,
                     (EXTRACT_SUBREG
                       (PEXTRWrr (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0),
                       sub_16bit))>;
  def : Pat<(i16 (bitconvert f16:$src)),
            (EXTRACT_SUBREG
              (PEXTRWrr (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0),
              sub_16bit)>;
  def : Pat<(f16 (bitconvert i16:$src)),
            (COPY_TO_REGCLASS
              (PINSRWrr (v8i16 (IMPLICIT_DEF)),
                        (INSERT_SUBREG (IMPLICIT_DEF), GR16:$src, sub_16bit), 0),
              FR16)>;
}

let Predicates = [HasAVX, NoBWI] in {
  def : Pat<(f16 (load addr:$src)),
            (COPY_TO_REGCLASS (VPINSRWrm (v8i16 (IMPLICIT_DEF)), addr:$src, 0),
                              FR16)>;
  def : Pat<(i16 (bitconvert f16:$src)),
            (EXTRACT_SUBREG
              (VPEXTRWrr (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0),
              sub_16bit)>;
  def : Pat<(f16 (bitconvert i16:$src)),
            (COPY_TO_REGCLASS
              (VPINSRWrr (v8i16 (IMPLICIT_DEF)),
                         (INSERT_SUBREG (IMPLICIT_DEF), GR16:$src, sub_16bit), 0),
              FR16)>;
}

//===---------------------------------------------------------------------===//
// SSE2 - Packed Mask Creation
//===---------------------------------------------------------------------===//

let ExeDomain = SSEPackedInt in {

def VPMOVMSKBrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
                       (ins VR128:$src),
                       "pmovmskb\t{$src, $dst|$dst, $src}",
                       [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))]>,
                  Sched<[WriteVecMOVMSK]>, VEX, VEX_WIG;

let Predicates = [HasAVX2] in {
def VPMOVMSKBYrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
                        (ins VR256:$src),
                        "pmovmskb\t{$src, $dst|$dst, $src}",
                        [(set GR32orGR64:$dst, (X86movmsk (v32i8 VR256:$src)))]>,
                   Sched<[WriteVecMOVMSKY]>, VEX, VEX_L, VEX_WIG;
}

def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src),
                     "pmovmskb\t{$src, $dst|$dst, $src}",
                     [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))]>,
                 Sched<[WriteVecMOVMSK]>;

} // ExeDomain = SSEPackedInt
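// Informal illustration (editorial note, not normative): "pmovmskb %xmm0,
// %eax" gathers the most significant bit of each of the 16 bytes of xmm0
// into bits 15:0 of eax and clears the remaining destination bits; the YMM
// form yields a 32-bit mask.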
//===---------------------------------------------------------------------===//
// SSE2 - Conditional Store
//===---------------------------------------------------------------------===//

let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecMoveLS.XMM.MR] in {
// As VEX does not have separate instruction contexts for address size
// overrides, VMASKMOVDQU and VMASKMOVDQU64 would have a decode conflict.
// Prefer VMASKMOVDQU64.
let Uses = [EDI], Predicates = [HasAVX], isAsmParserOnly = 1 in
def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs),
                       (ins VR128:$src, VR128:$mask),
                       "maskmovdqu\t{$mask, $src|$src, $mask}",
                       [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>,
                  VEX, VEX_WIG;
let Uses = [RDI], Predicates = [HasAVX,In64BitMode] in
def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs),
                         (ins VR128:$src, VR128:$mask),
                         "maskmovdqu\t{$mask, $src|$src, $mask}",
                         [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>,
                    VEX, VEX_WIG;

let Uses = [EDI], Predicates = [UseSSE2] in
def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
                     "maskmovdqu\t{$mask, $src|$src, $mask}",
                     [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>;
let Uses = [RDI], Predicates = [UseSSE2,In64BitMode] in
def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
                       "maskmovdqu\t{$mask, $src|$src, $mask}",
                       [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>;

} // ExeDomain = SSEPackedInt
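// Informal illustration (editorial note, not normative): "maskmovdqu %xmm1,
// %xmm0" stores each byte of xmm0 whose corresponding mask byte in xmm1 has
// its most significant bit set, to the address held in EDI (RDI for the
// 64-bit-address forms above).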
//===---------------------------------------------------------------------===//
// SSE2 - Move Doubleword/Quadword
//===---------------------------------------------------------------------===//

//===---------------------------------------------------------------------===//
// Move Int Doubleword to Packed Double Int
//
let ExeDomain = SSEPackedInt in {
def VMOVDI2PDIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (v4i32 (scalar_to_vector GR32:$src)))]>,
                   VEX, Sched<[WriteVecMoveFromGpr]>;
def VMOVDI2PDIrm : VS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>,
                   VEX, Sched<[WriteVecLoad]>;
def VMOV64toPQIrr : VRS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
                          "movq\t{$src, $dst|$dst, $src}",
                          [(set VR128:$dst,
                            (v2i64 (scalar_to_vector GR64:$src)))]>,
                    VEX, Sched<[WriteVecMoveFromGpr]>;
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
def VMOV64toPQIrm : VRS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                          "movq\t{$src, $dst|$dst, $src}", []>,
                    VEX, Sched<[WriteVecLoad]>;
let isCodeGenOnly = 1 in
def VMOV64toSDrr : VRS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
                         "movq\t{$src, $dst|$dst, $src}",
                         [(set FR64:$dst, (bitconvert GR64:$src))]>,
                   VEX, Sched<[WriteVecMoveFromGpr]>;

def MOVDI2PDIrr : S2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (scalar_to_vector GR32:$src)))]>,
                  Sched<[WriteVecMoveFromGpr]>;
def MOVDI2PDIrm : S2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>,
                  Sched<[WriteVecLoad]>;
def MOV64toPQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
                        "movq\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (v2i64 (scalar_to_vector GR64:$src)))]>,
                   Sched<[WriteVecMoveFromGpr]>;
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
def MOV64toPQIrm : RS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                        "movq\t{$src, $dst|$dst, $src}", []>,
                   Sched<[WriteVecLoad]>;
let isCodeGenOnly = 1 in
def MOV64toSDrr : RS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
                       "movq\t{$src, $dst|$dst, $src}",
                       [(set FR64:$dst, (bitconvert GR64:$src))]>,
                  Sched<[WriteVecMoveFromGpr]>;
} // ExeDomain = SSEPackedInt

//===---------------------------------------------------------------------===//
// Move Int Doubleword to Single Scalar
//
let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
  def VMOVDI2SSrr : VS2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
                         "movd\t{$src, $dst|$dst, $src}",
                         [(set FR32:$dst, (bitconvert GR32:$src))]>,
                    VEX, Sched<[WriteVecMoveFromGpr]>;

  def MOVDI2SSrr : S2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(set FR32:$dst, (bitconvert GR32:$src))]>,
                   Sched<[WriteVecMoveFromGpr]>;

} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1

//===---------------------------------------------------------------------===//
// Move Packed Doubleword Int to Packed Double Int
//
let ExeDomain = SSEPackedInt in {
def VMOVPDI2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set GR32:$dst, (extractelt (v4i32 VR128:$src),
                                                     (iPTR 0)))]>, VEX,
                   Sched<[WriteVecMoveToGpr]>;
def VMOVPDI2DImr : VS2I<0x7E, MRMDestMem, (outs),
                        (ins i32mem:$dst, VR128:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(store (i32 (extractelt (v4i32 VR128:$src),
                                                 (iPTR 0))), addr:$dst)]>,
                   VEX, Sched<[WriteVecStore]>;
def MOVPDI2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(set GR32:$dst, (extractelt (v4i32 VR128:$src),
                                                   (iPTR 0)))]>,
                  Sched<[WriteVecMoveToGpr]>;
def MOVPDI2DImr : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(store (i32 (extractelt (v4i32 VR128:$src),
                                               (iPTR 0))), addr:$dst)]>,
                  Sched<[WriteVecStore]>;
} // ExeDomain = SSEPackedInt

//===---------------------------------------------------------------------===//
// Move Packed Doubleword Int first element to Doubleword Int
//
let ExeDomain = SSEPackedInt in {
let SchedRW = [WriteVecMoveToGpr] in {
def VMOVPQIto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
                          "movq\t{$src, $dst|$dst, $src}",
                          [(set GR64:$dst, (extractelt (v2i64 VR128:$src),
                                                       (iPTR 0)))]>,
                    VEX;

def MOVPQIto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
                        "movq\t{$src, $dst|$dst, $src}",
                        [(set GR64:$dst, (extractelt (v2i64 VR128:$src),
                                                     (iPTR 0)))]>;
} //SchedRW

let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
def VMOVPQIto64mr : VRS2I<0x7E, MRMDestMem, (outs),
                          (ins i64mem:$dst, VR128:$src),
                          "movq\t{$src, $dst|$dst, $src}", []>,
                    VEX, Sched<[WriteVecStore]>;
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
def MOVPQIto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
                        "movq\t{$src, $dst|$dst, $src}", []>,
                   Sched<[WriteVecStore]>;
} // ExeDomain = SSEPackedInt
//===---------------------------------------------------------------------===//
// Bitcast FR64 <-> GR64
//
let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
  def VMOVSDto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
                           "movq\t{$src, $dst|$dst, $src}",
                           [(set GR64:$dst, (bitconvert FR64:$src))]>,
                     VEX, Sched<[WriteVecMoveToGpr]>;

  def MOVSDto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
                         "movq\t{$src, $dst|$dst, $src}",
                         [(set GR64:$dst, (bitconvert FR64:$src))]>,
                    Sched<[WriteVecMoveToGpr]>;
} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1

//===---------------------------------------------------------------------===//
// Move Scalar Single to Double Int
//
let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
  def VMOVSS2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
                         "movd\t{$src, $dst|$dst, $src}",
                         [(set GR32:$dst, (bitconvert FR32:$src))]>,
                    VEX, Sched<[WriteVecMoveToGpr]>;
  def MOVSS2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(set GR32:$dst, (bitconvert FR32:$src))]>,
                   Sched<[WriteVecMoveToGpr]>;
} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1

let Predicates = [UseAVX] in {
  def : Pat<(v4i32 (scalar_to_vector (i32 (anyext GR8:$src)))),
            (VMOVDI2PDIrr (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
                                              GR8:$src, sub_8bit)))>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
            (VMOVDI2PDIrr GR32:$src)>;

  def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
            (VMOV64toPQIrr GR64:$src)>;

  // AVX 128-bit movd/movq instructions write zeros in the high 128-bit part.
  // These instructions also write zeros in the high part of a 256-bit register.
  def : Pat<(v4i32 (X86vzload32 addr:$src)),
            (VMOVDI2PDIrm addr:$src)>;
  def : Pat<(v8i32 (X86vzload32 addr:$src)),
            (SUBREG_TO_REG (i64 0), (v4i32 (VMOVDI2PDIrm addr:$src)), sub_xmm)>;
}

let Predicates = [UseSSE2] in {
  def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
            (MOVDI2PDIrr GR32:$src)>;

  def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
            (MOV64toPQIrr GR64:$src)>;
  def : Pat<(v4i32 (X86vzload32 addr:$src)),
            (MOVDI2PDIrm addr:$src)>;
}

// Before the MC layer of LLVM existed, clang emitted "movd" assembly instead
// of "movq" due to a MacOS parsing limitation. To be able to parse old
// assembly, we add these aliases.
def : InstAlias<"movd\t{$src, $dst|$dst, $src}",
                (MOV64toPQIrr VR128:$dst, GR64:$src), 0>;
def : InstAlias<"movd\t{$src, $dst|$dst, $src}",
                (MOVPQIto64rr GR64:$dst, VR128:$src), 0>;
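// For example, "movd %rax, %xmm0" (note the 64-bit GPR) is accepted via the
// alias above and is assembled as MOV64toPQIrr, which prints back as "movq".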
// Allow "vmovd" but print "vmovq" since we don't need compatibility for AVX.
def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
                (VMOV64toPQIrr VR128:$dst, GR64:$src), 0>;
def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
                (VMOVPQIto64rr GR64:$dst, VR128:$src), 0>;

//===---------------------------------------------------------------------===//
// SSE2 - Move Quadword
//===---------------------------------------------------------------------===//

//===---------------------------------------------------------------------===//
// Move Quadword Int to Packed Quadword Int
//

let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLoad] in {
def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                     "vmovq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
                       (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS,
                   VEX, Requires<[UseAVX]>, VEX_WIG;
def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                    "movq\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst,
                      (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>,
                  XS, Requires<[UseSSE2]>; // SSE2 instruction with XS Prefix
} // ExeDomain, SchedRW

//===---------------------------------------------------------------------===//
// Move Packed Quadword Int to Quadword Int
//
let ExeDomain = SSEPackedInt, SchedRW = [WriteVecStore] in {
def VMOVPQI2QImr : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
                        "movq\t{$src, $dst|$dst, $src}",
                        [(store (i64 (extractelt (v2i64 VR128:$src),
                                                 (iPTR 0))), addr:$dst)]>,
                   VEX, VEX_WIG;
def MOVPQI2QImr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
                      "movq\t{$src, $dst|$dst, $src}",
                      [(store (i64 (extractelt (v2i64 VR128:$src),
                                               (iPTR 0))), addr:$dst)]>;
} // ExeDomain, SchedRW

// For disassembler only
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
    SchedRW = [SchedWriteVecLogic.XMM] in {
def VMOVPQI2QIrr : VS2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                        "movq\t{$src, $dst|$dst, $src}", []>, VEX, VEX_WIG;
def MOVPQI2QIrr : S2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                      "movq\t{$src, $dst|$dst, $src}", []>;
}

def : InstAlias<"vmovq.s\t{$src, $dst|$dst, $src}",
                (VMOVPQI2QIrr VR128:$dst, VR128:$src), 0>;
def : InstAlias<"movq.s\t{$src, $dst|$dst, $src}",
                (MOVPQI2QIrr VR128:$dst, VR128:$src), 0>;

let Predicates = [UseAVX] in {
  def : Pat<(v2i64 (X86vzload64 addr:$src)),
            (VMOVQI2PQIrm addr:$src)>;
  def : Pat<(v4i64 (X86vzload64 addr:$src)),
            (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIrm addr:$src)), sub_xmm)>;

  def : Pat<(X86vextractstore64 (v2i64 VR128:$src), addr:$dst),
            (VMOVPQI2QImr addr:$dst, VR128:$src)>;
}

let Predicates = [UseSSE2] in {
  def : Pat<(v2i64 (X86vzload64 addr:$src)), (MOVQI2PQIrm addr:$src)>;

  def : Pat<(X86vextractstore64 (v2i64 VR128:$src), addr:$dst),
            (MOVPQI2QImr addr:$dst, VR128:$src)>;
}
//===---------------------------------------------------------------------===//
// Moving from XMM to XMM and clearing the upper 64 bits. Note that there is a
// bug in the IA-32 documentation: movq xmm1, xmm2 does clear the high bits.
//
let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecLogic.XMM] in {
def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                         "vmovq\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
                       XS, VEX, Requires<[UseAVX]>, VEX_WIG;
def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "movq\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
                      XS, Requires<[UseSSE2]>;
} // ExeDomain, SchedRW

let Predicates = [UseAVX] in {
  def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
            (VMOVZPQILo2PQIrr VR128:$src)>;
}
let Predicates = [UseSSE2] in {
  def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
            (MOVZPQILo2PQIrr VR128:$src)>;
}

let Predicates = [UseAVX] in {
  def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (v2f64 (VMOVZPQILo2PQIrr
                     (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)))),
             sub_xmm)>;
  def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (v2i64 (VMOVZPQILo2PQIrr
                     (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)))),
             sub_xmm)>;
}

//===---------------------------------------------------------------------===//
// SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP
//===---------------------------------------------------------------------===//

multiclass sse3_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr,
                              ValueType vt, RegisterClass RC, PatFrag mem_frag,
                              X86MemOperand x86memop, X86FoldableSchedWrite sched> {
def rr : S3SI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
              !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
              [(set RC:$dst, (vt (OpNode RC:$src)))]>,
         Sched<[sched]>;
def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
              !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
              [(set RC:$dst, (OpNode (mem_frag addr:$src)))]>,
         Sched<[sched.Folded]>;
}

let Predicates = [HasAVX, NoVLX] in {
  defm VMOVSHDUP  : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
                                       v4f32, VR128, loadv4f32, f128mem,
                                       SchedWriteFShuffle.XMM>, VEX, VEX_WIG;
  defm VMOVSLDUP  : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
                                       v4f32, VR128, loadv4f32, f128mem,
                                       SchedWriteFShuffle.XMM>, VEX, VEX_WIG;
  defm VMOVSHDUPY : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
                                       v8f32, VR256, loadv8f32, f256mem,
                                       SchedWriteFShuffle.YMM>, VEX, VEX_L, VEX_WIG;
  defm VMOVSLDUPY : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
                                       v8f32, VR256, loadv8f32, f256mem,
                                       SchedWriteFShuffle.YMM>, VEX, VEX_L, VEX_WIG;
}
defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128,
                                   memopv4f32, f128mem, SchedWriteFShuffle.XMM>;
defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128,
                                   memopv4f32, f128mem, SchedWriteFShuffle.XMM>;
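// Informal illustration (editorial note, not normative): movshdup duplicates
// the odd-indexed single-precision elements (dst = [s1,s1,s3,s3]) and
// movsldup duplicates the even-indexed ones (dst = [s0,s0,s2,s2]).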
let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v4i32 (X86Movshdup VR128:$src)),
            (VMOVSHDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movshdup (load addr:$src))),
            (VMOVSHDUPrm addr:$src)>;
  def : Pat<(v4i32 (X86Movsldup VR128:$src)),
            (VMOVSLDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movsldup (load addr:$src))),
            (VMOVSLDUPrm addr:$src)>;
  def : Pat<(v8i32 (X86Movshdup VR256:$src)),
            (VMOVSHDUPYrr VR256:$src)>;
  def : Pat<(v8i32 (X86Movshdup (load addr:$src))),
            (VMOVSHDUPYrm addr:$src)>;
  def : Pat<(v8i32 (X86Movsldup VR256:$src)),
            (VMOVSLDUPYrr VR256:$src)>;
  def : Pat<(v8i32 (X86Movsldup (load addr:$src))),
            (VMOVSLDUPYrm addr:$src)>;
}

let Predicates = [UseSSE3] in {
  def : Pat<(v4i32 (X86Movshdup VR128:$src)),
            (MOVSHDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movshdup (memop addr:$src))),
            (MOVSHDUPrm addr:$src)>;
  def : Pat<(v4i32 (X86Movsldup VR128:$src)),
            (MOVSLDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movsldup (memop addr:$src))),
            (MOVSLDUPrm addr:$src)>;
}

//===---------------------------------------------------------------------===//
// SSE3 - Replicate Double FP - MOVDDUP
//===---------------------------------------------------------------------===//

multiclass sse3_replicate_dfp<string OpcodeStr, X86SchedWriteWidths sched> {
def rr : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
              !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
              [(set VR128:$dst, (v2f64 (X86Movddup VR128:$src)))]>,
         Sched<[sched.XMM]>;
def rm : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
              !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
              [(set VR128:$dst,
                (v2f64 (X86Movddup
                        (scalar_to_vector (loadf64 addr:$src)))))]>,
         Sched<[sched.XMM.Folded]>;
}

// FIXME: Merge with above classes when there are patterns for the ymm version
multiclass sse3_replicate_dfp_y<string OpcodeStr, X86SchedWriteWidths sched> {
def rr : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
              !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
              [(set VR256:$dst, (v4f64 (X86Movddup VR256:$src)))]>,
         Sched<[sched.YMM]>;
def rm : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
              !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
              [(set VR256:$dst,
                (v4f64 (X86Movddup (loadv4f64 addr:$src))))]>,
         Sched<[sched.YMM.Folded]>;
}

let Predicates = [HasAVX, NoVLX] in {
  defm VMOVDDUP  : sse3_replicate_dfp<"vmovddup", SchedWriteFShuffle>,
                   VEX, VEX_WIG;
  defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup", SchedWriteFShuffle>,
                   VEX, VEX_L, VEX_WIG;
}

defm MOVDDUP : sse3_replicate_dfp<"movddup", SchedWriteFShuffle>;


let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))),
            (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
}

let Predicates = [UseSSE3] in {
  def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))),
            (MOVDDUPrm addr:$src)>;
}
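// Informal illustration (editorial note, not normative): movddup broadcasts
// the low double-precision element, i.e. dst = [src[63:0], src[63:0]],
// applied per 128-bit lane in the YMM form.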
//===---------------------------------------------------------------------===//
// SSE3 - Move Unaligned Integer
//===---------------------------------------------------------------------===//

let Predicates = [HasAVX] in {
  def VLDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                      "vlddqu\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>,
                 Sched<[SchedWriteVecMoveLS.XMM.RM]>, VEX, VEX_WIG;
  def VLDDQUYrm : S3DI<0xF0, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
                       "vlddqu\t{$src, $dst|$dst, $src}",
                       [(set VR256:$dst, (int_x86_avx_ldu_dq_256 addr:$src))]>,
                  Sched<[SchedWriteVecMoveLS.YMM.RM]>, VEX, VEX_L, VEX_WIG;
} // Predicates

def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                   "lddqu\t{$src, $dst|$dst, $src}",
                   [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>,
              Sched<[SchedWriteVecMoveLS.XMM.RM]>;

//===---------------------------------------------------------------------===//
// SSE3 - Arithmetic
//===---------------------------------------------------------------------===//

multiclass sse3_addsub<string OpcodeStr, ValueType vt, RegisterClass RC,
                       X86MemOperand x86memop, X86FoldableSchedWrite sched,
                       PatFrag ld_frag, bit Is2Addr = 1> {
let Uses = [MXCSR], mayRaiseFPException = 1 in {
  def rr : I<0xD0, MRMSrcReg,
             (outs RC:$dst), (ins RC:$src1, RC:$src2),
             !if(Is2Addr,
                 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
             [(set RC:$dst, (vt (X86Addsub RC:$src1, RC:$src2)))]>,
           Sched<[sched]>;
  def rm : I<0xD0, MRMSrcMem,
             (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
             !if(Is2Addr,
                 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
             [(set RC:$dst, (vt (X86Addsub RC:$src1, (ld_frag addr:$src2))))]>,
           Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}

let Predicates = [HasAVX] in {
  let ExeDomain = SSEPackedSingle in {
    defm VADDSUBPS : sse3_addsub<"vaddsubps", v4f32, VR128, f128mem,
                                 SchedWriteFAddSizes.PS.XMM, loadv4f32, 0>,
                                 XD, VEX_4V, VEX_WIG;
    defm VADDSUBPSY : sse3_addsub<"vaddsubps", v8f32, VR256, f256mem,
                                  SchedWriteFAddSizes.PS.YMM, loadv8f32, 0>,
                                  XD, VEX_4V, VEX_L, VEX_WIG;
  }
  let ExeDomain = SSEPackedDouble in {
    defm VADDSUBPD : sse3_addsub<"vaddsubpd", v2f64, VR128, f128mem,
                                 SchedWriteFAddSizes.PD.XMM, loadv2f64, 0>,
                                 PD, VEX_4V, VEX_WIG;
    defm VADDSUBPDY : sse3_addsub<"vaddsubpd", v4f64, VR256, f256mem,
                                  SchedWriteFAddSizes.PD.YMM, loadv4f64, 0>,
                                  PD, VEX_4V, VEX_L, VEX_WIG;
  }
}
let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in {
  let ExeDomain = SSEPackedSingle in
  defm ADDSUBPS : sse3_addsub<"addsubps", v4f32, VR128, f128mem,
                              SchedWriteFAddSizes.PS.XMM, memopv4f32>, XD;
  let ExeDomain = SSEPackedDouble in
  defm ADDSUBPD : sse3_addsub<"addsubpd", v2f64, VR128, f128mem,
                              SchedWriteFAddSizes.PD.XMM, memopv2f64>, PD;
}
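// Informal illustration (editorial note, not normative): addsubps alternates
// subtraction and addition across lanes, dst = [a0-b0, a1+b1, a2-b2, a3+b3];
// addsubpd computes [a0-b0, a1+b1].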
//===---------------------------------------------------------------------===//
// SSE3 Instructions
//===---------------------------------------------------------------------===//

// Horizontal ops
multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
                   X86MemOperand x86memop, SDNode OpNode,
                   X86FoldableSchedWrite sched, PatFrag ld_frag,
                   bit Is2Addr = 1> {
let Uses = [MXCSR], mayRaiseFPException = 1 in {
  def rr : S3DI<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
      !if(Is2Addr,
          !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
          !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
      Sched<[sched]>;

  def rm : S3DI<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
      !if(Is2Addr,
          !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
          !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
      Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
                  X86MemOperand x86memop, SDNode OpNode,
                  X86FoldableSchedWrite sched, PatFrag ld_frag,
                  bit Is2Addr = 1> {
let Uses = [MXCSR], mayRaiseFPException = 1 in {
  def rr : S3I<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
      !if(Is2Addr,
          !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
          !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
      Sched<[sched]>;

  def rm : S3I<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
      !if(Is2Addr,
          !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
          !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
      Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}

let Predicates = [HasAVX] in {
  let ExeDomain = SSEPackedSingle in {
    defm VHADDPS  : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem,
                            X86fhadd, WriteFHAdd, loadv4f32, 0>, VEX_4V, VEX_WIG;
    defm VHSUBPS  : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem,
                            X86fhsub, WriteFHAdd, loadv4f32, 0>, VEX_4V, VEX_WIG;
    defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem,
                            X86fhadd, WriteFHAddY, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG;
    defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem,
                            X86fhsub, WriteFHAddY, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG;
  }
  let ExeDomain = SSEPackedDouble in {
    defm VHADDPD  : S3_Int<0x7C, "vhaddpd", v2f64, VR128, f128mem,
                           X86fhadd, WriteFHAdd, loadv2f64, 0>, VEX_4V, VEX_WIG;
    defm VHSUBPD  : S3_Int<0x7D, "vhsubpd", v2f64, VR128, f128mem,
                           X86fhsub, WriteFHAdd, loadv2f64, 0>, VEX_4V, VEX_WIG;
    defm VHADDPDY : S3_Int<0x7C, "vhaddpd", v4f64, VR256, f256mem,
                           X86fhadd, WriteFHAddY, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG;
    defm VHSUBPDY : S3_Int<0x7D, "vhsubpd", v4f64, VR256, f256mem,
                           X86fhsub, WriteFHAddY, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG;
  }
}

let Constraints = "$src1 = $dst" in {
  let ExeDomain = SSEPackedSingle in {
    defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, X86fhadd,
                          WriteFHAdd, memopv4f32>;
    defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, X86fhsub,
                          WriteFHAdd, memopv4f32>;
  }
  let ExeDomain = SSEPackedDouble in {
    defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem, X86fhadd,
                         WriteFHAdd, memopv2f64>;
    defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, X86fhsub,
                         WriteFHAdd, memopv2f64>;
  }
}
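// Informal illustration (editorial note, not normative): haddps sums adjacent
// pairs, with the first source feeding the low result half:
// dst = [a0+a1, a2+a3, b0+b1, b2+b3]; hsubps computes the analogous pairwise
// differences.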
//===---------------------------------------------------------------------===//
// SSSE3 - Packed Absolute Instructions
//===---------------------------------------------------------------------===//

/// SS3I_unop_rm - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
multiclass SS3I_unop_rm<bits<8> opc, string OpcodeStr, ValueType vt,
                        SDNode OpNode, X86SchedWriteWidths sched, PatFrag ld_frag> {
  def rr : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
                 (ins VR128:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [(set VR128:$dst, (vt (OpNode VR128:$src)))]>,
           Sched<[sched.XMM]>;

  def rm : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
                 (ins i128mem:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [(set VR128:$dst,
                   (vt (OpNode (ld_frag addr:$src))))]>,
           Sched<[sched.XMM.Folded]>;
}

/// SS3I_unop_rm_y - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
multiclass SS3I_unop_rm_y<bits<8> opc, string OpcodeStr, ValueType vt,
                          SDNode OpNode, X86SchedWriteWidths sched> {
  def Yrr : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
                  (ins VR256:$src),
                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                  [(set VR256:$dst, (vt (OpNode VR256:$src)))]>,
            Sched<[sched.YMM]>;

  def Yrm : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
                  (ins i256mem:$src),
                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                  [(set VR256:$dst,
                    (vt (OpNode (load addr:$src))))]>,
            Sched<[sched.YMM.Folded]>;
}

let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  defm VPABSB : SS3I_unop_rm<0x1C, "vpabsb", v16i8, abs, SchedWriteVecALU,
                             load>, VEX, VEX_WIG;
  defm VPABSW : SS3I_unop_rm<0x1D, "vpabsw", v8i16, abs, SchedWriteVecALU,
                             load>, VEX, VEX_WIG;
}
let Predicates = [HasAVX, NoVLX] in {
  defm VPABSD : SS3I_unop_rm<0x1E, "vpabsd", v4i32, abs, SchedWriteVecALU,
                             load>, VEX, VEX_WIG;
}
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  defm VPABSB : SS3I_unop_rm_y<0x1C, "vpabsb", v32i8, abs, SchedWriteVecALU>,
                VEX, VEX_L, VEX_WIG;
  defm VPABSW : SS3I_unop_rm_y<0x1D, "vpabsw", v16i16, abs, SchedWriteVecALU>,
                VEX, VEX_L, VEX_WIG;
}
let Predicates = [HasAVX2, NoVLX] in {
  defm VPABSD : SS3I_unop_rm_y<0x1E, "vpabsd", v8i32, abs, SchedWriteVecALU>,
                VEX, VEX_L, VEX_WIG;
}

defm PABSB : SS3I_unop_rm<0x1C, "pabsb", v16i8, abs, SchedWriteVecALU,
                          memop>;
defm PABSW : SS3I_unop_rm<0x1D, "pabsw", v8i16, abs, SchedWriteVecALU,
                          memop>;
defm PABSD : SS3I_unop_rm<0x1E, "pabsd", v4i32, abs, SchedWriteVecALU,
                          memop>;
//===---------------------------------------------------------------------===//
// SSSE3 - Packed Binary Operator Instructions
//===---------------------------------------------------------------------===//

/// SS3I_binop_rm - Simple SSSE3 bin op
multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                         ValueType DstVT, ValueType OpVT, RegisterClass RC,
                         PatFrag memop_frag, X86MemOperand x86memop,
                         X86FoldableSchedWrite sched, bit Is2Addr = 1> {
  let isCommutable = 1 in
  def rr : SS38I<opc, MRMSrcReg, (outs RC:$dst),
      (ins RC:$src1, RC:$src2),
      !if(Is2Addr,
          !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
          !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set RC:$dst, (DstVT (OpNode (OpVT RC:$src1), RC:$src2)))]>,
      Sched<[sched]>;
  def rm : SS38I<opc, MRMSrcMem, (outs RC:$dst),
      (ins RC:$src1, x86memop:$src2),
      !if(Is2Addr,
          !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
          !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set RC:$dst,
        (DstVT (OpNode (OpVT RC:$src1), (memop_frag addr:$src2))))]>,
      Sched<[sched.Folded, sched.ReadAfterFold]>;
}

/// SS3I_binop_rm_int - Simple SSSE3 bin op whose type can be v*{i8,i16,i32}.
multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr,
                             Intrinsic IntId128, X86FoldableSchedWrite sched,
                             PatFrag ld_frag, bit Is2Addr = 1> {
  let isCommutable = 1 in
  def rr : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, VR128:$src2),
      !if(Is2Addr,
          !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
          !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
      Sched<[sched]>;
  def rm : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, i128mem:$src2),
      !if(Is2Addr,
          !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
          !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set VR128:$dst,
        (IntId128 VR128:$src1, (ld_frag addr:$src2)))]>,
      Sched<[sched.Folded, sched.ReadAfterFold]>;
}

multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
                               Intrinsic IntId256,
                               X86FoldableSchedWrite sched> {
  let isCommutable = 1 in
  def Yrr : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
      (ins VR256:$src1, VR256:$src2),
      !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
      [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>,
      Sched<[sched]>;
  def Yrm : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
      (ins VR256:$src1, i256mem:$src2),
      !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
      [(set VR256:$dst,
        (IntId256 VR256:$src1, (load addr:$src2)))]>,
      Sched<[sched.Folded, sched.ReadAfterFold]>;
}

let ImmT = NoImm, Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
let isCommutable = 0 in {
  defm VPSHUFB : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, v16i8,
                               VR128, load, i128mem,
                               SchedWriteVarShuffle.XMM, 0>, VEX_4V, VEX_WIG;
  defm VPMADDUBSW : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v8i16,
                                  v16i8, VR128, load, i128mem,
                                  SchedWriteVecIMul.XMM, 0>, VEX_4V, VEX_WIG;
}
defm VPMULHRSW : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v8i16, v8i16,
                               VR128, load, i128mem,
                               SchedWriteVecIMul.XMM, 0>, VEX_4V, VEX_WIG;
}

let ImmT = NoImm, Predicates = [HasAVX] in {
let isCommutable = 0 in {
  defm VPHADDW : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, v8i16, VR128,
                               load, i128mem,
                               SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
  defm VPHADDD : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, v4i32, VR128,
                               load, i128mem,
                               SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
  defm VPHSUBW : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, v8i16, VR128,
                               load, i128mem,
                               SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
  defm VPHSUBD : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, v4i32, VR128,
                               load, i128mem,
                               SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
  defm VPSIGNB : SS3I_binop_rm_int<0x08, "vpsignb",
                                   int_x86_ssse3_psign_b_128,
                                   SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG;
  defm VPSIGNW : SS3I_binop_rm_int<0x09, "vpsignw",
                                   int_x86_ssse3_psign_w_128,
                                   SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG;
  defm VPSIGND : SS3I_binop_rm_int<0x0A, "vpsignd",
                                   int_x86_ssse3_psign_d_128,
                                   SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG;
  defm VPHADDSW : SS3I_binop_rm_int<0x03, "vphaddsw",
                                    int_x86_ssse3_phadd_sw_128,
                                    SchedWritePHAdd.XMM, load, 0>, VEX_4V, VEX_WIG;
  defm VPHSUBSW : SS3I_binop_rm_int<0x07, "vphsubsw",
                                    int_x86_ssse3_phsub_sw_128,
                                    SchedWritePHAdd.XMM, load, 0>, VEX_4V, VEX_WIG;
}
}
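// Informal illustration (editorial note, not normative): psignb copies the
// sign of the second operand onto the first,
// dst[i] = (b[i] < 0) ? -a[i] : ((b[i] == 0) ? 0 : a[i]);
// psignw/psignd do the same per word/doubleword.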
let ImmT = NoImm, Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
let isCommutable = 0 in {
  defm VPSHUFBY : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, v32i8,
                                VR256, load, i256mem,
                                SchedWriteVarShuffle.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
  defm VPMADDUBSWY : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v16i16,
                                   v32i8, VR256, load, i256mem,
                                   SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
}
defm VPMULHRSWY : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v16i16, v16i16,
                                VR256, load, i256mem,
                                SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
}

let ImmT = NoImm, Predicates = [HasAVX2] in {
let isCommutable = 0 in {
  defm VPHADDWY : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, v16i16,
                                VR256, load, i256mem,
                                SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
  defm VPHADDDY : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, v8i32, VR256,
                                load, i256mem,
                                SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
  defm VPHSUBWY : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, v16i16,
                                VR256, load, i256mem,
                                SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
  defm VPHSUBDY : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, v8i32, VR256,
                                load, i256mem,
                                SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
  defm VPSIGNB : SS3I_binop_rm_int_y<0x08, "vpsignb", int_x86_avx2_psign_b,
                                     SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG;
  defm VPSIGNW : SS3I_binop_rm_int_y<0x09, "vpsignw", int_x86_avx2_psign_w,
                                     SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG;
  defm VPSIGND : SS3I_binop_rm_int_y<0x0A, "vpsignd", int_x86_avx2_psign_d,
                                     SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG;
  defm VPHADDSW : SS3I_binop_rm_int_y<0x03, "vphaddsw",
                                      int_x86_avx2_phadd_sw,
                                      SchedWritePHAdd.YMM>, VEX_4V, VEX_L, VEX_WIG;
  defm VPHSUBSW : SS3I_binop_rm_int_y<0x07, "vphsubsw",
                                      int_x86_avx2_phsub_sw,
                                      SchedWritePHAdd.YMM>, VEX_4V, VEX_L, VEX_WIG;
}
}
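// Informal illustration (editorial note, not normative): pmulhrsw computes a
// rounded, scaled signed word product:
// dst[i] = (((a[i] * b[i]) >> 14) + 1) >> 1.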
// None of these have i8 immediate fields.
let ImmT = NoImm, Constraints = "$src1 = $dst" in {
let isCommutable = 0 in {
  defm PHADDW : SS3I_binop_rm<0x01, "phaddw", X86hadd, v8i16, v8i16, VR128,
                              memop, i128mem, SchedWritePHAdd.XMM>;
  defm PHADDD : SS3I_binop_rm<0x02, "phaddd", X86hadd, v4i32, v4i32, VR128,
                              memop, i128mem, SchedWritePHAdd.XMM>;
  defm PHSUBW : SS3I_binop_rm<0x05, "phsubw", X86hsub, v8i16, v8i16, VR128,
                              memop, i128mem, SchedWritePHAdd.XMM>;
  defm PHSUBD : SS3I_binop_rm<0x06, "phsubd", X86hsub, v4i32, v4i32, VR128,
                              memop, i128mem, SchedWritePHAdd.XMM>;
  defm PSIGNB : SS3I_binop_rm_int<0x08, "psignb", int_x86_ssse3_psign_b_128,
                                  SchedWriteVecALU.XMM, memop>;
  defm PSIGNW : SS3I_binop_rm_int<0x09, "psignw", int_x86_ssse3_psign_w_128,
                                  SchedWriteVecALU.XMM, memop>;
  defm PSIGND : SS3I_binop_rm_int<0x0A, "psignd", int_x86_ssse3_psign_d_128,
                                  SchedWriteVecALU.XMM, memop>;
  defm PSHUFB : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, v16i8, VR128,
                              memop, i128mem, SchedWriteVarShuffle.XMM>;
  defm PHADDSW : SS3I_binop_rm_int<0x03, "phaddsw",
                                   int_x86_ssse3_phadd_sw_128,
                                   SchedWritePHAdd.XMM, memop>;
  defm PHSUBSW : SS3I_binop_rm_int<0x07, "phsubsw",
                                   int_x86_ssse3_phsub_sw_128,
                                   SchedWritePHAdd.XMM, memop>;
  defm PMADDUBSW : SS3I_binop_rm<0x04, "pmaddubsw", X86vpmaddubsw, v8i16,
                                 v16i8, VR128, memop, i128mem,
                                 SchedWriteVecIMul.XMM>;
}
defm PMULHRSW : SS3I_binop_rm<0x0B, "pmulhrsw", X86mulhrs, v8i16, v8i16,
                              VR128, memop, i128mem, SchedWriteVecIMul.XMM>;
}

//===---------------------------------------------------------------------===//
// SSSE3 - Packed Align Instruction Patterns
//===---------------------------------------------------------------------===//

multiclass ssse3_palignr<string asm, ValueType VT, RegisterClass RC,
                         PatFrag memop_frag, X86MemOperand x86memop,
                         X86FoldableSchedWrite sched, bit Is2Addr = 1> {
  let hasSideEffects = 0 in {
  def rri : SS3AI<0x0F, MRMSrcReg, (outs RC:$dst),
      (ins RC:$src1, RC:$src2, u8imm:$src3),
      !if(Is2Addr,
          !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
          !strconcat(asm,
                     "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set RC:$dst, (VT (X86PAlignr RC:$src1, RC:$src2, (i8 timm:$src3))))]>,
      Sched<[sched]>;
  let mayLoad = 1 in
  def rmi : SS3AI<0x0F, MRMSrcMem, (outs RC:$dst),
      (ins RC:$src1, x86memop:$src2, u8imm:$src3),
      !if(Is2Addr,
          !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
          !strconcat(asm,
                     "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set RC:$dst, (VT (X86PAlignr RC:$src1,
                                     (memop_frag addr:$src2),
                                     (i8 timm:$src3))))]>,
      Sched<[sched.Folded, sched.ReadAfterFold]>;
  }
}

let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
  defm VPALIGNR : ssse3_palignr<"vpalignr", v16i8, VR128, load, i128mem,
                                SchedWriteShuffle.XMM, 0>, VEX_4V, VEX_WIG;
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
  defm VPALIGNRY : ssse3_palignr<"vpalignr", v32i8, VR256, load, i256mem,
                                 SchedWriteShuffle.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in
  defm PALIGNR : ssse3_palignr<"palignr", v16i8, VR128, memop, i128mem,
                               SchedWriteShuffle.XMM>;
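// Informal illustration (editorial note, not normative): "palignr $n, %xmm2,
// %xmm1" forms the 32-byte value xmm1:xmm2 (first operand in the upper half),
// shifts it right by n bytes, and keeps the low 16 bytes; the YMM form works
// per 128-bit lane.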
//===---------------------------------------------------------------------===//
// SSE3 - Thread synchronization
//===---------------------------------------------------------------------===//

let SchedRW = [WriteSystem] in {
let Uses = [EAX, ECX, EDX] in
def MONITOR32rrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>,
                   TB, Requires<[HasSSE3, Not64BitMode]>;
let Uses = [RAX, ECX, EDX] in
def MONITOR64rrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>,
                   TB, Requires<[HasSSE3, In64BitMode]>;

let Uses = [ECX, EAX] in
def MWAITrr : I<0x01, MRM_C9, (outs), (ins), "mwait",
                [(int_x86_sse3_mwait ECX, EAX)]>, TB, Requires<[HasSSE3]>;
} // SchedRW

def : InstAlias<"mwait\t{%eax, %ecx|ecx, eax}", (MWAITrr)>, Requires<[Not64BitMode]>;
def : InstAlias<"mwait\t{%rax, %rcx|rcx, rax}", (MWAITrr)>, Requires<[In64BitMode]>;

def : InstAlias<"monitor\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITOR32rrr)>,
      Requires<[Not64BitMode]>;
def : InstAlias<"monitor\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITOR64rrr)>,
      Requires<[In64BitMode]>;

//===----------------------------------------------------------------------===//
// SSE4.1 - Packed Move with Sign/Zero Extend
// NOTE: Any Extend is promoted to Zero Extend in X86ISelDAGToDAG.cpp
//===----------------------------------------------------------------------===//

multiclass SS41I_pmovx_rrrm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
                            RegisterClass OutRC, RegisterClass InRC,
                            X86FoldableSchedWrite sched> {
  def rr : SS48I<opc, MRMSrcReg, (outs OutRC:$dst), (ins InRC:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
           Sched<[sched]>;

  def rm : SS48I<opc, MRMSrcMem, (outs OutRC:$dst), (ins MemOp:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
           Sched<[sched.Folded]>;
}

multiclass SS41I_pmovx_rm_all<bits<8> opc, string OpcodeStr,
                              X86MemOperand MemOp, X86MemOperand MemYOp,
                              Predicate prd> {
  defm NAME : SS41I_pmovx_rrrm<opc, OpcodeStr, MemOp, VR128, VR128,
                               SchedWriteShuffle.XMM>;
  let Predicates = [HasAVX, prd] in
    defm V#NAME : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemOp,
                                   VR128, VR128, SchedWriteVecExtend.XMM>,
                  VEX, VEX_WIG;
  let Predicates = [HasAVX2, prd] in
    defm V#NAME#Y : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemYOp,
                                     VR256, VR128, SchedWriteVecExtend.YMM>,
                    VEX, VEX_L, VEX_WIG;
}

multiclass SS41I_pmovx_rm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
                          X86MemOperand MemYOp, Predicate prd> {
  defm PMOVSX#NAME : SS41I_pmovx_rm_all<opc, !strconcat("pmovsx", OpcodeStr),
                                        MemOp, MemYOp, prd>;
  defm PMOVZX#NAME : SS41I_pmovx_rm_all<!add(opc, 0x10),
                                        !strconcat("pmovzx", OpcodeStr),
                                        MemOp, MemYOp, prd>;
}

defm BW : SS41I_pmovx_rm<0x20, "bw", i64mem, i128mem, NoVLX_Or_NoBWI>;
defm WD : SS41I_pmovx_rm<0x23, "wd", i64mem, i128mem, NoVLX>;
defm DQ : SS41I_pmovx_rm<0x25, "dq", i64mem, i128mem, NoVLX>;

defm BD : SS41I_pmovx_rm<0x21, "bd", i32mem, i64mem, NoVLX>;
defm WQ : SS41I_pmovx_rm<0x24, "wq", i32mem, i64mem, NoVLX>;

defm BQ : SS41I_pmovx_rm<0x22, "bq", i16mem, i32mem, NoVLX>;
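// Informal illustration (editorial note, not normative): "pmovzxbw %xmm1,
// %xmm0" zero-extends the low eight bytes of xmm1 to eight 16-bit words in
// xmm0; the pmovsx* forms sign-extend instead.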
// AVX2 Patterns
multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy,
                                     SDNode ExtOp, SDNode InVecOp> {
  // Register-Register patterns
  let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
    def : Pat<(v16i16 (ExtOp (v16i8 VR128:$src))),
              (!cast<I>(OpcPrefix#BWYrr) VR128:$src)>;
  }
  let Predicates = [HasAVX2, NoVLX] in {
    def : Pat<(v8i32 (InVecOp (v16i8 VR128:$src))),
              (!cast<I>(OpcPrefix#BDYrr) VR128:$src)>;
    def : Pat<(v4i64 (InVecOp (v16i8 VR128:$src))),
              (!cast<I>(OpcPrefix#BQYrr) VR128:$src)>;

    def : Pat<(v8i32 (ExtOp (v8i16 VR128:$src))),
              (!cast<I>(OpcPrefix#WDYrr) VR128:$src)>;
    def : Pat<(v4i64 (InVecOp (v8i16 VR128:$src))),
              (!cast<I>(OpcPrefix#WQYrr) VR128:$src)>;

    def : Pat<(v4i64 (ExtOp (v4i32 VR128:$src))),
              (!cast<I>(OpcPrefix#DQYrr) VR128:$src)>;
  }

  // Simple Register-Memory patterns
  let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
    def : Pat<(v16i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
              (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;

    def : Pat<(v16i16 (ExtOp (loadv16i8 addr:$src))),
              (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
  }

  let Predicates = [HasAVX2, NoVLX] in {
    def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
              (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
    def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
              (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;

    def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
              (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
    def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
              (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;

    def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
              (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
  }

  // AVX2 Register-Memory patterns
  let Predicates = [HasAVX2, NoVLX] in {
    def : Pat<(v8i32 (ExtOp (loadv8i16 addr:$src))),
              (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;

    def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
              (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
    def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
              (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
    def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))),
              (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;

    def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))),
              (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;

    def : Pat<(v4i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
              (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
    def : Pat<(v4i64 (InVecOp (bc_v16i8 (v2i64 (X86vzload32 addr:$src))))),
              (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;

    def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
              (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
    def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
              (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
    def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
              (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
  }
}

defm : SS41I_pmovx_avx2_patterns<"VPMOVSX", "s", sext, sext_invec>;
defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", "z", zext, zext_invec>;
// SSE4.1/AVX patterns.
multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
                                SDNode ExtOp> {
  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
    def : Pat<(v8i16 (ExtOp (v16i8 VR128:$src))),
              (!cast<I>(OpcPrefix#BWrr) VR128:$src)>;
  }
  let Predicates = [HasAVX, NoVLX] in {
    def : Pat<(v4i32 (ExtOp (v16i8 VR128:$src))),
              (!cast<I>(OpcPrefix#BDrr) VR128:$src)>;
    def : Pat<(v2i64 (ExtOp (v16i8 VR128:$src))),
              (!cast<I>(OpcPrefix#BQrr) VR128:$src)>;

    def : Pat<(v4i32 (ExtOp (v8i16 VR128:$src))),
              (!cast<I>(OpcPrefix#WDrr) VR128:$src)>;
    def : Pat<(v2i64 (ExtOp (v8i16 VR128:$src))),
              (!cast<I>(OpcPrefix#WQrr) VR128:$src)>;

    def : Pat<(v2i64 (ExtOp (v4i32 VR128:$src))),
              (!cast<I>(OpcPrefix#DQrr) VR128:$src)>;
  }
  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
    def : Pat<(v8i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
              (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  }
  let Predicates = [HasAVX, NoVLX] in {
    def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
              (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
    def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
              (!cast<I>(OpcPrefix#BQrm) addr:$src)>;

    def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
              (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
    def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
              (!cast<I>(OpcPrefix#WQrm) addr:$src)>;

    def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
              (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
  }
  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
    def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
              (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
    def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
              (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
    def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))),
              (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
    def : Pat<(v8i16 (ExtOp (loadv16i8 addr:$src))),
              (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  }
  let Predicates = [HasAVX, NoVLX] in {
    def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
              (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
    def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (X86vzload32 addr:$src))))),
              (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
    def : Pat<(v4i32 (ExtOp (loadv16i8 addr:$src))),
              (!cast<I>(OpcPrefix#BDrm) addr:$src)>;

    def : Pat<(v2i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (extloadi32i16 addr:$src)))))),
              (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
    def : Pat<(v2i64 (ExtOp (loadv16i8 addr:$src))),
              (!cast<I>(OpcPrefix#BQrm) addr:$src)>;

    def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
              (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
    def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
              (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
    def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
              (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
    def : Pat<(v4i32 (ExtOp (loadv8i16 addr:$src))),
              (!cast<I>(OpcPrefix#WDrm) addr:$src)>;

    def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
              (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
    def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (X86vzload32 addr:$src))))),
              (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
    def : Pat<(v2i64 (ExtOp (loadv8i16 addr:$src))),
              (!cast<I>(OpcPrefix#WQrm) addr:$src)>;

    def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
              (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
    def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
              (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
    def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
              (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
    def : Pat<(v2i64 (ExtOp (loadv4i32 addr:$src))),
              (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
  }
}

defm : SS41I_pmovx_patterns<"VPMOVSX", "s", sext_invec>;
defm : SS41I_pmovx_patterns<"VPMOVZX", "z", zext_invec>;

let Predicates = [UseSSE41] in {
  defm : SS41I_pmovx_patterns<"PMOVSX", "s", sext_invec>;
  defm : SS41I_pmovx_patterns<"PMOVZX", "z", zext_invec>;
}
//===----------------------------------------------------------------------===//
// SSE4.1 - Extract Instructions
//===----------------------------------------------------------------------===//

/// SS41I_extract8 - SSE 4.1 extract 8 bits to 32 bit reg or 8 bit mem
multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> {
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
                   (ins VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set GR32orGR64:$dst, (X86pextrb (v16i8 VR128:$src1),
                                                     timm:$src2))]>,
            Sched<[WriteVecExtract]>;
  let hasSideEffects = 0, mayStore = 1 in
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                   (ins i8mem:$dst, VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), timm:$src2))),
                           addr:$dst)]>, Sched<[WriteVecExtractSt]>;
}

let Predicates = [HasAVX, NoBWI] in
  defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX, VEX_WIG;

defm PEXTRB : SS41I_extract8<0x14, "pextrb">;


/// SS41I_extract16 - SSE 4.1 extract 16 bits to memory destination
multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> {
  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
  def rr_REV : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
                       (ins VR128:$src1, u8imm:$src2),
                       !strconcat(OpcodeStr,
                                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
               Sched<[WriteVecExtract]>, FoldGenData<NAME#rr>;

  let hasSideEffects = 0, mayStore = 1 in
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                   (ins i16mem:$dst, VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(store (i16 (trunc (X86pextrw (v8i16 VR128:$src1), timm:$src2))),
                           addr:$dst)]>, Sched<[WriteVecExtractSt]>;
}

let Predicates = [HasAVX, NoBWI] in
  defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX, VEX_WIG;

defm PEXTRW : SS41I_extract16<0x15, "pextrw">;

let Predicates = [UseSSE41] in
  def : Pat<(store f16:$src, addr:$dst),
            (PEXTRWmr addr:$dst, (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0)>;

let Predicates = [HasAVX, NoBWI] in
  def : Pat<(store f16:$src, addr:$dst),
            (VPEXTRWmr addr:$dst, (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0)>;
/// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination
multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> {
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
                   (ins VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set GR32:$dst,
                         (extractelt (v4i32 VR128:$src1), imm:$src2))]>,
                   Sched<[WriteVecExtract]>;
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                   (ins i32mem:$dst, VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(store (extractelt (v4i32 VR128:$src1), imm:$src2),
                           addr:$dst)]>, Sched<[WriteVecExtractSt]>;
}

let Predicates = [HasAVX, NoDQI] in
  defm VPEXTRD : SS41I_extract32<0x16, "vpextrd">, VEX;

defm PEXTRD : SS41I_extract32<0x16, "pextrd">;

/// SS41I_extract64 - SSE 4.1 extract 64 bits to int reg or memory destination
multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> {
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR64:$dst),
                   (ins VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set GR64:$dst,
                         (extractelt (v2i64 VR128:$src1), imm:$src2))]>,
                   Sched<[WriteVecExtract]>;
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                   (ins i64mem:$dst, VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(store (extractelt (v2i64 VR128:$src1), imm:$src2),
                           addr:$dst)]>, Sched<[WriteVecExtractSt]>;
}

let Predicates = [HasAVX, NoDQI] in
  defm VPEXTRQ : SS41I_extract64<0x16, "vpextrq">, VEX, VEX_W;

defm PEXTRQ : SS41I_extract64<0x16, "pextrq">, REX_W;

/// SS41I_extractf32 - SSE 4.1 extract a 32-bit fp value to an int reg or
/// memory destination
multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr> {
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
                   (ins VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set GR32orGR64:$dst,
                         (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))]>,
                   Sched<[WriteVecExtract]>;
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                   (ins f32mem:$dst, VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2),
                           addr:$dst)]>, Sched<[WriteVecExtractSt]>;
}

let ExeDomain = SSEPackedSingle in {
  let Predicates = [UseAVX] in
    defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX, VEX_WIG;
  defm EXTRACTPS : SS41I_extractf32<0x17, "extractps">;
}
//===----------------------------------------------------------------------===//
// SSE4.1 - Insert Instructions
//===----------------------------------------------------------------------===//

multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> {
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, GR32orGR64:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (X86pinsrb VR128:$src1, GR32orGR64:$src2, timm:$src3))]>,
      Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, i8mem:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (X86pinsrb VR128:$src1, (extloadi8 addr:$src2), timm:$src3))]>,
      Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
}

let Predicates = [HasAVX, NoBWI] in {
  defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V, VEX_WIG;
  def : Pat<(X86pinsrb VR128:$src1, (i32 (anyext (i8 GR8:$src2))), timm:$src3),
            (VPINSRBrr VR128:$src1, (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
                                                   GR8:$src2, sub_8bit), timm:$src3)>;
}

let Constraints = "$src1 = $dst" in
  defm PINSRB : SS41I_insert8<0x20, "pinsrb">;

multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> {
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, GR32:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>,
      Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, i32mem:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (v4i32 (insertelt VR128:$src1, (loadi32 addr:$src2), imm:$src3)))]>,
      Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
}

let Predicates = [HasAVX, NoDQI] in
  defm VPINSRD : SS41I_insert32<0x22, "vpinsrd", 0>, VEX_4V;
let Constraints = "$src1 = $dst" in
  defm PINSRD : SS41I_insert32<0x22, "pinsrd">;

multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> {
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, GR64:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>,
      Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, i64mem:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (v2i64 (insertelt VR128:$src1, (loadi64 addr:$src2), imm:$src3)))]>,
      Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
}

let Predicates = [HasAVX, NoDQI] in
  defm VPINSRQ : SS41I_insert64<0x22, "vpinsrq", 0>, VEX_4V, VEX_W;
let Constraints = "$src1 = $dst" in
  defm PINSRQ : SS41I_insert64<0x22, "pinsrq">, REX_W;

// insertps has a few different modes. The first two below are optimized
// inserts that won't zero arbitrary elements in the destination vector; the
// next one matches the intrinsic and may zero arbitrary elements in the
// target vector.
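// Reference for the insertps immediate used below (Intel SDM encoding):
// imm[7:6] = COUNT_S (source element, register form only), imm[5:4] =
// COUNT_D (destination element), imm[3:0] = ZMASK (elements to zero). For
// example, "insertps $0x1d, %xmm1, %xmm0" (0x1d = 00 01 1101 in those
// fields) copies element 0 of %xmm1 into element 1 of %xmm0 and zeroes
// elements 0, 2 and 3.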
multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> {
  let isCommutable = 1 in
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, VR128:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (X86insertps VR128:$src1, VR128:$src2, timm:$src3))]>,
      Sched<[SchedWriteFShuffle.XMM]>;
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, f32mem:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (X86insertps VR128:$src1,
                     (v4f32 (scalar_to_vector (loadf32 addr:$src2))),
                     timm:$src3))]>,
      Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;
}

let ExeDomain = SSEPackedSingle in {
  let Predicates = [UseAVX] in
    defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>,
                     VEX_4V, VEX_WIG;
  let Constraints = "$src1 = $dst" in
    defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1>;
}

//===----------------------------------------------------------------------===//
// SSE4.1 - Round Instructions
//===----------------------------------------------------------------------===//

multiclass sse41_fp_unop_p<bits<8> opc, string OpcodeStr,
                           X86MemOperand x86memop, RegisterClass RC,
                           ValueType VT, PatFrag mem_frag, SDPatternOperator OpNode,
                           X86FoldableSchedWrite sched> {
let Uses = [MXCSR], mayRaiseFPException = 1 in {
  // Vector intrinsic operation, reg
  def r : SS4AIi8<opc, MRMSrcReg,
                  (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2),
                  !strconcat(OpcodeStr,
                             "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  [(set RC:$dst, (VT (OpNode RC:$src1, timm:$src2)))]>,
                  Sched<[sched]>;

  // Vector intrinsic operation, mem
  def m : SS4AIi8<opc, MRMSrcMem,
                  (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2),
                  !strconcat(OpcodeStr,
                             "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  [(set RC:$dst,
                        (VT (OpNode (mem_frag addr:$src1), timm:$src2)))]>,
                  Sched<[sched.Folded]>;
}
}

multiclass avx_fp_unop_rm<bits<8> opcss, bits<8> opcsd,
                          string OpcodeStr, X86FoldableSchedWrite sched> {
let ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1 in {
  def SSr : SS4AIi8<opcss, MRMSrcReg,
        (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32u8imm:$src3),
        !strconcat(OpcodeStr,
                   "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
        []>, Sched<[sched]>;

  let mayLoad = 1 in
  def SSm : SS4AIi8<opcss, MRMSrcMem,
        (outs FR32:$dst), (ins FR32:$src1, f32mem:$src2, i32u8imm:$src3),
        !strconcat(OpcodeStr,
                   "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
        []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
} // ExeDomain = SSEPackedSingle, hasSideEffects = 0

let ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1 in {
  def SDr : SS4AIi8<opcsd, MRMSrcReg,
        (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32u8imm:$src3),
        !strconcat(OpcodeStr,
                   "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
        []>, Sched<[sched]>;

  let mayLoad = 1 in
  def SDm : SS4AIi8<opcsd, MRMSrcMem,
        (outs FR64:$dst), (ins FR64:$src1, f64mem:$src2, i32u8imm:$src3),
        !strconcat(OpcodeStr,
                   "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
        []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
} // ExeDomain = SSEPackedDouble, hasSideEffects = 0
}

multiclass sse41_fp_unop_s<bits<8> opcss, bits<8> opcsd,
                           string OpcodeStr, X86FoldableSchedWrite sched> {
let Uses = [MXCSR], mayRaiseFPException = 1 in {
let ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1 in {
  def SSr : SS4AIi8<opcss, MRMSrcReg,
        (outs FR32:$dst), (ins FR32:$src1, i32u8imm:$src2),
        !strconcat(OpcodeStr,
                   "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
        []>, Sched<[sched]>;

  let mayLoad = 1 in
  def SSm : SS4AIi8<opcss, MRMSrcMem,
        (outs FR32:$dst), (ins f32mem:$src1, i32u8imm:$src2),
        !strconcat(OpcodeStr,
                   "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
        []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
} // ExeDomain = SSEPackedSingle, hasSideEffects = 0

let ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1 in {
  def SDr : SS4AIi8<opcsd, MRMSrcReg,
        (outs FR64:$dst), (ins FR64:$src1, i32u8imm:$src2),
        !strconcat(OpcodeStr,
                   "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
        []>, Sched<[sched]>;

  let mayLoad = 1 in
  def SDm : SS4AIi8<opcsd, MRMSrcMem,
        (outs FR64:$dst), (ins f64mem:$src1, i32u8imm:$src2),
        !strconcat(OpcodeStr,
                   "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
        []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
} // ExeDomain = SSEPackedDouble, hasSideEffects = 0
}
}

multiclass sse41_fp_binop_s<bits<8> opcss, bits<8> opcsd,
                            string OpcodeStr, X86FoldableSchedWrite sched,
                            ValueType VT32, ValueType VT64,
                            SDNode OpNode, bit Is2Addr = 1> {
let Uses = [MXCSR], mayRaiseFPException = 1 in {
let ExeDomain = SSEPackedSingle in {
  def SSr_Int : SS4AIi8<opcss, MRMSrcReg,
        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst, (VT32 (OpNode VR128:$src1, VR128:$src2, timm:$src3)))]>,
        Sched<[sched]>;

  def SSm_Int : SS4AIi8<opcss, MRMSrcMem,
        (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst,
              (OpNode VR128:$src1, (sse_load_f32 addr:$src2), timm:$src3))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;
} // ExeDomain = SSEPackedSingle

let ExeDomain = SSEPackedDouble in {
  def SDr_Int : SS4AIi8<opcsd, MRMSrcReg,
        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst, (VT64 (OpNode VR128:$src1, VR128:$src2, timm:$src3)))]>,
        Sched<[sched]>;

  def SDm_Int : SS4AIi8<opcsd, MRMSrcMem,
        (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst,
              (OpNode VR128:$src1, (sse_load_f64 addr:$src2), timm:$src3))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;
} // ExeDomain = SSEPackedDouble
}
}

// FP round - roundss, roundps, roundsd, roundpd
let Predicates = [HasAVX, NoVLX] in {
  let ExeDomain = SSEPackedSingle, Uses = [MXCSR], mayRaiseFPException = 1 in {
    // Intrinsic form
    defm VROUNDPS  : sse41_fp_unop_p<0x08, "vroundps", f128mem, VR128, v4f32,
                                     loadv4f32, X86any_VRndScale, SchedWriteFRnd.XMM>,
                                     VEX, VEX_WIG;
    defm VROUNDPSY : sse41_fp_unop_p<0x08, "vroundps", f256mem, VR256, v8f32,
                                     loadv8f32, X86any_VRndScale, SchedWriteFRnd.YMM>,
                                     VEX, VEX_L, VEX_WIG;
  }

  let ExeDomain = SSEPackedDouble, Uses = [MXCSR], mayRaiseFPException = 1 in {
    defm VROUNDPD  : sse41_fp_unop_p<0x09, "vroundpd", f128mem, VR128, v2f64,
                                     loadv2f64, X86any_VRndScale, SchedWriteFRnd.XMM>,
                                     VEX, VEX_WIG;
    defm VROUNDPDY : sse41_fp_unop_p<0x09, "vroundpd", f256mem, VR256, v4f64,
                                     loadv4f64, X86any_VRndScale, SchedWriteFRnd.YMM>,
                                     VEX, VEX_L, VEX_WIG;
  }
}
let Predicates = [UseAVX] in {
  defm VROUND : sse41_fp_binop_s<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl,
                                 v4f32, v2f64, X86RndScales, 0>,
                                 VEX_4V, VEX_LIG, VEX_WIG, SIMD_EXC;
  defm VROUND : avx_fp_unop_rm<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl>,
                               VEX_4V, VEX_LIG, VEX_WIG, SIMD_EXC;
}

let Predicates = [UseAVX] in {
  def : Pat<(X86any_VRndScale FR32:$src1, timm:$src2),
            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src1, timm:$src2)>;
  def : Pat<(X86any_VRndScale FR64:$src1, timm:$src2),
            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src1, timm:$src2)>;
}

let Predicates = [UseAVX, OptForSize] in {
  def : Pat<(X86any_VRndScale (loadf32 addr:$src1), timm:$src2),
            (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>;
  def : Pat<(X86any_VRndScale (loadf64 addr:$src1), timm:$src2),
            (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>;
}

let ExeDomain = SSEPackedSingle in
defm ROUNDPS : sse41_fp_unop_p<0x08, "roundps", f128mem, VR128, v4f32,
                               memopv4f32, X86any_VRndScale, SchedWriteFRnd.XMM>;
let ExeDomain = SSEPackedDouble in
defm ROUNDPD : sse41_fp_unop_p<0x09, "roundpd", f128mem, VR128, v2f64,
                               memopv2f64, X86any_VRndScale, SchedWriteFRnd.XMM>;

defm ROUND : sse41_fp_unop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl>;

let Constraints = "$src1 = $dst" in
defm ROUND : sse41_fp_binop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl,
                              v4f32, v2f64, X86RndScales>;

let Predicates = [UseSSE41] in {
  def : Pat<(X86any_VRndScale FR32:$src1, timm:$src2),
            (ROUNDSSr FR32:$src1, timm:$src2)>;
  def : Pat<(X86any_VRndScale FR64:$src1, timm:$src2),
            (ROUNDSDr FR64:$src1, timm:$src2)>;
}

let Predicates = [UseSSE41, OptForSize] in {
  def : Pat<(X86any_VRndScale (loadf32 addr:$src1), timm:$src2),
            (ROUNDSSm addr:$src1, timm:$src2)>;
  def : Pat<(X86any_VRndScale (loadf64 addr:$src1), timm:$src2),
            (ROUNDSDm addr:$src1, timm:$src2)>;
}
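// Reference for the rounding immediate used by the round instructions above
// (Intel SDM encoding): imm[1:0] selects the rounding mode (00 nearest-even,
// 01 toward -inf, 10 toward +inf, 11 truncate), imm[2] = 1 defers to
// MXCSR.RC instead, and imm[3] = 1 suppresses the precision (inexact)
// exception. For example,
//   _mm_round_ps(x, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)
// becomes "roundps $0x9, ...", a floor that never raises an inexact
// exception.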
//===----------------------------------------------------------------------===//
// SSE4.1 - Packed Bit Test
//===----------------------------------------------------------------------===//

// We lower to the ptest instruction in X86ISelLowering, primarily from the
// Intel intrinsic that corresponds to it.
let Defs = [EFLAGS], Predicates = [HasAVX] in {
def VPTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
                     "vptest\t{$src2, $src1|$src1, $src2}",
                     [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
                     Sched<[SchedWriteVecTest.XMM]>, VEX, VEX_WIG;
def VPTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
                     "vptest\t{$src2, $src1|$src1, $src2}",
                     [(set EFLAGS,(X86ptest VR128:$src1, (loadv2i64 addr:$src2)))]>,
                     Sched<[SchedWriteVecTest.XMM.Folded, SchedWriteVecTest.XMM.ReadAfterFold]>,
                     VEX, VEX_WIG;

def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2),
                      "vptest\t{$src2, $src1|$src1, $src2}",
                      [(set EFLAGS, (X86ptest VR256:$src1, (v4i64 VR256:$src2)))]>,
                      Sched<[SchedWriteVecTest.YMM]>, VEX, VEX_L, VEX_WIG;
def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2),
                      "vptest\t{$src2, $src1|$src1, $src2}",
                      [(set EFLAGS,(X86ptest VR256:$src1, (loadv4i64 addr:$src2)))]>,
                      Sched<[SchedWriteVecTest.YMM.Folded, SchedWriteVecTest.YMM.ReadAfterFold]>,
                      VEX, VEX_L, VEX_WIG;
}

let Defs = [EFLAGS] in {
def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
                    "ptest\t{$src2, $src1|$src1, $src2}",
                    [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
                    Sched<[SchedWriteVecTest.XMM]>;
def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
                    "ptest\t{$src2, $src1|$src1, $src2}",
                    [(set EFLAGS, (X86ptest VR128:$src1, (memopv2i64 addr:$src2)))]>,
                    Sched<[SchedWriteVecTest.XMM.Folded, SchedWriteVecTest.XMM.ReadAfterFold]>;
}

// The bit test instructions below are AVX-only.
multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC,
                       X86MemOperand x86memop, PatFrag mem_frag, ValueType vt,
                       X86FoldableSchedWrite sched> {
  def rr : SS48I<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
                 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
                 [(set EFLAGS, (X86testp RC:$src1, (vt RC:$src2)))]>,
                 Sched<[sched]>, VEX;
  def rm : SS48I<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
                 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
                 [(set EFLAGS, (X86testp RC:$src1, (mem_frag addr:$src2)))]>,
                 Sched<[sched.Folded, sched.ReadAfterFold]>, VEX;
}

let Defs = [EFLAGS], Predicates = [HasAVX] in {
let ExeDomain = SSEPackedSingle in {
defm VTESTPS  : avx_bittest<0x0E, "vtestps", VR128, f128mem, loadv4f32, v4f32,
                            SchedWriteFTest.XMM>;
defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, loadv8f32, v8f32,
                            SchedWriteFTest.YMM>, VEX_L;
}
let ExeDomain = SSEPackedDouble in {
defm VTESTPD  : avx_bittest<0x0F, "vtestpd", VR128, f128mem, loadv2f64, v2f64,
                            SchedWriteFTest.XMM>;
defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, loadv4f64, v4f64,
                            SchedWriteFTest.YMM>, VEX_L;
}
}
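// Flag semantics of the bit-test instructions above (Intel SDM):
//   ZF = ((src AND dst) == 0)   CF = ((src AND NOT dst) == 0)
// with vtestps/vtestpd applying the test to the sign bits only. For example,
// _mm_testz_si128(a, b) compiles to a ptest followed by a read of ZF (sete),
// which is why these definitions mark EFLAGS as defined.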
//===----------------------------------------------------------------------===//
// SSE4.1 - Misc Instructions
//===----------------------------------------------------------------------===//

let Defs = [EFLAGS], Predicates = [HasPOPCNT] in {
  def POPCNT16rr : I<0xB8, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
                     "popcnt{w}\t{$src, $dst|$dst, $src}",
                     [(set GR16:$dst, (ctpop GR16:$src)), (implicit EFLAGS)]>,
                     Sched<[WritePOPCNT]>, OpSize16, XS;
  def POPCNT16rm : I<0xB8, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
                     "popcnt{w}\t{$src, $dst|$dst, $src}",
                     [(set GR16:$dst, (ctpop (loadi16 addr:$src))),
                      (implicit EFLAGS)]>,
                     Sched<[WritePOPCNT.Folded]>, OpSize16, XS;

  def POPCNT32rr : I<0xB8, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
                     "popcnt{l}\t{$src, $dst|$dst, $src}",
                     [(set GR32:$dst, (ctpop GR32:$src)), (implicit EFLAGS)]>,
                     Sched<[WritePOPCNT]>, OpSize32, XS;

  def POPCNT32rm : I<0xB8, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
                     "popcnt{l}\t{$src, $dst|$dst, $src}",
                     [(set GR32:$dst, (ctpop (loadi32 addr:$src))),
                      (implicit EFLAGS)]>,
                     Sched<[WritePOPCNT.Folded]>, OpSize32, XS;

  def POPCNT64rr : RI<0xB8, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
                      "popcnt{q}\t{$src, $dst|$dst, $src}",
                      [(set GR64:$dst, (ctpop GR64:$src)), (implicit EFLAGS)]>,
                      Sched<[WritePOPCNT]>, XS;
  def POPCNT64rm : RI<0xB8, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
                      "popcnt{q}\t{$src, $dst|$dst, $src}",
                      [(set GR64:$dst, (ctpop (loadi64 addr:$src))),
                       (implicit EFLAGS)]>,
                      Sched<[WritePOPCNT.Folded]>, XS;
}

// SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16.
multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
                                 SDNode OpNode, PatFrag ld_frag,
                                 X86FoldableSchedWrite Sched> {
  def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
                 (ins VR128:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [(set VR128:$dst, (v8i16 (OpNode (v8i16 VR128:$src))))]>,
                 Sched<[Sched]>;
  def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
                 (ins i128mem:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [(set VR128:$dst,
                       (v8i16 (OpNode (ld_frag addr:$src))))]>,
                 Sched<[Sched.Folded]>;
}

// PHMIN has the same profile as PSAD, thus we use the same scheduling
// model, although the naming is misleading.
let Predicates = [HasAVX] in
defm VPHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "vphminposuw",
                                         X86phminpos, load,
                                         WritePHMINPOS>, VEX, VEX_WIG;
defm PHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "phminposuw",
                                        X86phminpos, memop,
                                        WritePHMINPOS>;
/// SS48I_binop_rm - Simple SSE41 binary operator.
multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                          X86MemOperand x86memop, X86FoldableSchedWrite sched,
                          bit Is2Addr = 1> {
  let isCommutable = 1 in
  def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst),
                 (ins RC:$src1, RC:$src2),
                 !if(Is2Addr,
                     !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                     !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
                 Sched<[sched]>;
  def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst),
                 (ins RC:$src1, x86memop:$src2),
                 !if(Is2Addr,
                     !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                     !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                 [(set RC:$dst,
                       (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
                 Sched<[sched.Folded, sched.ReadAfterFold]>;
}

let Predicates = [HasAVX, NoVLX] in {
  defm VPMINSD : SS48I_binop_rm<0x39, "vpminsd", smin, v4i32, VR128,
                                load, i128mem, SchedWriteVecALU.XMM, 0>,
                                VEX_4V, VEX_WIG;
  defm VPMINUD : SS48I_binop_rm<0x3B, "vpminud", umin, v4i32, VR128,
                                load, i128mem, SchedWriteVecALU.XMM, 0>,
                                VEX_4V, VEX_WIG;
  defm VPMAXSD : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v4i32, VR128,
                                load, i128mem, SchedWriteVecALU.XMM, 0>,
                                VEX_4V, VEX_WIG;
  defm VPMAXUD : SS48I_binop_rm<0x3F, "vpmaxud", umax, v4i32, VR128,
                                load, i128mem, SchedWriteVecALU.XMM, 0>,
                                VEX_4V, VEX_WIG;
  defm VPMULDQ : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v2i64, VR128,
                                load, i128mem, SchedWriteVecIMul.XMM, 0>,
                                VEX_4V, VEX_WIG;
}
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  defm VPMINSB : SS48I_binop_rm<0x38, "vpminsb", smin, v16i8, VR128,
                                load, i128mem, SchedWriteVecALU.XMM, 0>,
                                VEX_4V, VEX_WIG;
  defm VPMINUW : SS48I_binop_rm<0x3A, "vpminuw", umin, v8i16, VR128,
                                load, i128mem, SchedWriteVecALU.XMM, 0>,
                                VEX_4V, VEX_WIG;
  defm VPMAXSB : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v16i8, VR128,
                                load, i128mem, SchedWriteVecALU.XMM, 0>,
                                VEX_4V, VEX_WIG;
  defm VPMAXUW : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v8i16, VR128,
                                load, i128mem, SchedWriteVecALU.XMM, 0>,
                                VEX_4V, VEX_WIG;
}

let Predicates = [HasAVX2, NoVLX] in {
  defm VPMINSDY : SS48I_binop_rm<0x39, "vpminsd", smin, v8i32, VR256,
                                 load, i256mem, SchedWriteVecALU.YMM, 0>,
                                 VEX_4V, VEX_L, VEX_WIG;
  defm VPMINUDY : SS48I_binop_rm<0x3B, "vpminud", umin, v8i32, VR256,
                                 load, i256mem, SchedWriteVecALU.YMM, 0>,
                                 VEX_4V, VEX_L, VEX_WIG;
  defm VPMAXSDY : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v8i32, VR256,
                                 load, i256mem, SchedWriteVecALU.YMM, 0>,
                                 VEX_4V, VEX_L, VEX_WIG;
  defm VPMAXUDY : SS48I_binop_rm<0x3F, "vpmaxud", umax, v8i32, VR256,
                                 load, i256mem, SchedWriteVecALU.YMM, 0>,
                                 VEX_4V, VEX_L, VEX_WIG;
  defm VPMULDQY : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v4i64, VR256,
                                 load, i256mem, SchedWriteVecIMul.YMM, 0>,
                                 VEX_4V, VEX_L, VEX_WIG;
}
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  defm VPMINSBY : SS48I_binop_rm<0x38, "vpminsb", smin, v32i8, VR256,
                                 load, i256mem, SchedWriteVecALU.YMM, 0>,
                                 VEX_4V, VEX_L, VEX_WIG;
  defm VPMINUWY : SS48I_binop_rm<0x3A, "vpminuw", umin, v16i16, VR256,
                                 load, i256mem, SchedWriteVecALU.YMM, 0>,
                                 VEX_4V, VEX_L, VEX_WIG;
  defm VPMAXSBY : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v32i8, VR256,
                                 load, i256mem, SchedWriteVecALU.YMM, 0>,
                                 VEX_4V, VEX_L, VEX_WIG;
  defm VPMAXUWY : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v16i16, VR256,
                                 load, i256mem, SchedWriteVecALU.YMM, 0>,
                                 VEX_4V, VEX_L, VEX_WIG;
}

let Constraints = "$src1 = $dst" in {
  defm PMINSB : SS48I_binop_rm<0x38, "pminsb", smin, v16i8, VR128,
                               memop, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMINSD : SS48I_binop_rm<0x39, "pminsd", smin, v4i32, VR128,
                               memop, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMINUD : SS48I_binop_rm<0x3B, "pminud", umin, v4i32, VR128,
                               memop, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMINUW : SS48I_binop_rm<0x3A, "pminuw", umin, v8i16, VR128,
                               memop, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMAXSB : SS48I_binop_rm<0x3C, "pmaxsb", smax, v16i8, VR128,
                               memop, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMAXSD : SS48I_binop_rm<0x3D, "pmaxsd", smax, v4i32, VR128,
                               memop, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMAXUD : SS48I_binop_rm<0x3F, "pmaxud", umax, v4i32, VR128,
                               memop, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMAXUW : SS48I_binop_rm<0x3E, "pmaxuw", umax, v8i16, VR128,
                               memop, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMULDQ : SS48I_binop_rm<0x28, "pmuldq", X86pmuldq, v2i64, VR128,
                               memop, i128mem, SchedWriteVecIMul.XMM, 1>;
}

let Predicates = [HasAVX, NoVLX] in
  defm VPMULLD : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128,
                                load, i128mem, SchedWritePMULLD.XMM, 0>,
                                VEX_4V, VEX_WIG;
let Predicates = [HasAVX] in
  defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128,
                                 load, i128mem, SchedWriteVecALU.XMM, 0>,
                                 VEX_4V, VEX_WIG;

let Predicates = [HasAVX2, NoVLX] in
  defm VPMULLDY : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256,
                                 load, i256mem, SchedWritePMULLD.YMM, 0>,
                                 VEX_4V, VEX_L, VEX_WIG;
let Predicates = [HasAVX2] in
  defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256,
                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;

let Constraints = "$src1 = $dst" in {
  defm PMULLD : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, VR128,
                               memop, i128mem, SchedWritePMULLD.XMM, 1>;
  defm PCMPEQQ : SS48I_binop_rm<0x29, "pcmpeqq", X86pcmpeq, v2i64, VR128,
                                memop, i128mem, SchedWriteVecALU.XMM, 1>;
}

/// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate
multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
                               Intrinsic IntId, RegisterClass RC, PatFrag memop_frag,
                               X86MemOperand x86memop, bit Is2Addr,
                               X86FoldableSchedWrite sched> {
  let isCommutable = 1 in
  def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
        (ins RC:$src1, RC:$src2, u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst, (IntId RC:$src1, RC:$src2, timm:$src3))]>,
        Sched<[sched]>;
  def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
        (ins RC:$src1, x86memop:$src2, u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst,
              (IntId RC:$src1, (memop_frag addr:$src2), timm:$src3))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;
}
/// SS41I_binop_rmi - SSE 4.1 binary operator with 8-bit immediate
multiclass SS41I_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                           X86MemOperand x86memop, bit Is2Addr,
                           X86FoldableSchedWrite sched> {
  let isCommutable = 1 in
  def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
        (ins RC:$src1, RC:$src2, u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>,
        Sched<[sched]>;
  def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
        (ins RC:$src1, x86memop:$src2, u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst,
              (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), timm:$src3)))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;
}

def BlendCommuteImm2 : SDNodeXForm<timm, [{
  uint8_t Imm = N->getZExtValue() & 0x03;
  return getI8Imm(Imm ^ 0x03, SDLoc(N));
}]>;

def BlendCommuteImm4 : SDNodeXForm<timm, [{
  uint8_t Imm = N->getZExtValue() & 0x0f;
  return getI8Imm(Imm ^ 0x0f, SDLoc(N));
}]>;

def BlendCommuteImm8 : SDNodeXForm<timm, [{
  uint8_t Imm = N->getZExtValue() & 0xff;
  return getI8Imm(Imm ^ 0xff, SDLoc(N));
}]>;

// Turn a 4-bit blendi immediate to 8-bit for use with pblendw.
def BlendScaleImm4 : SDNodeXForm<timm, [{
  uint8_t Imm = N->getZExtValue();
  uint8_t NewImm = 0;
  for (unsigned i = 0; i != 4; ++i) {
    if (Imm & (1 << i))
      NewImm |= 0x3 << (i * 2);
  }
  return getI8Imm(NewImm, SDLoc(N));
}]>;

// Turn a 2-bit blendi immediate to 8-bit for use with pblendw.
def BlendScaleImm2 : SDNodeXForm<timm, [{
  uint8_t Imm = N->getZExtValue();
  uint8_t NewImm = 0;
  for (unsigned i = 0; i != 2; ++i) {
    if (Imm & (1 << i))
      NewImm |= 0xf << (i * 4);
  }
  return getI8Imm(NewImm, SDLoc(N));
}]>;

// Turn a 2-bit blendi immediate to 4-bit for use with pblendd.
def BlendScaleImm2to4 : SDNodeXForm<timm, [{
  uint8_t Imm = N->getZExtValue();
  uint8_t NewImm = 0;
  for (unsigned i = 0; i != 2; ++i) {
    if (Imm & (1 << i))
      NewImm |= 0x3 << (i * 2);
  }
  return getI8Imm(NewImm, SDLoc(N));
}]>;

// Turn a 4-bit blendi immediate to 8-bit for use with pblendw and invert it.
def BlendScaleCommuteImm4 : SDNodeXForm<timm, [{
  uint8_t Imm = N->getZExtValue();
  uint8_t NewImm = 0;
  for (unsigned i = 0; i != 4; ++i) {
    if (Imm & (1 << i))
      NewImm |= 0x3 << (i * 2);
  }
  return getI8Imm(NewImm ^ 0xff, SDLoc(N));
}]>;

// Turn a 2-bit blendi immediate to 8-bit for use with pblendw and invert it.
def BlendScaleCommuteImm2 : SDNodeXForm<timm, [{
  uint8_t Imm = N->getZExtValue();
  uint8_t NewImm = 0;
  for (unsigned i = 0; i != 2; ++i) {
    if (Imm & (1 << i))
      NewImm |= 0xf << (i * 4);
  }
  return getI8Imm(NewImm ^ 0xff, SDLoc(N));
}]>;

// Turn a 2-bit blendi immediate to 4-bit for use with pblendd and invert it.
def BlendScaleCommuteImm2to4 : SDNodeXForm<timm, [{
  uint8_t Imm = N->getZExtValue();
  uint8_t NewImm = 0;
  for (unsigned i = 0; i != 2; ++i) {
    if (Imm & (1 << i))
      NewImm |= 0x3 << (i * 2);
  }
  return getI8Imm(NewImm ^ 0xf, SDLoc(N));
}]>;
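// Worked example of the immediate transforms above, for a v4i32 blend with
// imm = 0b0101 (take lanes 0 and 2 from the second source):
//   BlendCommuteImm4:       0b0101 -> 0b1010     (sources swapped)
//   BlendScaleImm4:         0b0101 -> 0b00110011 (each bit doubled, for pblendw)
//   BlendScaleCommuteImm4:  0b0101 -> 0b11001100 (doubled, then inverted)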
let Predicates = [HasAVX] in {
  let isCommutable = 0 in {
  defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
                                      VR128, load, i128mem, 0,
                                      SchedWriteMPSAD.XMM>, VEX_4V, VEX_WIG;
  }

let Uses = [MXCSR], mayRaiseFPException = 1 in {
  let ExeDomain = SSEPackedSingle in
  defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps,
                                   VR128, load, f128mem, 0,
                                   SchedWriteDPPS.XMM>, VEX_4V, VEX_WIG;
  let ExeDomain = SSEPackedDouble in
  defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd,
                                   VR128, load, f128mem, 0,
                                   SchedWriteDPPD.XMM>, VEX_4V, VEX_WIG;
  let ExeDomain = SSEPackedSingle in
  defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256,
                                    VR256, load, i256mem, 0,
                                    SchedWriteDPPS.YMM>, VEX_4V, VEX_L, VEX_WIG;
}
}

let Predicates = [HasAVX2] in {
  let isCommutable = 0 in {
  defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw,
                                       VR256, load, i256mem, 0,
                                       SchedWriteMPSAD.YMM>, VEX_4V, VEX_L, VEX_WIG;
  }
}

let Constraints = "$src1 = $dst" in {
  let isCommutable = 0 in {
  defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw,
                                     VR128, memop, i128mem, 1,
                                     SchedWriteMPSAD.XMM>;
  }

  let ExeDomain = SSEPackedSingle in
  defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps,
                                  VR128, memop, f128mem, 1,
                                  SchedWriteDPPS.XMM>, SIMD_EXC;
  let ExeDomain = SSEPackedDouble in
  defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd,
                                  VR128, memop, f128mem, 1,
                                  SchedWriteDPPD.XMM>, SIMD_EXC;
}

/// SS41I_blend_rmi - SSE 4.1 blend with 8-bit immediate
multiclass SS41I_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                           X86MemOperand x86memop, bit Is2Addr, Domain d,
                           X86FoldableSchedWrite sched, SDNodeXForm commuteXForm> {
let ExeDomain = d, Constraints = !if(Is2Addr, "$src1 = $dst", "") in {
  let isCommutable = 1 in
  def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
        (ins RC:$src1, RC:$src2, u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>,
        Sched<[sched]>;
  def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
        (ins RC:$src1, x86memop:$src2, u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst,
              (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), timm:$src3)))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;
}

  // Pattern to commute if load is in first source.
  def : Pat<(OpVT (OpNode (memop_frag addr:$src2), RC:$src1, timm:$src3)),
            (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2,
                                            (commuteXForm timm:$src3))>;
}
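// Why the commuted pattern above is legal: a blend computes
//   result[i] = imm[i] ? src2[i] : src1[i]
// so blend(src1, src2, imm) == blend(src2, src1, ~imm). commuteXForm
// produces that complemented (and, where needed, rescaled) immediate, which
// lets a load feeding the first source be folded into the rmi form with the
// operands exchanged.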
let Predicates = [HasAVX] in {
  defm VBLENDPS  : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v4f32,
                                   VR128, load, f128mem, 0, SSEPackedSingle,
                                   SchedWriteFBlend.XMM, BlendCommuteImm4>,
                                   VEX_4V, VEX_WIG;
  defm VBLENDPSY : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v8f32,
                                   VR256, load, f256mem, 0, SSEPackedSingle,
                                   SchedWriteFBlend.YMM, BlendCommuteImm8>,
                                   VEX_4V, VEX_L, VEX_WIG;
  defm VBLENDPD  : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v2f64,
                                   VR128, load, f128mem, 0, SSEPackedDouble,
                                   SchedWriteFBlend.XMM, BlendCommuteImm2>,
                                   VEX_4V, VEX_WIG;
  defm VBLENDPDY : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v4f64,
                                   VR256, load, f256mem, 0, SSEPackedDouble,
                                   SchedWriteFBlend.YMM, BlendCommuteImm4>,
                                   VEX_4V, VEX_L, VEX_WIG;
  defm VPBLENDW  : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v8i16,
                                   VR128, load, i128mem, 0, SSEPackedInt,
                                   SchedWriteBlend.XMM, BlendCommuteImm8>,
                                   VEX_4V, VEX_WIG;
}

let Predicates = [HasAVX2] in {
  defm VPBLENDWY : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v16i16,
                                   VR256, load, i256mem, 0, SSEPackedInt,
                                   SchedWriteBlend.YMM, BlendCommuteImm8>,
                                   VEX_4V, VEX_L, VEX_WIG;
}

// Emulate vXi32/vXi64 blends with vXf32/vXf64 or pblendw.
// ExecutionDomainFixPass will clean up domains later on.
let Predicates = [HasAVX1Only] in {
def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), timm:$src3),
          (VBLENDPDYrri VR256:$src1, VR256:$src2, timm:$src3)>;
def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), timm:$src3),
          (VBLENDPDYrmi VR256:$src1, addr:$src2, timm:$src3)>;
def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, timm:$src3),
          (VBLENDPDYrmi VR256:$src1, addr:$src2, (BlendCommuteImm4 timm:$src3))>;

// Use pblendw for 128-bit integer to keep it in the integer domain and prevent
// it from becoming movsd via commuting under optsize.
def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3),
          (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 timm:$src3))>;
def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), timm:$src3),
          (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 timm:$src3))>;
def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, timm:$src3),
          (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 timm:$src3))>;

def : Pat<(X86Blendi (v8i32 VR256:$src1), (v8i32 VR256:$src2), timm:$src3),
          (VBLENDPSYrri VR256:$src1, VR256:$src2, timm:$src3)>;
def : Pat<(X86Blendi VR256:$src1, (loadv8i32 addr:$src2), timm:$src3),
          (VBLENDPSYrmi VR256:$src1, addr:$src2, timm:$src3)>;
def : Pat<(X86Blendi (loadv8i32 addr:$src2), VR256:$src1, timm:$src3),
          (VBLENDPSYrmi VR256:$src1, addr:$src2, (BlendCommuteImm8 timm:$src3))>;

// Use pblendw for 128-bit integer to keep it in the integer domain and prevent
// it from becoming movss via commuting under optsize.
def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), timm:$src3),
          (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 timm:$src3))>;
def : Pat<(X86Blendi VR128:$src1, (loadv4i32 addr:$src2), timm:$src3),
          (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>;
def : Pat<(X86Blendi (loadv4i32 addr:$src2), VR128:$src1, timm:$src3),
          (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>;
}

defm BLENDPS : SS41I_blend_rmi<0x0C, "blendps", X86Blendi, v4f32,
                               VR128, memop, f128mem, 1, SSEPackedSingle,
                               SchedWriteFBlend.XMM, BlendCommuteImm4>;
defm BLENDPD : SS41I_blend_rmi<0x0D, "blendpd", X86Blendi, v2f64,
                               VR128, memop, f128mem, 1, SSEPackedDouble,
                               SchedWriteFBlend.XMM, BlendCommuteImm2>;
defm PBLENDW : SS41I_blend_rmi<0x0E, "pblendw", X86Blendi, v8i16,
                               VR128, memop, i128mem, 1, SSEPackedInt,
                               SchedWriteBlend.XMM, BlendCommuteImm8>;

let Predicates = [UseSSE41] in {
// Use pblendw for 128-bit integer to keep it in the integer domain and prevent
// it from becoming movss via commuting under optsize.
def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3),
          (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 timm:$src3))>;
def : Pat<(X86Blendi VR128:$src1, (memopv2i64 addr:$src2), timm:$src3),
          (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 timm:$src3))>;
def : Pat<(X86Blendi (memopv2i64 addr:$src2), VR128:$src1, timm:$src3),
          (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 timm:$src3))>;

def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), timm:$src3),
          (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 timm:$src3))>;
def : Pat<(X86Blendi VR128:$src1, (memopv4i32 addr:$src2), timm:$src3),
          (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>;
def : Pat<(X86Blendi (memopv4i32 addr:$src2), VR128:$src1, timm:$src3),
          (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>;
}

// For insertion into the zero index (low half) of a 256-bit vector, it is
// more efficient to generate a blend with immediate instead of an insert*128.
let Predicates = [HasAVX] in {
def : Pat<(insert_subvector (v4f64 VR256:$src1), (v2f64 VR128:$src2), (iPTR 0)),
          (VBLENDPDYrri VR256:$src1,
                        (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
                                       VR128:$src2, sub_xmm), 0x3)>;
def : Pat<(insert_subvector (v8f32 VR256:$src1), (v4f32 VR128:$src2), (iPTR 0)),
          (VBLENDPSYrri VR256:$src1,
                        (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
                                       VR128:$src2, sub_xmm), 0xf)>;

def : Pat<(insert_subvector (loadv4f64 addr:$src2), (v2f64 VR128:$src1), (iPTR 0)),
          (VBLENDPDYrmi (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
                                       VR128:$src1, sub_xmm), addr:$src2, 0xc)>;
def : Pat<(insert_subvector (loadv8f32 addr:$src2), (v4f32 VR128:$src1), (iPTR 0)),
          (VBLENDPSYrmi (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
}

/// SS41I_quaternary_avx - AVX SSE 4.1 with 4 operands
multiclass SS41I_quaternary_avx<bits<8> opc, string OpcodeStr, RegisterClass RC,
                                X86MemOperand x86memop, ValueType VT,
                                PatFrag mem_frag, SDNode OpNode,
                                X86FoldableSchedWrite sched> {
  def rr : Ii8Reg<opc, MRMSrcReg, (outs RC:$dst),
                  (ins RC:$src1, RC:$src2, RC:$src3),
                  !strconcat(OpcodeStr,
                    "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                  [(set RC:$dst, (VT (OpNode RC:$src3, RC:$src2, RC:$src1)))],
                  SSEPackedInt>, TAPD, VEX_4V,
                  Sched<[sched]>;

  def rm : Ii8Reg<opc, MRMSrcMem, (outs RC:$dst),
                  (ins RC:$src1, x86memop:$src2, RC:$src3),
                  !strconcat(OpcodeStr,
                    "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                  [(set RC:$dst,
                        (OpNode RC:$src3, (mem_frag addr:$src2),
                                RC:$src1))], SSEPackedInt>, TAPD, VEX_4V,
                  Sched<[sched.Folded, sched.ReadAfterFold,
                         // x86memop:$src2
                         ReadDefault, ReadDefault, ReadDefault, ReadDefault,
                         ReadDefault,
                         // RC:$src3
                         sched.ReadAfterFold]>;
}

let Predicates = [HasAVX] in {
let ExeDomain = SSEPackedDouble in {
defm VBLENDVPD  : SS41I_quaternary_avx<0x4B, "vblendvpd", VR128, f128mem,
                                       v2f64, loadv2f64, X86Blendv,
                                       SchedWriteFVarBlend.XMM>;
defm VBLENDVPDY : SS41I_quaternary_avx<0x4B, "vblendvpd", VR256, f256mem,
                                       v4f64, loadv4f64, X86Blendv,
                                       SchedWriteFVarBlend.YMM>, VEX_L;
} // ExeDomain = SSEPackedDouble
let ExeDomain = SSEPackedSingle in {
defm VBLENDVPS  : SS41I_quaternary_avx<0x4A, "vblendvps", VR128, f128mem,
                                       v4f32, loadv4f32, X86Blendv,
                                       SchedWriteFVarBlend.XMM>;
defm VBLENDVPSY : SS41I_quaternary_avx<0x4A, "vblendvps", VR256, f256mem,
                                       v8f32, loadv8f32, X86Blendv,
                                       SchedWriteFVarBlend.YMM>, VEX_L;
} // ExeDomain = SSEPackedSingle
defm VPBLENDVB : SS41I_quaternary_avx<0x4C, "vpblendvb", VR128, i128mem,
                                      v16i8, loadv16i8, X86Blendv,
                                      SchedWriteVarBlend.XMM>;
}

let Predicates = [HasAVX2] in {
defm VPBLENDVBY : SS41I_quaternary_avx<0x4C, "vpblendvb", VR256, i256mem,
                                       v32i8, loadv32i8, X86Blendv,
                                       SchedWriteVarBlend.YMM>, VEX_L;
}
let Predicates = [HasAVX] in {
  def : Pat<(v4i32 (X86Blendv (v4i32 VR128:$mask), (v4i32 VR128:$src1),
                              (v4i32 VR128:$src2))),
            (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
  def : Pat<(v2i64 (X86Blendv (v2i64 VR128:$mask), (v2i64 VR128:$src1),
                              (v2i64 VR128:$src2))),
            (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
  def : Pat<(v8i32 (X86Blendv (v8i32 VR256:$mask), (v8i32 VR256:$src1),
                              (v8i32 VR256:$src2))),
            (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
  def : Pat<(v4i64 (X86Blendv (v4i64 VR256:$mask), (v4i64 VR256:$src1),
                              (v4i64 VR256:$src2))),
            (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
}

// Prefer a movss or movsd over a blendps when optimizing for size. These were
// changed to use blends because blends have better throughput on Sandy Bridge
// and Haswell, but movs[s/d] are 1-2 byte shorter instructions.
let Predicates = [HasAVX, OptForSpeed] in {
  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
            (VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
            (VPBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;

  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
            (VBLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>;
  def : Pat<(v4f32 (X86Movss VR128:$src1, (loadv4f32 addr:$src2))),
            (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
  def : Pat<(v4f32 (X86Movss (loadv4f32 addr:$src2), VR128:$src1)),
            (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;

  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
            (VBLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
  def : Pat<(v2f64 (X86Movsd VR128:$src1, (loadv2f64 addr:$src2))),
            (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
  def : Pat<(v2f64 (X86Movsd (loadv2f64 addr:$src2), VR128:$src1)),
            (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>;

  // Move low f32 and clear high bits.
  def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (v4f32 (VBLENDPSrri (v4f32 (V_SET0)),
                     (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)),
                     (i8 1))), sub_xmm)>;
  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (v4i32 (VPBLENDWrri (v4i32 (V_SET0)),
                     (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)),
                     (i8 3))), sub_xmm)>;
}
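// Sketch of the zeroing idiom used above: blending against an all-zeros
// register computes result[i] = imm[i] ? src[i] : 0, so
//   (VBLENDPSrri (V_SET0), $src, 1)
// keeps only element 0 of $src and zeroes the rest, which is exactly
// (v4f32 (X86vzmovl $src)) without materializing a movss of a zeroed
// register.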
// Prefer a movss or movsd over a blendps when optimizing for size. These were
// changed to use blends because blends have better throughput on Sandy Bridge
// and Haswell, but movs[s/d] are 1-2 byte shorter instructions.
let Predicates = [UseSSE41, OptForSpeed] in {
  // With SSE41 we can use blends for these patterns.
  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
            (BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
            (PBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;

  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
            (BLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>;
  def : Pat<(v4f32 (X86Movss VR128:$src1, (memopv4f32 addr:$src2))),
            (BLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
  def : Pat<(v4f32 (X86Movss (memopv4f32 addr:$src2), VR128:$src1)),
            (BLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;

  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
            (BLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
  def : Pat<(v2f64 (X86Movsd VR128:$src1, (memopv2f64 addr:$src2))),
            (BLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
  def : Pat<(v2f64 (X86Movsd (memopv2f64 addr:$src2), VR128:$src1)),
            (BLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>;
}


/// SS41I_ternary - SSE 4.1 ternary operator
let Uses = [XMM0], Constraints = "$src1 = $dst" in {
  multiclass SS41I_ternary<bits<8> opc, string OpcodeStr, ValueType VT,
                           PatFrag mem_frag, X86MemOperand x86memop,
                           SDNode OpNode, X86FoldableSchedWrite sched> {
    def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
                    (ins VR128:$src1, VR128:$src2),
                    !strconcat(OpcodeStr,
                               "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
                    [(set VR128:$dst,
                          (VT (OpNode XMM0, VR128:$src2, VR128:$src1)))]>,
                    Sched<[sched]>;

    def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
                    (ins VR128:$src1, x86memop:$src2),
                    !strconcat(OpcodeStr,
                               "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
                    [(set VR128:$dst,
                          (OpNode XMM0, (mem_frag addr:$src2), VR128:$src1))]>,
                    Sched<[sched.Folded, sched.ReadAfterFold]>;
  }
}

let ExeDomain = SSEPackedDouble in
defm BLENDVPD : SS41I_ternary<0x15, "blendvpd", v2f64, memopv2f64, f128mem,
                              X86Blendv, SchedWriteFVarBlend.XMM>;
let ExeDomain = SSEPackedSingle in
defm BLENDVPS : SS41I_ternary<0x14, "blendvps", v4f32, memopv4f32, f128mem,
                              X86Blendv, SchedWriteFVarBlend.XMM>;
defm PBLENDVB : SS41I_ternary<0x10, "pblendvb", v16i8, memopv16i8, i128mem,
                              X86Blendv, SchedWriteVarBlend.XMM>;

// Aliases with the implicit xmm0 argument
def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}",
                (BLENDVPDrr0 VR128:$dst, VR128:$src2), 0>;
def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}",
                (BLENDVPDrm0 VR128:$dst, f128mem:$src2), 0>;
def : InstAlias<"blendvps\t{$src2, $dst|$dst, $src2}",
                (BLENDVPSrr0 VR128:$dst, VR128:$src2), 0>;
def : InstAlias<"blendvps\t{$src2, $dst|$dst, $src2}",
                (BLENDVPSrm0 VR128:$dst, f128mem:$src2), 0>;
def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}",
                (PBLENDVBrr0 VR128:$dst, VR128:$src2), 0>;
def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}",
                (PBLENDVBrm0 VR128:$dst, i128mem:$src2), 0>;
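// Example of the aliases above in AT&T syntax: both spellings assemble to
// the same encoding, since xmm0 is a fixed implicit operand of the non-VEX
// blendv forms:
//   blendvps %xmm0, %xmm2, %xmm1    (explicit mask)
//   blendvps %xmm2, %xmm1           (implicit %xmm0 mask, via the alias)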
$dst|$dst, $src}", []>, 6483 Sched<[SchedWriteVecMoveLSNT.XMM.RM]>, VEX, VEX_WIG; 6484let Predicates = [HasAVX2, NoVLX] in 6485def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), 6486 "vmovntdqa\t{$src, $dst|$dst, $src}", []>, 6487 Sched<[SchedWriteVecMoveLSNT.YMM.RM]>, VEX, VEX_L, VEX_WIG; 6488def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), 6489 "movntdqa\t{$src, $dst|$dst, $src}", []>, 6490 Sched<[SchedWriteVecMoveLSNT.XMM.RM]>; 6491 6492let Predicates = [HasAVX2, NoVLX] in { 6493 def : Pat<(v8f32 (alignednontemporalload addr:$src)), 6494 (VMOVNTDQAYrm addr:$src)>; 6495 def : Pat<(v4f64 (alignednontemporalload addr:$src)), 6496 (VMOVNTDQAYrm addr:$src)>; 6497 def : Pat<(v4i64 (alignednontemporalload addr:$src)), 6498 (VMOVNTDQAYrm addr:$src)>; 6499 def : Pat<(v8i32 (alignednontemporalload addr:$src)), 6500 (VMOVNTDQAYrm addr:$src)>; 6501 def : Pat<(v16i16 (alignednontemporalload addr:$src)), 6502 (VMOVNTDQAYrm addr:$src)>; 6503 def : Pat<(v16f16 (alignednontemporalload addr:$src)), 6504 (VMOVNTDQAYrm addr:$src)>; 6505 def : Pat<(v32i8 (alignednontemporalload addr:$src)), 6506 (VMOVNTDQAYrm addr:$src)>; 6507} 6508 6509let Predicates = [HasAVX, NoVLX] in { 6510 def : Pat<(v4f32 (alignednontemporalload addr:$src)), 6511 (VMOVNTDQArm addr:$src)>; 6512 def : Pat<(v2f64 (alignednontemporalload addr:$src)), 6513 (VMOVNTDQArm addr:$src)>; 6514 def : Pat<(v2i64 (alignednontemporalload addr:$src)), 6515 (VMOVNTDQArm addr:$src)>; 6516 def : Pat<(v4i32 (alignednontemporalload addr:$src)), 6517 (VMOVNTDQArm addr:$src)>; 6518 def : Pat<(v8i16 (alignednontemporalload addr:$src)), 6519 (VMOVNTDQArm addr:$src)>; 6520 def : Pat<(v8f16 (alignednontemporalload addr:$src)), 6521 (VMOVNTDQArm addr:$src)>; 6522 def : Pat<(v16i8 (alignednontemporalload addr:$src)), 6523 (VMOVNTDQArm addr:$src)>; 6524} 6525 6526let Predicates = [UseSSE41] in { 6527 def : Pat<(v4f32 (alignednontemporalload addr:$src)), 6528 (MOVNTDQArm addr:$src)>; 6529 def : Pat<(v2f64 (alignednontemporalload addr:$src)), 6530 (MOVNTDQArm addr:$src)>; 6531 def : Pat<(v2i64 (alignednontemporalload addr:$src)), 6532 (MOVNTDQArm addr:$src)>; 6533 def : Pat<(v4i32 (alignednontemporalload addr:$src)), 6534 (MOVNTDQArm addr:$src)>; 6535 def : Pat<(v8i16 (alignednontemporalload addr:$src)), 6536 (MOVNTDQArm addr:$src)>; 6537 def : Pat<(v8f16 (alignednontemporalload addr:$src)), 6538 (MOVNTDQArm addr:$src)>; 6539 def : Pat<(v16i8 (alignednontemporalload addr:$src)), 6540 (MOVNTDQArm addr:$src)>; 6541} 6542 6543} // AddedComplexity 6544 6545//===----------------------------------------------------------------------===// 6546// SSE4.2 - Compare Instructions 6547//===----------------------------------------------------------------------===// 6548 6549/// SS42I_binop_rm - Simple SSE 4.2 binary operator 6550multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, 6551 ValueType OpVT, RegisterClass RC, PatFrag memop_frag, 6552 X86MemOperand x86memop, X86FoldableSchedWrite sched, 6553 bit Is2Addr = 1> { 6554 def rr : SS428I<opc, MRMSrcReg, (outs RC:$dst), 6555 (ins RC:$src1, RC:$src2), 6556 !if(Is2Addr, 6557 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 6558 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 6559 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>, 6560 Sched<[sched]>; 6561 def rm : SS428I<opc, MRMSrcMem, (outs RC:$dst), 6562 (ins RC:$src1, x86memop:$src2), 6563 !if(Is2Addr, 6564 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 6565 
//===----------------------------------------------------------------------===//
// SSE4.2 - Compare Instructions
//===----------------------------------------------------------------------===//

/// SS42I_binop_rm - Simple SSE 4.2 binary operator
multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                          X86MemOperand x86memop, X86FoldableSchedWrite sched,
                          bit Is2Addr = 1> {
  def rr : SS428I<opc, MRMSrcReg, (outs RC:$dst),
                  (ins RC:$src1, RC:$src2),
                  !if(Is2Addr,
                      !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                      !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                  [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
                  Sched<[sched]>;
  def rm : SS428I<opc, MRMSrcMem, (outs RC:$dst),
                  (ins RC:$src1, x86memop:$src2),
                  !if(Is2Addr,
                      !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                      !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                  [(set RC:$dst,
                        (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
                  Sched<[sched.Folded, sched.ReadAfterFold]>;
}

let Predicates = [HasAVX] in
  defm VPCMPGTQ : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128,
                                 load, i128mem, SchedWriteVecALU.XMM, 0>,
                                 VEX_4V, VEX_WIG;

let Predicates = [HasAVX2] in
  defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256,
                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;

let Constraints = "$src1 = $dst" in
  defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128,
                                memop, i128mem, SchedWriteVecALU.XMM>;

//===----------------------------------------------------------------------===//
// SSE4.2 - String/text Processing Instructions
//===----------------------------------------------------------------------===//

multiclass pcmpistrm_SS42AI<string asm> {
  def rr : SS42AI<0x62, MRMSrcReg, (outs),
                  (ins VR128:$src1, VR128:$src2, u8imm:$src3),
                  !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
                  []>, Sched<[WritePCmpIStrM]>;
  let mayLoad = 1 in
  def rm : SS42AI<0x62, MRMSrcMem, (outs),
                  (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
                  !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
                  []>, Sched<[WritePCmpIStrM.Folded, WritePCmpIStrM.ReadAfterFold]>;
}

let Defs = [XMM0, EFLAGS], hasSideEffects = 0 in {
  let Predicates = [HasAVX] in
  defm VPCMPISTRM : pcmpistrm_SS42AI<"vpcmpistrm">, VEX, VEX_WIG;
  defm PCMPISTRM : pcmpistrm_SS42AI<"pcmpistrm">;
}

multiclass SS42AI_pcmpestrm<string asm> {
  def rr : SS42AI<0x60, MRMSrcReg, (outs),
                  (ins VR128:$src1, VR128:$src3, u8imm:$src5),
                  !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
                  []>, Sched<[WritePCmpEStrM]>;
  let mayLoad = 1 in
  def rm : SS42AI<0x60, MRMSrcMem, (outs),
                  (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
                  !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
                  []>, Sched<[WritePCmpEStrM.Folded, WritePCmpEStrM.ReadAfterFold]>;
}

let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
  let Predicates = [HasAVX] in
  defm VPCMPESTRM : SS42AI_pcmpestrm<"vpcmpestrm">, VEX, VEX_WIG;
  defm PCMPESTRM : SS42AI_pcmpestrm<"pcmpestrm">;
}

multiclass SS42AI_pcmpistri<string asm> {
  def rr : SS42AI<0x63, MRMSrcReg, (outs),
                  (ins VR128:$src1, VR128:$src2, u8imm:$src3),
                  !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
                  []>, Sched<[WritePCmpIStrI]>;
  let mayLoad = 1 in
  def rm : SS42AI<0x63, MRMSrcMem, (outs),
                  (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
                  !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
                  []>, Sched<[WritePCmpIStrI.Folded, WritePCmpIStrI.ReadAfterFold]>;
}

let Defs = [ECX, EFLAGS], hasSideEffects = 0 in {
  let Predicates = [HasAVX] in
  defm VPCMPISTRI : SS42AI_pcmpistri<"vpcmpistri">, VEX, VEX_WIG;
  defm PCMPISTRI : SS42AI_pcmpistri<"pcmpistri">;
}
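// Reference for the control byte consumed by the string-compare family
// (Intel SDM encoding): imm[1:0] selects the element format (00 unsigned
// bytes, 01 unsigned words, 10 signed bytes, 11 signed words), imm[3:2] the
// aggregation operation (equal-any, ranges, equal-each, equal-ordered),
// imm[5:4] the polarity, and imm[6] least- vs most-significant index for the
// pcmp*stri forms, whose index result is returned in ECX (hence
// Defs = [ECX, EFLAGS] above).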
VR128:$src1, i128mem:$src3, u8imm:$src5),
              !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
              []>, Sched<[WritePCmpEStrI.Folded, WritePCmpEStrI.ReadAfterFold]>;
}

let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
  let Predicates = [HasAVX] in
  defm VPCMPESTRI : SS42AI_pcmpestri<"vpcmpestri">, VEX, VEX_WIG;
  defm PCMPESTRI : SS42AI_pcmpestri<"pcmpestri">;
}

//===----------------------------------------------------------------------===//
// SSE4.2 - CRC Instructions
//===----------------------------------------------------------------------===//

// No CRC instructions have AVX equivalents

// CRC intrinsic instructions.
// These instructions come only in r and m forms; the only difference between
// the variants is the size of r and m.
class SS42I_crc32r<bits<8> opc, string asm, RegisterClass RCOut,
                   RegisterClass RCIn, SDPatternOperator Int> :
  CRC32I<opc, MRMSrcReg, (outs RCOut:$dst), (ins RCOut:$src1, RCIn:$src2),
         !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
         [(set RCOut:$dst, (Int RCOut:$src1, RCIn:$src2))]>,
  Sched<[WriteCRC32]>;

class SS42I_crc32m<bits<8> opc, string asm, RegisterClass RCOut,
                   X86MemOperand x86memop, SDPatternOperator Int> :
  CRC32I<opc, MRMSrcMem, (outs RCOut:$dst), (ins RCOut:$src1, x86memop:$src2),
         !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
         [(set RCOut:$dst, (Int RCOut:$src1, (load addr:$src2)))]>,
  Sched<[WriteCRC32.Folded, WriteCRC32.ReadAfterFold]>;

let Constraints = "$src1 = $dst" in {
  def CRC32r32m8  : SS42I_crc32m<0xF0, "crc32{b}", GR32, i8mem,
                                 int_x86_sse42_crc32_32_8>;
  def CRC32r32r8  : SS42I_crc32r<0xF0, "crc32{b}", GR32, GR8,
                                 int_x86_sse42_crc32_32_8>;
  def CRC32r32m16 : SS42I_crc32m<0xF1, "crc32{w}", GR32, i16mem,
                                 int_x86_sse42_crc32_32_16>, OpSize16;
  def CRC32r32r16 : SS42I_crc32r<0xF1, "crc32{w}", GR32, GR16,
                                 int_x86_sse42_crc32_32_16>, OpSize16;
  def CRC32r32m32 : SS42I_crc32m<0xF1, "crc32{l}", GR32, i32mem,
                                 int_x86_sse42_crc32_32_32>, OpSize32;
  def CRC32r32r32 : SS42I_crc32r<0xF1, "crc32{l}", GR32, GR32,
                                 int_x86_sse42_crc32_32_32>, OpSize32;
  def CRC32r64m64 : SS42I_crc32m<0xF1, "crc32{q}", GR64, i64mem,
                                 int_x86_sse42_crc32_64_64>, REX_W;
  def CRC32r64r64 : SS42I_crc32r<0xF1, "crc32{q}", GR64, GR64,
                                 int_x86_sse42_crc32_64_64>, REX_W;
  let hasSideEffects = 0 in {
    let mayLoad = 1 in
    def CRC32r64m8 : SS42I_crc32m<0xF0, "crc32{b}", GR64, i8mem,
                                  null_frag>, REX_W;
    def CRC32r64r8 : SS42I_crc32r<0xF0, "crc32{b}", GR64, GR8,
                                  null_frag>, REX_W;
  }
}

//===----------------------------------------------------------------------===//
// SHA-NI Instructions
//===----------------------------------------------------------------------===//

// FIXME: Is there a better scheduler class for SHA than WriteVecIMul?
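// A note on the two-operand SHA helpers below: when UsesXMM0 is set (as for
// sha256rnds2), the instruction implicitly reads XMM0 as an extra source, so
// the asm string prints %xmm0 explicitly and the selection pattern passes
// XMM0 to the intrinsic. For example, "sha256rnds2 %xmm0, %xmm1, %xmm2" takes
// its additional round input from XMM0.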
6715multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId, 6716 X86FoldableSchedWrite sched, bit UsesXMM0 = 0> { 6717 def rr : I<Opc, MRMSrcReg, (outs VR128:$dst), 6718 (ins VR128:$src1, VR128:$src2), 6719 !if(UsesXMM0, 6720 !strconcat(OpcodeStr, "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"), 6721 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")), 6722 [!if(UsesXMM0, 6723 (set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0)), 6724 (set VR128:$dst, (IntId VR128:$src1, VR128:$src2)))]>, 6725 T8PS, Sched<[sched]>; 6726 6727 def rm : I<Opc, MRMSrcMem, (outs VR128:$dst), 6728 (ins VR128:$src1, i128mem:$src2), 6729 !if(UsesXMM0, 6730 !strconcat(OpcodeStr, "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"), 6731 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")), 6732 [!if(UsesXMM0, 6733 (set VR128:$dst, (IntId VR128:$src1, 6734 (memop addr:$src2), XMM0)), 6735 (set VR128:$dst, (IntId VR128:$src1, 6736 (memop addr:$src2))))]>, T8PS, 6737 Sched<[sched.Folded, sched.ReadAfterFold]>; 6738} 6739 6740let Constraints = "$src1 = $dst", Predicates = [HasSHA] in { 6741 def SHA1RNDS4rri : Ii8<0xCC, MRMSrcReg, (outs VR128:$dst), 6742 (ins VR128:$src1, VR128:$src2, u8imm:$src3), 6743 "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}", 6744 [(set VR128:$dst, 6745 (int_x86_sha1rnds4 VR128:$src1, VR128:$src2, 6746 (i8 timm:$src3)))]>, TAPS, 6747 Sched<[SchedWriteVecIMul.XMM]>; 6748 def SHA1RNDS4rmi : Ii8<0xCC, MRMSrcMem, (outs VR128:$dst), 6749 (ins VR128:$src1, i128mem:$src2, u8imm:$src3), 6750 "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}", 6751 [(set VR128:$dst, 6752 (int_x86_sha1rnds4 VR128:$src1, 6753 (memop addr:$src2), 6754 (i8 timm:$src3)))]>, TAPS, 6755 Sched<[SchedWriteVecIMul.XMM.Folded, 6756 SchedWriteVecIMul.XMM.ReadAfterFold]>; 6757 6758 defm SHA1NEXTE : SHAI_binop<0xC8, "sha1nexte", int_x86_sha1nexte, 6759 SchedWriteVecIMul.XMM>; 6760 defm SHA1MSG1 : SHAI_binop<0xC9, "sha1msg1", int_x86_sha1msg1, 6761 SchedWriteVecIMul.XMM>; 6762 defm SHA1MSG2 : SHAI_binop<0xCA, "sha1msg2", int_x86_sha1msg2, 6763 SchedWriteVecIMul.XMM>; 6764 6765 let Uses=[XMM0] in 6766 defm SHA256RNDS2 : SHAI_binop<0xCB, "sha256rnds2", int_x86_sha256rnds2, 6767 SchedWriteVecIMul.XMM, 1>; 6768 6769 defm SHA256MSG1 : SHAI_binop<0xCC, "sha256msg1", int_x86_sha256msg1, 6770 SchedWriteVecIMul.XMM>; 6771 defm SHA256MSG2 : SHAI_binop<0xCD, "sha256msg2", int_x86_sha256msg2, 6772 SchedWriteVecIMul.XMM>; 6773} 6774 6775// Aliases with explicit %xmm0 6776def : InstAlias<"sha256rnds2\t{$src2, $dst|$dst, $src2}", 6777 (SHA256RNDS2rr VR128:$dst, VR128:$src2), 0>; 6778def : InstAlias<"sha256rnds2\t{$src2, $dst|$dst, $src2}", 6779 (SHA256RNDS2rm VR128:$dst, i128mem:$src2), 0>; 6780 6781//===----------------------------------------------------------------------===// 6782// AES-NI Instructions 6783//===----------------------------------------------------------------------===// 6784 6785multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr, 6786 Intrinsic IntId, PatFrag ld_frag, 6787 bit Is2Addr = 0, RegisterClass RC = VR128, 6788 X86MemOperand MemOp = i128mem> { 6789 let AsmString = OpcodeStr# 6790 !if(Is2Addr, "\t{$src2, $dst|$dst, $src2}", 6791 "\t{$src2, $src1, $dst|$dst, $src1, $src2}") in { 6792 def rr : AES8I<opc, MRMSrcReg, (outs RC:$dst), 6793 (ins RC:$src1, RC:$src2), "", 6794 [(set RC:$dst, (IntId RC:$src1, RC:$src2))]>, 6795 Sched<[WriteAESDecEnc]>; 6796 def rm : AES8I<opc, MRMSrcMem, (outs RC:$dst), 6797 (ins RC:$src1, MemOp:$src2), "", 6798 [(set RC:$dst, (IntId RC:$src1, (ld_frag addr:$src2)))]>, 6799 
Sched<[WriteAESDecEnc.Folded, WriteAESDecEnc.ReadAfterFold]>; 6800 } 6801} 6802 6803// Perform One Round of an AES Encryption/Decryption Flow 6804let Predicates = [HasAVX, NoVLX_Or_NoVAES, HasAES] in { 6805 defm VAESENC : AESI_binop_rm_int<0xDC, "vaesenc", 6806 int_x86_aesni_aesenc, load>, VEX_4V, VEX_WIG; 6807 defm VAESENCLAST : AESI_binop_rm_int<0xDD, "vaesenclast", 6808 int_x86_aesni_aesenclast, load>, VEX_4V, VEX_WIG; 6809 defm VAESDEC : AESI_binop_rm_int<0xDE, "vaesdec", 6810 int_x86_aesni_aesdec, load>, VEX_4V, VEX_WIG; 6811 defm VAESDECLAST : AESI_binop_rm_int<0xDF, "vaesdeclast", 6812 int_x86_aesni_aesdeclast, load>, VEX_4V, VEX_WIG; 6813} 6814 6815let Predicates = [NoVLX, HasVAES] in { 6816 defm VAESENCY : AESI_binop_rm_int<0xDC, "vaesenc", 6817 int_x86_aesni_aesenc_256, load, 0, VR256, 6818 i256mem>, VEX_4V, VEX_L, VEX_WIG; 6819 defm VAESENCLASTY : AESI_binop_rm_int<0xDD, "vaesenclast", 6820 int_x86_aesni_aesenclast_256, load, 0, VR256, 6821 i256mem>, VEX_4V, VEX_L, VEX_WIG; 6822 defm VAESDECY : AESI_binop_rm_int<0xDE, "vaesdec", 6823 int_x86_aesni_aesdec_256, load, 0, VR256, 6824 i256mem>, VEX_4V, VEX_L, VEX_WIG; 6825 defm VAESDECLASTY : AESI_binop_rm_int<0xDF, "vaesdeclast", 6826 int_x86_aesni_aesdeclast_256, load, 0, VR256, 6827 i256mem>, VEX_4V, VEX_L, VEX_WIG; 6828} 6829 6830let Constraints = "$src1 = $dst" in { 6831 defm AESENC : AESI_binop_rm_int<0xDC, "aesenc", 6832 int_x86_aesni_aesenc, memop, 1>; 6833 defm AESENCLAST : AESI_binop_rm_int<0xDD, "aesenclast", 6834 int_x86_aesni_aesenclast, memop, 1>; 6835 defm AESDEC : AESI_binop_rm_int<0xDE, "aesdec", 6836 int_x86_aesni_aesdec, memop, 1>; 6837 defm AESDECLAST : AESI_binop_rm_int<0xDF, "aesdeclast", 6838 int_x86_aesni_aesdeclast, memop, 1>; 6839} 6840 6841// Perform the AES InvMixColumn Transformation 6842let Predicates = [HasAVX, HasAES] in { 6843 def VAESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst), 6844 (ins VR128:$src1), 6845 "vaesimc\t{$src1, $dst|$dst, $src1}", 6846 [(set VR128:$dst, 6847 (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>, 6848 VEX, VEX_WIG; 6849 def VAESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst), 6850 (ins i128mem:$src1), 6851 "vaesimc\t{$src1, $dst|$dst, $src1}", 6852 [(set VR128:$dst, (int_x86_aesni_aesimc (load addr:$src1)))]>, 6853 Sched<[WriteAESIMC.Folded]>, VEX, VEX_WIG; 6854} 6855def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst), 6856 (ins VR128:$src1), 6857 "aesimc\t{$src1, $dst|$dst, $src1}", 6858 [(set VR128:$dst, 6859 (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>; 6860def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst), 6861 (ins i128mem:$src1), 6862 "aesimc\t{$src1, $dst|$dst, $src1}", 6863 [(set VR128:$dst, (int_x86_aesni_aesimc (memop addr:$src1)))]>, 6864 Sched<[WriteAESIMC.Folded]>; 6865 6866// AES Round Key Generation Assist 6867let Predicates = [HasAVX, HasAES] in { 6868 def VAESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst), 6869 (ins VR128:$src1, u8imm:$src2), 6870 "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", 6871 [(set VR128:$dst, 6872 (int_x86_aesni_aeskeygenassist VR128:$src1, timm:$src2))]>, 6873 Sched<[WriteAESKeyGen]>, VEX, VEX_WIG; 6874 def VAESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst), 6875 (ins i128mem:$src1, u8imm:$src2), 6876 "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", 6877 [(set VR128:$dst, 6878 (int_x86_aesni_aeskeygenassist (load addr:$src1), timm:$src2))]>, 6879 Sched<[WriteAESKeyGen.Folded]>, VEX, VEX_WIG; 6880} 6881def AESKEYGENASSIST128rr : AESAI<0xDF, 
MRMSrcReg, (outs VR128:$dst), 6882 (ins VR128:$src1, u8imm:$src2), 6883 "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", 6884 [(set VR128:$dst, 6885 (int_x86_aesni_aeskeygenassist VR128:$src1, timm:$src2))]>, 6886 Sched<[WriteAESKeyGen]>; 6887def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst), 6888 (ins i128mem:$src1, u8imm:$src2), 6889 "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", 6890 [(set VR128:$dst, 6891 (int_x86_aesni_aeskeygenassist (memop addr:$src1), timm:$src2))]>, 6892 Sched<[WriteAESKeyGen.Folded]>; 6893 6894//===----------------------------------------------------------------------===// 6895// PCLMUL Instructions 6896//===----------------------------------------------------------------------===// 6897 6898// Immediate transform to help with commuting. 6899def PCLMULCommuteImm : SDNodeXForm<timm, [{ 6900 uint8_t Imm = N->getZExtValue(); 6901 return getI8Imm((uint8_t)((Imm >> 4) | (Imm << 4)), SDLoc(N)); 6902}]>; 6903 6904// SSE carry-less Multiplication instructions 6905let Predicates = [NoAVX, HasPCLMUL] in { 6906 let Constraints = "$src1 = $dst" in { 6907 let isCommutable = 1 in 6908 def PCLMULQDQrr : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst), 6909 (ins VR128:$src1, VR128:$src2, u8imm:$src3), 6910 "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}", 6911 [(set VR128:$dst, 6912 (int_x86_pclmulqdq VR128:$src1, VR128:$src2, timm:$src3))]>, 6913 Sched<[WriteCLMul]>; 6914 6915 def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst), 6916 (ins VR128:$src1, i128mem:$src2, u8imm:$src3), 6917 "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}", 6918 [(set VR128:$dst, 6919 (int_x86_pclmulqdq VR128:$src1, (memop addr:$src2), 6920 timm:$src3))]>, 6921 Sched<[WriteCLMul.Folded, WriteCLMul.ReadAfterFold]>; 6922 } // Constraints = "$src1 = $dst" 6923 6924 def : Pat<(int_x86_pclmulqdq (memop addr:$src2), VR128:$src1, 6925 (i8 timm:$src3)), 6926 (PCLMULQDQrm VR128:$src1, addr:$src2, 6927 (PCLMULCommuteImm timm:$src3))>; 6928} // Predicates = [NoAVX, HasPCLMUL] 6929 6930// SSE aliases 6931foreach HI = ["hq","lq"] in 6932foreach LO = ["hq","lq"] in { 6933 def : InstAlias<"pclmul" # HI # LO # "dq\t{$src, $dst|$dst, $src}", 6934 (PCLMULQDQrr VR128:$dst, VR128:$src, 6935 !add(!shl(!eq(LO,"hq"),4),!eq(HI,"hq"))), 0>; 6936 def : InstAlias<"pclmul" # HI # LO # "dq\t{$src, $dst|$dst, $src}", 6937 (PCLMULQDQrm VR128:$dst, i128mem:$src, 6938 !add(!shl(!eq(LO,"hq"),4),!eq(HI,"hq"))), 0>; 6939} 6940 6941// AVX carry-less Multiplication instructions 6942multiclass vpclmulqdq<RegisterClass RC, X86MemOperand MemOp, 6943 PatFrag LdFrag, Intrinsic IntId> { 6944 let isCommutable = 1 in 6945 def rr : PCLMULIi8<0x44, MRMSrcReg, (outs RC:$dst), 6946 (ins RC:$src1, RC:$src2, u8imm:$src3), 6947 "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", 6948 [(set RC:$dst, 6949 (IntId RC:$src1, RC:$src2, timm:$src3))]>, 6950 Sched<[WriteCLMul]>; 6951 6952 def rm : PCLMULIi8<0x44, MRMSrcMem, (outs RC:$dst), 6953 (ins RC:$src1, MemOp:$src2, u8imm:$src3), 6954 "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", 6955 [(set RC:$dst, 6956 (IntId RC:$src1, (LdFrag addr:$src2), timm:$src3))]>, 6957 Sched<[WriteCLMul.Folded, WriteCLMul.ReadAfterFold]>; 6958 6959 // We can commute a load in the first operand by swapping the sources and 6960 // rotating the immediate. 
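  // (Rough sketch of the encoding: imm[0] selects the quadword of the first
  // source and imm[4] selects the quadword of the second, so swapping the two
  // nibbles, e.g. 0x10 -> 0x01, selects the same quadwords once the operands
  // are exchanged.)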
6961 def : Pat<(IntId (LdFrag addr:$src2), RC:$src1, (i8 timm:$src3)), 6962 (!cast<Instruction>(NAME#"rm") RC:$src1, addr:$src2, 6963 (PCLMULCommuteImm timm:$src3))>; 6964} 6965 6966let Predicates = [HasAVX, NoVLX_Or_NoVPCLMULQDQ, HasPCLMUL] in 6967defm VPCLMULQDQ : vpclmulqdq<VR128, i128mem, load, 6968 int_x86_pclmulqdq>, VEX_4V, VEX_WIG; 6969 6970let Predicates = [NoVLX, HasVPCLMULQDQ] in 6971defm VPCLMULQDQY : vpclmulqdq<VR256, i256mem, load, 6972 int_x86_pclmulqdq_256>, VEX_4V, VEX_L, VEX_WIG; 6973 6974multiclass vpclmulqdq_aliases_impl<string InstStr, RegisterClass RC, 6975 X86MemOperand MemOp, string Hi, string Lo> { 6976 def : InstAlias<"vpclmul"#Hi#Lo#"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}", 6977 (!cast<Instruction>(InstStr # "rr") RC:$dst, RC:$src1, RC:$src2, 6978 !add(!shl(!eq(Lo,"hq"),4),!eq(Hi,"hq"))), 0>; 6979 def : InstAlias<"vpclmul"#Hi#Lo#"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}", 6980 (!cast<Instruction>(InstStr # "rm") RC:$dst, RC:$src1, MemOp:$src2, 6981 !add(!shl(!eq(Lo,"hq"),4),!eq(Hi,"hq"))), 0>; 6982} 6983 6984multiclass vpclmulqdq_aliases<string InstStr, RegisterClass RC, 6985 X86MemOperand MemOp> { 6986 defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "hq", "hq">; 6987 defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "hq", "lq">; 6988 defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "lq", "hq">; 6989 defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "lq", "lq">; 6990} 6991 6992// AVX aliases 6993defm : vpclmulqdq_aliases<"VPCLMULQDQ", VR128, i128mem>; 6994defm : vpclmulqdq_aliases<"VPCLMULQDQY", VR256, i256mem>; 6995 6996//===----------------------------------------------------------------------===// 6997// SSE4A Instructions 6998//===----------------------------------------------------------------------===// 6999 7000let Predicates = [HasSSE4A] in { 7001 7002let ExeDomain = SSEPackedInt in { 7003let Constraints = "$src = $dst" in { 7004def EXTRQI : Ii8<0x78, MRMXr, (outs VR128:$dst), 7005 (ins VR128:$src, u8imm:$len, u8imm:$idx), 7006 "extrq\t{$idx, $len, $src|$src, $len, $idx}", 7007 [(set VR128:$dst, (X86extrqi VR128:$src, timm:$len, 7008 timm:$idx))]>, 7009 PD, Sched<[SchedWriteVecALU.XMM]>; 7010def EXTRQ : I<0x79, MRMSrcReg, (outs VR128:$dst), 7011 (ins VR128:$src, VR128:$mask), 7012 "extrq\t{$mask, $src|$src, $mask}", 7013 [(set VR128:$dst, (int_x86_sse4a_extrq VR128:$src, 7014 VR128:$mask))]>, 7015 PD, Sched<[SchedWriteVecALU.XMM]>; 7016 7017def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst), 7018 (ins VR128:$src, VR128:$src2, u8imm:$len, u8imm:$idx), 7019 "insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}", 7020 [(set VR128:$dst, (X86insertqi VR128:$src, VR128:$src2, 7021 timm:$len, timm:$idx))]>, 7022 XD, Sched<[SchedWriteVecALU.XMM]>; 7023def INSERTQ : I<0x79, MRMSrcReg, (outs VR128:$dst), 7024 (ins VR128:$src, VR128:$mask), 7025 "insertq\t{$mask, $src|$src, $mask}", 7026 [(set VR128:$dst, (int_x86_sse4a_insertq VR128:$src, 7027 VR128:$mask))]>, 7028 XD, Sched<[SchedWriteVecALU.XMM]>; 7029} 7030} // ExeDomain = SSEPackedInt 7031 7032// Non-temporal (unaligned) scalar stores. 
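// MOVNTSS/MOVNTSD store a single f32/f64 element with a non-temporal hint,
// bypassing the cache. The patterns below first copy the scalar into a VR128
// so the store can read it from the low element of an XMM register.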
7033let AddedComplexity = 400 in { // Prefer non-temporal versions 7034let hasSideEffects = 0, mayStore = 1, SchedRW = [SchedWriteFMoveLSNT.Scl.MR] in { 7035def MOVNTSS : I<0x2B, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src), 7036 "movntss\t{$src, $dst|$dst, $src}", []>, XS; 7037 7038def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), 7039 "movntsd\t{$src, $dst|$dst, $src}", []>, XD; 7040} // SchedRW 7041 7042def : Pat<(nontemporalstore FR32:$src, addr:$dst), 7043 (MOVNTSS addr:$dst, (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>; 7044 7045def : Pat<(nontemporalstore FR64:$src, addr:$dst), 7046 (MOVNTSD addr:$dst, (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>; 7047 7048} // AddedComplexity 7049} // HasSSE4A 7050 7051//===----------------------------------------------------------------------===// 7052// AVX Instructions 7053//===----------------------------------------------------------------------===// 7054 7055//===----------------------------------------------------------------------===// 7056// VBROADCAST - Load from memory and broadcast to all elements of the 7057// destination operand 7058// 7059class avx_broadcast_rm<bits<8> opc, string OpcodeStr, RegisterClass RC, 7060 X86MemOperand x86memop, ValueType VT, 7061 PatFrag bcast_frag, SchedWrite Sched> : 7062 AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), 7063 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 7064 [(set RC:$dst, (VT (bcast_frag addr:$src)))]>, 7065 Sched<[Sched]>, VEX; 7066 7067// AVX2 adds register forms 7068class avx2_broadcast_rr<bits<8> opc, string OpcodeStr, RegisterClass RC, 7069 ValueType ResVT, ValueType OpVT, SchedWrite Sched> : 7070 AVX28I<opc, MRMSrcReg, (outs RC:$dst), (ins VR128:$src), 7071 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 7072 [(set RC:$dst, (ResVT (X86VBroadcast (OpVT VR128:$src))))]>, 7073 Sched<[Sched]>, VEX; 7074 7075let ExeDomain = SSEPackedSingle, Predicates = [HasAVX, NoVLX] in { 7076 def VBROADCASTSSrm : avx_broadcast_rm<0x18, "vbroadcastss", VR128, 7077 f32mem, v4f32, X86VBroadcastld32, 7078 SchedWriteFShuffle.XMM.Folded>; 7079 def VBROADCASTSSYrm : avx_broadcast_rm<0x18, "vbroadcastss", VR256, 7080 f32mem, v8f32, X86VBroadcastld32, 7081 SchedWriteFShuffle.XMM.Folded>, VEX_L; 7082} 7083let ExeDomain = SSEPackedDouble, Predicates = [HasAVX, NoVLX] in 7084def VBROADCASTSDYrm : avx_broadcast_rm<0x19, "vbroadcastsd", VR256, f64mem, 7085 v4f64, X86VBroadcastld64, 7086 SchedWriteFShuffle.XMM.Folded>, VEX_L; 7087 7088let ExeDomain = SSEPackedSingle, Predicates = [HasAVX2, NoVLX] in { 7089 def VBROADCASTSSrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR128, 7090 v4f32, v4f32, SchedWriteFShuffle.XMM>; 7091 def VBROADCASTSSYrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR256, 7092 v8f32, v4f32, WriteFShuffle256>, VEX_L; 7093} 7094let ExeDomain = SSEPackedDouble, Predicates = [HasAVX2, NoVLX] in 7095def VBROADCASTSDYrr : avx2_broadcast_rr<0x19, "vbroadcastsd", VR256, 7096 v4f64, v2f64, WriteFShuffle256>, VEX_L; 7097 7098//===----------------------------------------------------------------------===// 7099// VBROADCAST*128 - Load from memory and broadcast 128-bit vector to both 7100// halves of a 256-bit vector. 
7101// 7102let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX2] in 7103def VBROADCASTI128 : AVX8I<0x5A, MRMSrcMem, (outs VR256:$dst), 7104 (ins i128mem:$src), 7105 "vbroadcasti128\t{$src, $dst|$dst, $src}", []>, 7106 Sched<[WriteShuffleLd]>, VEX, VEX_L; 7107 7108let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX], 7109 ExeDomain = SSEPackedSingle in 7110def VBROADCASTF128 : AVX8I<0x1A, MRMSrcMem, (outs VR256:$dst), 7111 (ins f128mem:$src), 7112 "vbroadcastf128\t{$src, $dst|$dst, $src}", []>, 7113 Sched<[SchedWriteFShuffle.XMM.Folded]>, VEX, VEX_L; 7114 7115let Predicates = [HasAVX, NoVLX] in { 7116def : Pat<(v4f64 (X86SubVBroadcastld128 addr:$src)), 7117 (VBROADCASTF128 addr:$src)>; 7118def : Pat<(v8f32 (X86SubVBroadcastld128 addr:$src)), 7119 (VBROADCASTF128 addr:$src)>; 7120// NOTE: We're using FP instructions here, but execution domain fixing can 7121// convert to integer when profitable. 7122def : Pat<(v4i64 (X86SubVBroadcastld128 addr:$src)), 7123 (VBROADCASTF128 addr:$src)>; 7124def : Pat<(v8i32 (X86SubVBroadcastld128 addr:$src)), 7125 (VBROADCASTF128 addr:$src)>; 7126def : Pat<(v16i16 (X86SubVBroadcastld128 addr:$src)), 7127 (VBROADCASTF128 addr:$src)>; 7128def : Pat<(v16f16 (X86SubVBroadcastld128 addr:$src)), 7129 (VBROADCASTF128 addr:$src)>; 7130def : Pat<(v32i8 (X86SubVBroadcastld128 addr:$src)), 7131 (VBROADCASTF128 addr:$src)>; 7132} 7133 7134//===----------------------------------------------------------------------===// 7135// VPERM2F128 - Permute Floating-Point Values in 128-bit chunks 7136// 7137 7138let ExeDomain = SSEPackedSingle in { 7139let isCommutable = 1 in 7140def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst), 7141 (ins VR256:$src1, VR256:$src2, u8imm:$src3), 7142 "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>, 7143 VEX_4V, VEX_L, Sched<[WriteFShuffle256]>; 7144def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst), 7145 (ins VR256:$src1, f256mem:$src2, u8imm:$src3), 7146 "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>, 7147 VEX_4V, VEX_L, Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>; 7148} 7149 7150// Immediate transform to help with commuting. 7151def Perm2XCommuteImm : SDNodeXForm<timm, [{ 7152 return getI8Imm(N->getZExtValue() ^ 0x22, SDLoc(N)); 7153}]>; 7154 7155multiclass vperm2x128_lowering<string InstrStr, ValueType VT, PatFrag memop_frag> { 7156 def : Pat<(VT (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 timm:$imm))), 7157 (!cast<Instruction>(InstrStr#rr) VR256:$src1, VR256:$src2, timm:$imm)>; 7158 def : Pat<(VT (X86VPerm2x128 VR256:$src1, (memop_frag addr:$src2), (i8 timm:$imm))), 7159 (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2, timm:$imm)>; 7160 // Pattern with load in other operand. 
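  // Commuting the sources is safe because Perm2XCommuteImm (imm ^ 0x22) flips
  // bits 1 and 5 of the control byte, which are the bits that choose between
  // the two sources for the low and high 128-bit halves respectively.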
7161 def : Pat<(VT (X86VPerm2x128 (memop_frag addr:$src2), VR256:$src1, (i8 timm:$imm))), 7162 (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2, 7163 (Perm2XCommuteImm timm:$imm))>; 7164} 7165 7166let Predicates = [HasAVX] in { 7167 defm : vperm2x128_lowering<"VPERM2F128", v4f64, loadv4f64>; 7168 defm : vperm2x128_lowering<"VPERM2F128", v8f32, loadv8f32>; 7169} 7170 7171let Predicates = [HasAVX1Only] in { 7172 defm : vperm2x128_lowering<"VPERM2F128", v4i64, loadv4i64>; 7173 defm : vperm2x128_lowering<"VPERM2F128", v8i32, loadv8i32>; 7174 defm : vperm2x128_lowering<"VPERM2F128", v16i16, loadv16i16>; 7175 defm : vperm2x128_lowering<"VPERM2F128", v16f16, loadv16f16>; 7176 defm : vperm2x128_lowering<"VPERM2F128", v32i8, loadv32i8>; 7177} 7178 7179//===----------------------------------------------------------------------===// 7180// VINSERTF128 - Insert packed floating-point values 7181// 7182let hasSideEffects = 0, ExeDomain = SSEPackedSingle in { 7183def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst), 7184 (ins VR256:$src1, VR128:$src2, u8imm:$src3), 7185 "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", 7186 []>, Sched<[WriteFShuffle256]>, VEX_4V, VEX_L; 7187let mayLoad = 1 in 7188def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst), 7189 (ins VR256:$src1, f128mem:$src2, u8imm:$src3), 7190 "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", 7191 []>, Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>, VEX_4V, VEX_L; 7192} 7193 7194// To create a 256-bit all ones value, we should produce VCMPTRUEPS 7195// with YMM register containing zero. 7196// FIXME: Avoid producing vxorps to clear the fake inputs. 7197let Predicates = [HasAVX1Only] in { 7198def : Pat<(v8i32 immAllOnesV), (VCMPPSYrri (AVX_SET0), (AVX_SET0), 0xf)>; 7199} 7200 7201multiclass vinsert_lowering<string InstrStr, string PermStr, 7202 ValueType From, ValueType To, 7203 PatFrag frommemop_frag, PatFrag tomemop_frag> { 7204 def : Pat<(vinsert128_insert:$ins (To VR256:$src1), (From VR128:$src2), 7205 (iPTR imm)), 7206 (!cast<Instruction>(InstrStr#rr) VR256:$src1, VR128:$src2, 7207 (INSERT_get_vinsert128_imm VR256:$ins))>; 7208 def : Pat<(vinsert128_insert:$ins (To VR256:$src1), 7209 (From (frommemop_frag addr:$src2)), 7210 (iPTR imm)), 7211 (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2, 7212 (INSERT_get_vinsert128_imm VR256:$ins))>; 7213 // Folding "To" vector - convert to perm2x128 and commute inputs. 
  def : Pat<(vinsert128_insert:$ins (To (tomemop_frag addr:$src1)),
                                    (From VR128:$src2),
                                    (iPTR imm)),
            (!cast<Instruction>(PermStr#rm)
              (INSERT_SUBREG (To (IMPLICIT_DEF)), VR128:$src2, sub_xmm),
              addr:$src1, (INSERT_get_vperm2x128_commutedimm VR256:$ins))>;
}

let Predicates = [HasAVX, NoVLX] in {
  defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v4f32, v8f32, loadv4f32, loadv8f32>;
  defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v2f64, v4f64, loadv2f64, loadv4f64>;
}

let Predicates = [HasAVX1Only] in {
  defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v2i64, v4i64, loadv2i64, loadv4i64>;
  defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v4i32, v8i32, loadv4i32, loadv8i32>;
  defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v8i16, v16i16, loadv8i16, loadv16i16>;
  defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v8f16, v16f16, loadv8f16, loadv16f16>;
  defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v16i8, v32i8, loadv16i8, loadv32i8>;
}

//===----------------------------------------------------------------------===//
// VEXTRACTF128 - Extract packed floating-point values
//
let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
def VEXTRACTF128rr : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst),
          (ins VR256:$src1, u8imm:$src2),
          "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
          []>, Sched<[WriteFShuffle256]>, VEX, VEX_L;
let mayStore = 1 in
def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs),
          (ins f128mem:$dst, VR256:$src1, u8imm:$src2),
          "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
          []>, Sched<[WriteFStoreX]>, VEX, VEX_L;
}

multiclass vextract_lowering<string InstrStr, ValueType From, ValueType To> {
  def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
            (To (!cast<Instruction>(InstrStr#rr)
                                    (From VR256:$src1),
                                    (EXTRACT_get_vextract128_imm VR128:$ext)))>;
  def : Pat<(store (To (vextract128_extract:$ext (From VR256:$src1),
                                                 (iPTR imm))), addr:$dst),
            (!cast<Instruction>(InstrStr#mr) addr:$dst, VR256:$src1,
             (EXTRACT_get_vextract128_imm VR128:$ext))>;
}

// AVX1 patterns
let Predicates = [HasAVX, NoVLX] in {
  defm : vextract_lowering<"VEXTRACTF128", v8f32, v4f32>;
  defm : vextract_lowering<"VEXTRACTF128", v4f64, v2f64>;
}

let Predicates = [HasAVX1Only] in {
  defm : vextract_lowering<"VEXTRACTF128", v4i64, v2i64>;
  defm : vextract_lowering<"VEXTRACTF128", v8i32, v4i32>;
  defm : vextract_lowering<"VEXTRACTF128", v16i16, v8i16>;
  defm : vextract_lowering<"VEXTRACTF128", v16f16, v8f16>;
  defm : vextract_lowering<"VEXTRACTF128", v32i8, v16i8>;
}

//===----------------------------------------------------------------------===//
// VMASKMOV - Conditional SIMD Packed Loads and Stores
//
multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr,
                          Intrinsic IntLd, Intrinsic IntLd256,
                          Intrinsic IntSt, Intrinsic IntSt256,
                          X86SchedWriteMaskMove schedX,
                          X86SchedWriteMaskMove schedY> {
  def rm  : AVX8I<opc_rm, MRMSrcMem, (outs VR128:$dst),
             (ins VR128:$src1, f128mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR128:$dst, (IntLd addr:$src2,
VR128:$src1))]>, 7289 VEX_4V, Sched<[schedX.RM]>; 7290 def Yrm : AVX8I<opc_rm, MRMSrcMem, (outs VR256:$dst), 7291 (ins VR256:$src1, f256mem:$src2), 7292 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7293 [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>, 7294 VEX_4V, VEX_L, Sched<[schedY.RM]>; 7295 def mr : AVX8I<opc_mr, MRMDestMem, (outs), 7296 (ins f128mem:$dst, VR128:$src1, VR128:$src2), 7297 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7298 [(IntSt addr:$dst, VR128:$src1, VR128:$src2)]>, 7299 VEX_4V, Sched<[schedX.MR]>; 7300 def Ymr : AVX8I<opc_mr, MRMDestMem, (outs), 7301 (ins f256mem:$dst, VR256:$src1, VR256:$src2), 7302 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7303 [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, 7304 VEX_4V, VEX_L, Sched<[schedY.MR]>; 7305} 7306 7307let ExeDomain = SSEPackedSingle in 7308defm VMASKMOVPS : avx_movmask_rm<0x2C, 0x2E, "vmaskmovps", 7309 int_x86_avx_maskload_ps, 7310 int_x86_avx_maskload_ps_256, 7311 int_x86_avx_maskstore_ps, 7312 int_x86_avx_maskstore_ps_256, 7313 WriteFMaskMove32, WriteFMaskMove32Y>; 7314let ExeDomain = SSEPackedDouble in 7315defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd", 7316 int_x86_avx_maskload_pd, 7317 int_x86_avx_maskload_pd_256, 7318 int_x86_avx_maskstore_pd, 7319 int_x86_avx_maskstore_pd_256, 7320 WriteFMaskMove64, WriteFMaskMove64Y>; 7321 7322//===----------------------------------------------------------------------===// 7323// AVX_VNNI 7324//===----------------------------------------------------------------------===// 7325let Predicates = [HasAVXVNNI, NoVLX_Or_NoVNNI], Constraints = "$src1 = $dst", 7326 ExplicitVEXPrefix = 1, checkVEXPredicate = 1 in 7327multiclass avx_vnni_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, 7328 bit IsCommutable> { 7329 let isCommutable = IsCommutable in 7330 def rr : AVX8I<opc, MRMSrcReg, (outs VR128:$dst), 7331 (ins VR128:$src1, VR128:$src2, VR128:$src3), 7332 !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 7333 [(set VR128:$dst, (v4i32 (OpNode VR128:$src1, 7334 VR128:$src2, VR128:$src3)))]>, 7335 VEX_4V, Sched<[SchedWriteVecIMul.XMM]>; 7336 7337 def rm : AVX8I<opc, MRMSrcMem, (outs VR128:$dst), 7338 (ins VR128:$src1, VR128:$src2, i128mem:$src3), 7339 !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 7340 [(set VR128:$dst, (v4i32 (OpNode VR128:$src1, VR128:$src2, 7341 (loadv4i32 addr:$src3))))]>, 7342 VEX_4V, Sched<[SchedWriteVecIMul.XMM]>; 7343 7344 let isCommutable = IsCommutable in 7345 def Yrr : AVX8I<opc, MRMSrcReg, (outs VR256:$dst), 7346 (ins VR256:$src1, VR256:$src2, VR256:$src3), 7347 !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 7348 [(set VR256:$dst, (v8i32 (OpNode VR256:$src1, 7349 VR256:$src2, VR256:$src3)))]>, 7350 VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.XMM]>; 7351 7352 def Yrm : AVX8I<opc, MRMSrcMem, (outs VR256:$dst), 7353 (ins VR256:$src1, VR256:$src2, i256mem:$src3), 7354 !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 7355 [(set VR256:$dst, (v8i32 (OpNode VR256:$src1, VR256:$src2, 7356 (loadv8i32 addr:$src3))))]>, 7357 VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.XMM]>; 7358} 7359 7360defm VPDPBUSD : avx_vnni_rm<0x50, "vpdpbusd", X86Vpdpbusd, 0>; 7361defm VPDPBUSDS : avx_vnni_rm<0x51, "vpdpbusds", X86Vpdpbusds, 0>; 7362defm VPDPWSSD : avx_vnni_rm<0x52, "vpdpwssd", X86Vpdpwssd, 1>; 7363defm VPDPWSSDS : avx_vnni_rm<0x53, "vpdpwssds", X86Vpdpwssds, 1>; 7364 7365def X86vpmaddwd_su : PatFrag<(ops node:$lhs, 
node:$rhs),
                                  (X86vpmaddwd node:$lhs, node:$rhs), [{
  return N->hasOneUse();
}]>;

let Predicates = [HasAVXVNNI, NoVLX_Or_NoVNNI] in {
  def : Pat<(v8i32 (add VR256:$src1,
                        (X86vpmaddwd_su VR256:$src2, VR256:$src3))),
            (VPDPWSSDYrr VR256:$src1, VR256:$src2, VR256:$src3)>;
  def : Pat<(v8i32 (add VR256:$src1,
                        (X86vpmaddwd_su VR256:$src2, (load addr:$src3)))),
            (VPDPWSSDYrm VR256:$src1, VR256:$src2, addr:$src3)>;
  def : Pat<(v4i32 (add VR128:$src1,
                        (X86vpmaddwd_su VR128:$src2, VR128:$src3))),
            (VPDPWSSDrr VR128:$src1, VR128:$src2, VR128:$src3)>;
  def : Pat<(v4i32 (add VR128:$src1,
                        (X86vpmaddwd_su VR128:$src2, (load addr:$src3)))),
            (VPDPWSSDrm VR128:$src1, VR128:$src2, addr:$src3)>;
}

//===----------------------------------------------------------------------===//
// VPERMIL - Permute Single and Double Floating-Point Values
//

multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
                      RegisterClass RC, X86MemOperand x86memop_f,
                      X86MemOperand x86memop_i,
                      ValueType f_vt, ValueType i_vt,
                      X86FoldableSchedWrite sched,
                      X86FoldableSchedWrite varsched> {
  let Predicates = [HasAVX, NoVLX] in {
  def rr  : AVX8I<opc_rm, MRMSrcReg, (outs RC:$dst),
             (ins RC:$src1, RC:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1, (i_vt RC:$src2))))]>, VEX_4V,
             Sched<[varsched]>;
  def rm  : AVX8I<opc_rm, MRMSrcMem, (outs RC:$dst),
             (ins RC:$src1, x86memop_i:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1,
                             (i_vt (load addr:$src2)))))]>, VEX_4V,
             Sched<[varsched.Folded, sched.ReadAfterFold]>;

  def ri  : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst),
             (ins RC:$src1, u8imm:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set RC:$dst, (f_vt (X86VPermilpi RC:$src1, (i8 timm:$src2))))]>, VEX,
             Sched<[sched]>;
  def mi  : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst),
             (ins x86memop_f:$src1, u8imm:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set RC:$dst,
               (f_vt (X86VPermilpi (load addr:$src1), (i8 timm:$src2))))]>, VEX,
             Sched<[sched.Folded]>;
  }// Predicates = [HasAVX, NoVLX]
}

let ExeDomain = SSEPackedSingle in {
  defm VPERMILPS  : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem,
                               v4f32, v4i32, SchedWriteFShuffle.XMM,
                               SchedWriteFVarShuffle.XMM>;
  defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem,
                               v8f32, v8i32, SchedWriteFShuffle.YMM,
                               SchedWriteFVarShuffle.YMM>, VEX_L;
}
let ExeDomain = SSEPackedDouble in {
  defm VPERMILPD  : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem,
                               v2f64, v2i64, SchedWriteFShuffle.XMM,
                               SchedWriteFVarShuffle.XMM>;
  defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem,
                               v4f64, v4i64, SchedWriteFShuffle.YMM,
                               SchedWriteFVarShuffle.YMM>, VEX_L;
}

//===----------------------------------------------------------------------===//
// VZERO - Zero YMM registers
// Note: These instructions do not affect YMM16-YMM31.
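// (vzeroupper is typically inserted when transitioning from AVX to legacy SSE
// code, since leaving the upper YMM bits dirty incurs a domain-transition
// penalty on many implementations.)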
7442// 7443 7444let SchedRW = [WriteSystem] in { 7445let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7, 7446 YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15] in { 7447 // Zero All YMM registers 7448 def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall", 7449 [(int_x86_avx_vzeroall)]>, PS, VEX, VEX_L, 7450 Requires<[HasAVX]>, VEX_WIG; 7451 7452 // Zero Upper bits of YMM registers 7453 def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper", 7454 [(int_x86_avx_vzeroupper)]>, PS, VEX, 7455 Requires<[HasAVX]>, VEX_WIG; 7456} // Defs 7457} // SchedRW 7458 7459//===----------------------------------------------------------------------===// 7460// Half precision conversion instructions 7461// 7462 7463multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop, 7464 X86FoldableSchedWrite sched> { 7465 def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src), 7466 "vcvtph2ps\t{$src, $dst|$dst, $src}", 7467 [(set RC:$dst, (X86any_cvtph2ps VR128:$src))]>, 7468 T8PD, VEX, Sched<[sched]>; 7469 let hasSideEffects = 0, mayLoad = 1 in 7470 def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), 7471 "vcvtph2ps\t{$src, $dst|$dst, $src}", 7472 []>, T8PD, VEX, Sched<[sched.Folded]>; 7473} 7474 7475multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop, 7476 SchedWrite RR, SchedWrite MR> { 7477 def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst), 7478 (ins RC:$src1, i32u8imm:$src2), 7479 "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", 7480 [(set VR128:$dst, (X86any_cvtps2ph RC:$src1, timm:$src2))]>, 7481 TAPD, VEX, Sched<[RR]>; 7482 let hasSideEffects = 0, mayStore = 1 in 7483 def mr : Ii8<0x1D, MRMDestMem, (outs), 7484 (ins x86memop:$dst, RC:$src1, i32u8imm:$src2), 7485 "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, 7486 TAPD, VEX, Sched<[MR]>; 7487} 7488 7489let Predicates = [HasF16C, NoVLX] in { 7490 defm VCVTPH2PS : f16c_ph2ps<VR128, f64mem, WriteCvtPH2PS>, SIMD_EXC; 7491 defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, WriteCvtPH2PSY>, VEX_L, SIMD_EXC; 7492 defm VCVTPS2PH : f16c_ps2ph<VR128, f64mem, WriteCvtPS2PH, 7493 WriteCvtPS2PHSt>, SIMD_EXC; 7494 defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, WriteCvtPS2PHY, 7495 WriteCvtPS2PHYSt>, VEX_L, SIMD_EXC; 7496 7497 // Pattern match vcvtph2ps of a scalar i64 load. 
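  // This fold is legal because the 128-bit form of vcvtph2ps reads only the
  // low 64 bits (four f16 elements) of its source, so a scalar i64 load that
  // feeds the convert can be folded into VCVTPH2PSrm.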
7498 def : Pat<(v4f32 (X86any_cvtph2ps (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))), 7499 (VCVTPH2PSrm addr:$src)>; 7500 def : Pat<(v4f32 (X86any_cvtph2ps (bc_v8i16 7501 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), 7502 (VCVTPH2PSrm addr:$src)>; 7503 def : Pat<(v8f32 (X86any_cvtph2ps (loadv8i16 addr:$src))), 7504 (VCVTPH2PSYrm addr:$src)>; 7505 7506 def : Pat<(store (f64 (extractelt 7507 (bc_v2f64 (v8i16 (X86any_cvtps2ph VR128:$src1, timm:$src2))), 7508 (iPTR 0))), addr:$dst), 7509 (VCVTPS2PHmr addr:$dst, VR128:$src1, timm:$src2)>; 7510 def : Pat<(store (i64 (extractelt 7511 (bc_v2i64 (v8i16 (X86any_cvtps2ph VR128:$src1, timm:$src2))), 7512 (iPTR 0))), addr:$dst), 7513 (VCVTPS2PHmr addr:$dst, VR128:$src1, timm:$src2)>; 7514 def : Pat<(store (v8i16 (X86any_cvtps2ph VR256:$src1, timm:$src2)), addr:$dst), 7515 (VCVTPS2PHYmr addr:$dst, VR256:$src1, timm:$src2)>; 7516} 7517 7518//===----------------------------------------------------------------------===// 7519// AVX2 Instructions 7520//===----------------------------------------------------------------------===// 7521 7522/// AVX2_blend_rmi - AVX2 blend with 8-bit immediate 7523multiclass AVX2_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode, 7524 ValueType OpVT, X86FoldableSchedWrite sched, 7525 RegisterClass RC, 7526 X86MemOperand x86memop, SDNodeXForm commuteXForm> { 7527 let isCommutable = 1 in 7528 def rri : AVX2AIi8<opc, MRMSrcReg, (outs RC:$dst), 7529 (ins RC:$src1, RC:$src2, u8imm:$src3), 7530 !strconcat(OpcodeStr, 7531 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), 7532 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>, 7533 Sched<[sched]>, VEX_4V; 7534 def rmi : AVX2AIi8<opc, MRMSrcMem, (outs RC:$dst), 7535 (ins RC:$src1, x86memop:$src2, u8imm:$src3), 7536 !strconcat(OpcodeStr, 7537 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), 7538 [(set RC:$dst, 7539 (OpVT (OpNode RC:$src1, (load addr:$src2), timm:$src3)))]>, 7540 Sched<[sched.Folded, sched.ReadAfterFold]>, VEX_4V; 7541 7542 // Pattern to commute if load is in first source. 
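  // Each bit of the blend immediate selects between the two sources for one
  // element, so commuteXForm (e.g. BlendCommuteImm4) simply inverts the mask
  // bits to keep the per-element selection the same after the swap.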
7543 def : Pat<(OpVT (OpNode (load addr:$src2), RC:$src1, timm:$src3)), 7544 (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2, 7545 (commuteXForm timm:$src3))>; 7546} 7547 7548let Predicates = [HasAVX2] in { 7549defm VPBLENDD : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v4i32, 7550 SchedWriteBlend.XMM, VR128, i128mem, 7551 BlendCommuteImm4>; 7552defm VPBLENDDY : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v8i32, 7553 SchedWriteBlend.YMM, VR256, i256mem, 7554 BlendCommuteImm8>, VEX_L; 7555 7556def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), timm:$src3), 7557 (VPBLENDDYrri VR256:$src1, VR256:$src2, (BlendScaleImm4 timm:$src3))>; 7558def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), timm:$src3), 7559 (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>; 7560def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, timm:$src3), 7561 (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>; 7562 7563def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3), 7564 (VPBLENDDrri VR128:$src1, VR128:$src2, (BlendScaleImm2to4 timm:$src3))>; 7565def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), timm:$src3), 7566 (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleImm2to4 timm:$src3))>; 7567def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, timm:$src3), 7568 (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2to4 timm:$src3))>; 7569} 7570 7571// For insertion into the zero index (low half) of a 256-bit vector, it is 7572// more efficient to generate a blend with immediate instead of an insert*128. 7573// NOTE: We're using FP instructions here, but execution domain fixing should 7574// take care of using integer instructions when profitable. 7575let Predicates = [HasAVX] in { 7576def : Pat<(insert_subvector (v8i32 VR256:$src1), (v4i32 VR128:$src2), (iPTR 0)), 7577 (VBLENDPSYrri VR256:$src1, 7578 (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), 7579 VR128:$src2, sub_xmm), 0xf)>; 7580def : Pat<(insert_subvector (v4i64 VR256:$src1), (v2i64 VR128:$src2), (iPTR 0)), 7581 (VBLENDPSYrri VR256:$src1, 7582 (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), 7583 VR128:$src2, sub_xmm), 0xf)>; 7584def : Pat<(insert_subvector (v16i16 VR256:$src1), (v8i16 VR128:$src2), (iPTR 0)), 7585 (VBLENDPSYrri VR256:$src1, 7586 (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), 7587 VR128:$src2, sub_xmm), 0xf)>; 7588def : Pat<(insert_subvector (v16f16 VR256:$src1), (v8f16 VR128:$src2), (iPTR 0)), 7589 (VBLENDPSYrri VR256:$src1, 7590 (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), 7591 VR128:$src2, sub_xmm), 0xf)>; 7592def : Pat<(insert_subvector (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR 0)), 7593 (VBLENDPSYrri VR256:$src1, 7594 (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), 7595 VR128:$src2, sub_xmm), 0xf)>; 7596 7597def : Pat<(insert_subvector (loadv8i32 addr:$src2), (v4i32 VR128:$src1), (iPTR 0)), 7598 (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), 7599 VR128:$src1, sub_xmm), addr:$src2, 0xf0)>; 7600def : Pat<(insert_subvector (loadv4i64 addr:$src2), (v2i64 VR128:$src1), (iPTR 0)), 7601 (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), 7602 VR128:$src1, sub_xmm), addr:$src2, 0xf0)>; 7603def : Pat<(insert_subvector (loadv16i16 addr:$src2), (v8i16 VR128:$src1), (iPTR 0)), 7604 (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), 7605 VR128:$src1, sub_xmm), addr:$src2, 0xf0)>; 7606def : Pat<(insert_subvector (loadv16f16 addr:$src2), (v8f16 VR128:$src1), (iPTR 0)), 7607 (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), 7608 VR128:$src1, sub_xmm), addr:$src2, 0xf0)>; 
7609def : Pat<(insert_subvector (loadv32i8 addr:$src2), (v16i8 VR128:$src1), (iPTR 0)), 7610 (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), 7611 VR128:$src1, sub_xmm), addr:$src2, 0xf0)>; 7612} 7613 7614//===----------------------------------------------------------------------===// 7615// VPBROADCAST - Load from memory and broadcast to all elements of the 7616// destination operand 7617// 7618multiclass avx2_broadcast<bits<8> opc, string OpcodeStr, 7619 X86MemOperand x86memop, PatFrag bcast_frag, 7620 ValueType OpVT128, ValueType OpVT256, Predicate prd> { 7621 let Predicates = [HasAVX2, prd] in { 7622 def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 7623 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 7624 [(set VR128:$dst, 7625 (OpVT128 (X86VBroadcast (OpVT128 VR128:$src))))]>, 7626 Sched<[SchedWriteShuffle.XMM]>, VEX; 7627 def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src), 7628 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 7629 [(set VR128:$dst, 7630 (OpVT128 (bcast_frag addr:$src)))]>, 7631 Sched<[SchedWriteShuffle.XMM.Folded]>, VEX; 7632 def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), 7633 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 7634 [(set VR256:$dst, 7635 (OpVT256 (X86VBroadcast (OpVT128 VR128:$src))))]>, 7636 Sched<[WriteShuffle256]>, VEX, VEX_L; 7637 def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src), 7638 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 7639 [(set VR256:$dst, 7640 (OpVT256 (bcast_frag addr:$src)))]>, 7641 Sched<[SchedWriteShuffle.XMM.Folded]>, VEX, VEX_L; 7642 7643 // Provide aliases for broadcast from the same register class that 7644 // automatically does the extract. 7645 def : Pat<(OpVT256 (X86VBroadcast (OpVT256 VR256:$src))), 7646 (!cast<Instruction>(NAME#"Yrr") 7647 (OpVT128 (EXTRACT_SUBREG (OpVT256 VR256:$src),sub_xmm)))>; 7648 } 7649} 7650 7651defm VPBROADCASTB : avx2_broadcast<0x78, "vpbroadcastb", i8mem, X86VBroadcastld8, 7652 v16i8, v32i8, NoVLX_Or_NoBWI>; 7653defm VPBROADCASTW : avx2_broadcast<0x79, "vpbroadcastw", i16mem, X86VBroadcastld16, 7654 v8i16, v16i16, NoVLX_Or_NoBWI>; 7655defm VPBROADCASTD : avx2_broadcast<0x58, "vpbroadcastd", i32mem, X86VBroadcastld32, 7656 v4i32, v8i32, NoVLX>; 7657defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, X86VBroadcastld64, 7658 v2i64, v4i64, NoVLX>; 7659 7660let Predicates = [HasAVX2, NoVLX] in { 7661 // Provide fallback in case the load node that is used in the patterns above 7662 // is used by additional users, which prevents the pattern selection. 
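  // With AVX2 a broadcast can be performed from a register, so a scalar FR32
  // or FR64 source is copied into a VR128 and broadcast directly instead of
  // being routed through memory.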
7663 def : Pat<(v4f32 (X86VBroadcast FR32:$src)), 7664 (VBROADCASTSSrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>; 7665 def : Pat<(v8f32 (X86VBroadcast FR32:$src)), 7666 (VBROADCASTSSYrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>; 7667 def : Pat<(v4f64 (X86VBroadcast FR64:$src)), 7668 (VBROADCASTSDYrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>; 7669} 7670 7671let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { 7672 def : Pat<(v16i8 (X86VBroadcast GR8:$src)), 7673 (VPBROADCASTBrr (VMOVDI2PDIrr 7674 (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), 7675 GR8:$src, sub_8bit))))>; 7676 def : Pat<(v32i8 (X86VBroadcast GR8:$src)), 7677 (VPBROADCASTBYrr (VMOVDI2PDIrr 7678 (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), 7679 GR8:$src, sub_8bit))))>; 7680 7681 def : Pat<(v8i16 (X86VBroadcast GR16:$src)), 7682 (VPBROADCASTWrr (VMOVDI2PDIrr 7683 (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), 7684 GR16:$src, sub_16bit))))>; 7685 def : Pat<(v16i16 (X86VBroadcast GR16:$src)), 7686 (VPBROADCASTWYrr (VMOVDI2PDIrr 7687 (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), 7688 GR16:$src, sub_16bit))))>; 7689 7690 def : Pat<(v8f16 (X86VBroadcastld16 addr:$src)), 7691 (VPBROADCASTWrm addr:$src)>; 7692 def : Pat<(v16f16 (X86VBroadcastld16 addr:$src)), 7693 (VPBROADCASTWYrm addr:$src)>; 7694 7695 def : Pat<(v8f16 (X86VBroadcast (v8f16 VR128:$src))), 7696 (VPBROADCASTWrr VR128:$src)>; 7697 def : Pat<(v16f16 (X86VBroadcast (v8f16 VR128:$src))), 7698 (VPBROADCASTWYrr VR128:$src)>; 7699 7700 def : Pat<(v8f16 (X86VBroadcast (f16 FR16:$src))), 7701 (VPBROADCASTWrr (COPY_TO_REGCLASS FR16:$src, VR128))>; 7702 def : Pat<(v16f16 (X86VBroadcast (f16 FR16:$src))), 7703 (VPBROADCASTWYrr (COPY_TO_REGCLASS FR16:$src, VR128))>; 7704} 7705let Predicates = [HasAVX2, NoVLX] in { 7706 def : Pat<(v4i32 (X86VBroadcast GR32:$src)), 7707 (VPBROADCASTDrr (VMOVDI2PDIrr GR32:$src))>; 7708 def : Pat<(v8i32 (X86VBroadcast GR32:$src)), 7709 (VPBROADCASTDYrr (VMOVDI2PDIrr GR32:$src))>; 7710 def : Pat<(v2i64 (X86VBroadcast GR64:$src)), 7711 (VPBROADCASTQrr (VMOV64toPQIrr GR64:$src))>; 7712 def : Pat<(v4i64 (X86VBroadcast GR64:$src)), 7713 (VPBROADCASTQYrr (VMOV64toPQIrr GR64:$src))>; 7714} 7715 7716// AVX1 broadcast patterns 7717let Predicates = [HasAVX1Only] in { 7718def : Pat<(v8i32 (X86VBroadcastld32 addr:$src)), 7719 (VBROADCASTSSYrm addr:$src)>; 7720def : Pat<(v4i64 (X86VBroadcastld64 addr:$src)), 7721 (VBROADCASTSDYrm addr:$src)>; 7722def : Pat<(v4i32 (X86VBroadcastld32 addr:$src)), 7723 (VBROADCASTSSrm addr:$src)>; 7724} 7725 7726 // Provide fallback in case the load node that is used in the patterns above 7727 // is used by additional users, which prevents the pattern selection. 
7728let Predicates = [HasAVX, NoVLX] in { 7729 // 128bit broadcasts: 7730 def : Pat<(v2f64 (X86VBroadcast f64:$src)), 7731 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>; 7732 def : Pat<(v2f64 (X86VBroadcastld64 addr:$src)), 7733 (VMOVDDUPrm addr:$src)>; 7734 7735 def : Pat<(v2f64 (X86VBroadcast v2f64:$src)), 7736 (VMOVDDUPrr VR128:$src)>; 7737} 7738 7739let Predicates = [HasAVX1Only] in { 7740 def : Pat<(v4f32 (X86VBroadcast FR32:$src)), 7741 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)>; 7742 def : Pat<(v8f32 (X86VBroadcast FR32:$src)), 7743 (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), 7744 (v4f32 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)), sub_xmm), 7745 (v4f32 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)), 1)>; 7746 def : Pat<(v8f32 (X86VBroadcast v4f32:$src)), 7747 (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), 7748 (v4f32 (VPERMILPSri VR128:$src, 0)), sub_xmm), 7749 (v4f32 (VPERMILPSri VR128:$src, 0)), 1)>; 7750 def : Pat<(v4f64 (X86VBroadcast FR64:$src)), 7751 (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), 7752 (v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), sub_xmm), 7753 (v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), 1)>; 7754 def : Pat<(v4f64 (X86VBroadcast v2f64:$src)), 7755 (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), 7756 (v2f64 (VMOVDDUPrr VR128:$src)), sub_xmm), 7757 (v2f64 (VMOVDDUPrr VR128:$src)), 1)>; 7758 7759 def : Pat<(v4i32 (X86VBroadcast GR32:$src)), 7760 (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)>; 7761 def : Pat<(v8i32 (X86VBroadcast GR32:$src)), 7762 (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), 7763 (v4i32 (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)), sub_xmm), 7764 (v4i32 (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)), 1)>; 7765 def : Pat<(v4i64 (X86VBroadcast GR64:$src)), 7766 (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), 7767 (v4i32 (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)), sub_xmm), 7768 (v4i32 (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)), 1)>; 7769 7770 def : Pat<(v2i64 (X86VBroadcast i64:$src)), 7771 (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)>; 7772 def : Pat<(v2i64 (X86VBroadcastld64 addr:$src)), 7773 (VMOVDDUPrm addr:$src)>; 7774} 7775 7776//===----------------------------------------------------------------------===// 7777// VPERM - Permute instructions 7778// 7779 7780multiclass avx2_perm<bits<8> opc, string OpcodeStr, 7781 ValueType OpVT, X86FoldableSchedWrite Sched, 7782 X86MemOperand memOp> { 7783 let Predicates = [HasAVX2, NoVLX] in { 7784 def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), 7785 (ins VR256:$src1, VR256:$src2), 7786 !strconcat(OpcodeStr, 7787 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7788 [(set VR256:$dst, 7789 (OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>, 7790 Sched<[Sched]>, VEX_4V, VEX_L; 7791 def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), 7792 (ins VR256:$src1, memOp:$src2), 7793 !strconcat(OpcodeStr, 7794 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7795 [(set VR256:$dst, 7796 (OpVT (X86VPermv VR256:$src1, 7797 (load addr:$src2))))]>, 7798 Sched<[Sched.Folded, Sched.ReadAfterFold]>, VEX_4V, VEX_L; 7799 } 7800} 7801 7802defm VPERMD : avx2_perm<0x36, "vpermd", v8i32, WriteVarShuffle256, i256mem>; 7803let ExeDomain = SSEPackedSingle in 7804defm VPERMPS : avx2_perm<0x16, "vpermps", v8f32, WriteFVarShuffle256, f256mem>; 7805 7806multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag, 7807 ValueType OpVT, X86FoldableSchedWrite Sched, 7808 X86MemOperand memOp> { 7809 
  let Predicates = [HasAVX2, NoVLX] in {
  def Yri : AVX2AIi8<opc, MRMSrcReg, (outs VR256:$dst),
                     (ins VR256:$src1, u8imm:$src2),
                     !strconcat(OpcodeStr,
                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                     [(set VR256:$dst,
                       (OpVT (X86VPermi VR256:$src1, (i8 timm:$src2))))]>,
                     Sched<[Sched]>, VEX, VEX_L;
  def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst),
                     (ins memOp:$src1, u8imm:$src2),
                     !strconcat(OpcodeStr,
                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                     [(set VR256:$dst,
                       (OpVT (X86VPermi (mem_frag addr:$src1),
                              (i8 timm:$src2))))]>,
                     Sched<[Sched.Folded, Sched.ReadAfterFold]>, VEX, VEX_L;
  }
}

defm VPERMQ : avx2_perm_imm<0x00, "vpermq", loadv4i64, v4i64,
                            WriteShuffle256, i256mem>, VEX_W;
let ExeDomain = SSEPackedDouble in
defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64,
                             WriteFShuffle256, f256mem>, VEX_W;

//===----------------------------------------------------------------------===//
// VPERM2I128 - Permute Integer vector Values in 128-bit chunks
//
let isCommutable = 1 in
def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst),
          (ins VR256:$src1, VR256:$src2, u8imm:$src3),
          "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
          Sched<[WriteShuffle256]>, VEX_4V, VEX_L;
def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst),
          (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
          "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
          Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>, VEX_4V, VEX_L;

let Predicates = [HasAVX2] in {
  defm : vperm2x128_lowering<"VPERM2I128", v4i64, loadv4i64>;
  defm : vperm2x128_lowering<"VPERM2I128", v8i32, loadv8i32>;
  defm : vperm2x128_lowering<"VPERM2I128", v16i16, loadv16i16>;
  defm : vperm2x128_lowering<"VPERM2I128", v16f16, loadv16f16>;
  defm : vperm2x128_lowering<"VPERM2I128", v32i8, loadv32i8>;
}

//===----------------------------------------------------------------------===//
// VINSERTI128 - Insert packed integer values
//
let hasSideEffects = 0 in {
def VINSERTI128rr : AVX2AIi8<0x38, MRMSrcReg, (outs VR256:$dst),
          (ins VR256:$src1, VR128:$src2, u8imm:$src3),
          "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          []>, Sched<[WriteShuffle256]>, VEX_4V, VEX_L;
let mayLoad = 1 in
def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst),
          (ins VR256:$src1, i128mem:$src2, u8imm:$src3),
          "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          []>, Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>, VEX_4V, VEX_L;
}

let Predicates = [HasAVX2, NoVLX] in {
  defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v2i64, v4i64, loadv2i64, loadv4i64>;
  defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v4i32, v8i32, loadv4i32, loadv8i32>;
  defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v8i16, v16i16, loadv8i16, loadv16i16>;
  defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v8f16, v16f16, loadv8f16, loadv16f16>;
  defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v16i8, v32i8, loadv16i8, loadv32i8>;
}

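// Note that vinsert_lowering also covers the case where the wide vector is
// the operand coming from memory: rather than loading it and inserting into
// it, that form is lowered to VPERM2I128 with a commuted immediate (see the
// PermStr pattern in vinsert_lowering above).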
//===----------------------------------------------------------------------===//
// VEXTRACTI128 - Extract packed integer values
//
def VEXTRACTI128rr : AVX2AIi8<0x39, MRMDestReg, (outs VR128:$dst),
          (ins VR256:$src1, u8imm:$src2),
          "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
          Sched<[WriteShuffle256]>, VEX, VEX_L;
let hasSideEffects = 0, mayStore = 1 in
def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs),
          (ins i128mem:$dst, VR256:$src1, u8imm:$src2),
          "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
          Sched<[SchedWriteVecMoveLS.XMM.MR]>, VEX, VEX_L;

let Predicates = [HasAVX2, NoVLX] in {
  defm : vextract_lowering<"VEXTRACTI128", v4i64, v2i64>;
  defm : vextract_lowering<"VEXTRACTI128", v8i32, v4i32>;
  defm : vextract_lowering<"VEXTRACTI128", v16i16, v8i16>;
  defm : vextract_lowering<"VEXTRACTI128", v16f16, v8f16>;
  defm : vextract_lowering<"VEXTRACTI128", v32i8, v16i8>;
}

//===----------------------------------------------------------------------===//
// VPMASKMOV - Conditional SIMD Integer Packed Loads and Stores
//
multiclass avx2_pmovmask<string OpcodeStr,
                         Intrinsic IntLd128, Intrinsic IntLd256,
                         Intrinsic IntSt128, Intrinsic IntSt256,
                         X86SchedWriteMaskMove schedX,
                         X86SchedWriteMaskMove schedY> {
  def rm  : AVX28I<0x8c, MRMSrcMem, (outs VR128:$dst),
            (ins VR128:$src1, i128mem:$src2),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
            [(set VR128:$dst, (IntLd128 addr:$src2, VR128:$src1))]>,
            VEX_4V, Sched<[schedX.RM]>;
  def Yrm : AVX28I<0x8c, MRMSrcMem, (outs VR256:$dst),
            (ins VR256:$src1, i256mem:$src2),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
            [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
            VEX_4V, VEX_L, Sched<[schedY.RM]>;
  def mr  : AVX28I<0x8e, MRMDestMem, (outs),
            (ins i128mem:$dst, VR128:$src1, VR128:$src2),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
            [(IntSt128 addr:$dst, VR128:$src1, VR128:$src2)]>,
            VEX_4V, Sched<[schedX.MR]>;
  def Ymr : AVX28I<0x8e, MRMDestMem, (outs),
            (ins i256mem:$dst, VR256:$src1, VR256:$src2),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
            [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>,
            VEX_4V, VEX_L, Sched<[schedY.MR]>;
}

defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd",
                                int_x86_avx2_maskload_d,
                                int_x86_avx2_maskload_d_256,
                                int_x86_avx2_maskstore_d,
                                int_x86_avx2_maskstore_d_256,
                                WriteVecMaskMove32, WriteVecMaskMove32Y>;
defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq",
                                int_x86_avx2_maskload_q,
                                int_x86_avx2_maskload_q_256,
                                int_x86_avx2_maskstore_q,
                                int_x86_avx2_maskstore_q_256,
                                WriteVecMaskMove64, WriteVecMaskMove64Y>, VEX_W;

multiclass maskmov_lowering<string InstrStr, RegisterClass RC, ValueType VT,
                            ValueType MaskVT> {
  // masked store
  def: Pat<(masked_store (VT RC:$src), addr:$ptr, (MaskVT RC:$mask)),
           (!cast<Instruction>(InstrStr#"mr") addr:$ptr, RC:$mask, RC:$src)>;
  // masked load
  def: Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask), undef)),
           (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
  def: Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask),
                            (VT immAllZerosV))),
           (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
}
let Predicates = [HasAVX] in {
  defm : maskmov_lowering<"VMASKMOVPS", VR128, v4f32, v4i32>;

//===----------------------------------------------------------------------===//
// VPMASKMOV - Conditional SIMD Integer Packed Loads and Stores
//
multiclass avx2_pmovmask<string OpcodeStr,
                         Intrinsic IntLd128, Intrinsic IntLd256,
                         Intrinsic IntSt128, Intrinsic IntSt256,
                         X86SchedWriteMaskMove schedX,
                         X86SchedWriteMaskMove schedY> {
  def rm  : AVX28I<0x8c, MRMSrcMem, (outs VR128:$dst),
                   (ins VR128:$src1, i128mem:$src2),
                   !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set VR128:$dst, (IntLd128 addr:$src2, VR128:$src1))]>,
                   VEX_4V, Sched<[schedX.RM]>;
  def Yrm : AVX28I<0x8c, MRMSrcMem, (outs VR256:$dst),
                   (ins VR256:$src1, i256mem:$src2),
                   !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
                   VEX_4V, VEX_L, Sched<[schedY.RM]>;
  def mr  : AVX28I<0x8e, MRMDestMem, (outs),
                   (ins i128mem:$dst, VR128:$src1, VR128:$src2),
                   !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(IntSt128 addr:$dst, VR128:$src1, VR128:$src2)]>,
                   VEX_4V, Sched<[schedX.MR]>;
  def Ymr : AVX28I<0x8e, MRMDestMem, (outs),
                   (ins i256mem:$dst, VR256:$src1, VR256:$src2),
                   !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>,
                   VEX_4V, VEX_L, Sched<[schedY.MR]>;
}

defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd",
                                int_x86_avx2_maskload_d,
                                int_x86_avx2_maskload_d_256,
                                int_x86_avx2_maskstore_d,
                                int_x86_avx2_maskstore_d_256,
                                WriteVecMaskMove32, WriteVecMaskMove32Y>;
defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq",
                                int_x86_avx2_maskload_q,
                                int_x86_avx2_maskload_q_256,
                                int_x86_avx2_maskstore_q,
                                int_x86_avx2_maskstore_q_256,
                                WriteVecMaskMove64, WriteVecMaskMove64Y>, VEX_W;

multiclass maskmov_lowering<string InstrStr, RegisterClass RC, ValueType VT,
                            ValueType MaskVT> {
  // masked store
  def: Pat<(masked_store (VT RC:$src), addr:$ptr, (MaskVT RC:$mask)),
           (!cast<Instruction>(InstrStr#"mr") addr:$ptr, RC:$mask, RC:$src)>;
  // masked load
  def: Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask), undef)),
           (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
  def: Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask),
                            (VT immAllZerosV))),
           (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
}
let Predicates = [HasAVX] in {
  defm : maskmov_lowering<"VMASKMOVPS",  VR128, v4f32, v4i32>;
  defm : maskmov_lowering<"VMASKMOVPD",  VR128, v2f64, v2i64>;
  defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8f32, v8i32>;
  defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4f64, v4i64>;
}
let Predicates = [HasAVX1Only] in {
  // Integer (i32/i64) masked load/store is not available; use the ps/pd forms.
  defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8i32, v8i32>;
  defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4i64, v4i64>;
  defm : maskmov_lowering<"VMASKMOVPS",  VR128, v4i32, v4i32>;
  defm : maskmov_lowering<"VMASKMOVPD",  VR128, v2i64, v2i64>;
}
let Predicates = [HasAVX2] in {
  defm : maskmov_lowering<"VPMASKMOVDY", VR256, v8i32, v8i32>;
  defm : maskmov_lowering<"VPMASKMOVQY", VR256, v4i64, v4i64>;
  defm : maskmov_lowering<"VPMASKMOVD",  VR128, v4i32, v4i32>;
  defm : maskmov_lowering<"VPMASKMOVQ",  VR128, v2i64, v2i64>;
}
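
// Illustrative only: an element is transferred iff the sign bit of the
// corresponding mask element is set; masked-off lanes read as zero on a load
// and leave memory untouched on a store, which is what makes the zero-passthru
// masked_load patterns above legal. A sketch for a vpmaskmovd load:
//   for (i = 0; i < N; ++i)
//     dst[i] = (mask[i] < 0) ? mem[i] : 0;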

//===----------------------------------------------------------------------===//
// Variable Bit Shifts
//
multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          ValueType vt128, ValueType vt256> {
  def rr  : AVX28I<opc, MRMSrcReg, (outs VR128:$dst),
                   (ins VR128:$src1, VR128:$src2),
                   !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set VR128:$dst,
                     (vt128 (OpNode VR128:$src1, (vt128 VR128:$src2))))]>,
                   VEX_4V, Sched<[SchedWriteVarVecShift.XMM]>;
  def rm  : AVX28I<opc, MRMSrcMem, (outs VR128:$dst),
                   (ins VR128:$src1, i128mem:$src2),
                   !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set VR128:$dst,
                     (vt128 (OpNode VR128:$src1,
                                    (vt128 (load addr:$src2)))))]>,
                   VEX_4V, Sched<[SchedWriteVarVecShift.XMM.Folded,
                                  SchedWriteVarVecShift.XMM.ReadAfterFold]>;
  def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
                   (ins VR256:$src1, VR256:$src2),
                   !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set VR256:$dst,
                     (vt256 (OpNode VR256:$src1, (vt256 VR256:$src2))))]>,
                   VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM]>;
  def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
                   (ins VR256:$src1, i256mem:$src2),
                   !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set VR256:$dst,
                     (vt256 (OpNode VR256:$src1,
                                    (vt256 (load addr:$src2)))))]>,
                   VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM.Folded,
                                         SchedWriteVarVecShift.YMM.ReadAfterFold]>;
}

let Predicates = [HasAVX2, NoVLX] in {
  defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", X86vshlv, v4i32, v8i32>;
  defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", X86vshlv, v2i64, v4i64>, VEX_W;
  defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", X86vsrlv, v4i32, v8i32>;
  defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", X86vsrlv, v2i64, v4i64>, VEX_W;
  defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", X86vsrav, v4i32, v8i32>;
}
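
// Illustrative only: each lane shifts by its own per-element count, and
// oversized counts do not wrap modulo the element width:
//   vpsllvd/vpsrlvd: a count >= 32 (>= 64 for the Q forms) yields 0
//   vpsravd:         a count >= 32 yields the sign fill (all-0s or all-1s)
// Sketch of one vpsllvd lane: dst[i] = (cnt[i] > 31) ? 0 : src[i] << cnt[i];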

//===----------------------------------------------------------------------===//
// VGATHER - GATHER Operations

// FIXME: Improve scheduling of gather instructions.
multiclass avx2_gather<bits<8> opc, string OpcodeStr, RegisterClass RC256,
                       X86MemOperand memop128, X86MemOperand memop256> {
let mayLoad = 1, hasSideEffects = 0 in {
  def rm  : AVX28I<opc, MRMSrcMem4VOp3, (outs VR128:$dst, VR128:$mask_wb),
                   (ins VR128:$src1, memop128:$src2, VR128:$mask),
                   !strconcat(OpcodeStr,
                              "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
                   []>, VEX, Sched<[WriteLoad, WriteVecMaskedGatherWriteback]>;
  def Yrm : AVX28I<opc, MRMSrcMem4VOp3, (outs RC256:$dst, RC256:$mask_wb),
                   (ins RC256:$src1, memop256:$src2, RC256:$mask),
                   !strconcat(OpcodeStr,
                              "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
                   []>, VEX, VEX_L, Sched<[WriteLoad, WriteVecMaskedGatherWriteback]>;
}
}

let Predicates = [HasAVX2] in {
  let mayLoad = 1, hasSideEffects = 0, Constraints
    = "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb"
    in {
    defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq",
                                  VR256, vx128mem, vx256mem>, VEX_W;
    defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq",
                                  VR256, vx128mem, vy256mem>, VEX_W;
    defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd",
                                  VR256, vx128mem, vy256mem>;
    defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd",
                                  VR128, vx64mem, vy128mem>;

    let ExeDomain = SSEPackedDouble in {
      defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd",
                                    VR256, vx128mem, vx256mem>, VEX_W;
      defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd",
                                    VR256, vx128mem, vy256mem>, VEX_W;
    }

    let ExeDomain = SSEPackedSingle in {
      defm VGATHERDPS : avx2_gather<0x92, "vgatherdps",
                                    VR256, vx128mem, vy256mem>;
      defm VGATHERQPS : avx2_gather<0x93, "vgatherqps",
                                    VR128, vx64mem, vy128mem>;
    }
  }
}
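
// Illustrative only: a gather loads each destination element from
// base + index[i]*scale when the sign bit of the corresponding $mask element
// is set, clearing that mask element as the load completes (hence the
// $mask_wb writeback operand above); a gather that faults partway can
// therefore resume without re-reading elements it has already fetched.
// Lanes whose mask bit is clear keep their previous $dst value.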

//===----------------------------------------------------------------------===//
// GFNI instructions
//===----------------------------------------------------------------------===//

multiclass GF2P8MULB_rm<string OpcodeStr, ValueType OpVT,
                        RegisterClass RC, PatFrag MemOpFrag,
                        X86MemOperand X86MemOp, X86FoldableSchedWrite sched,
                        bit Is2Addr = 0> {
  let ExeDomain = SSEPackedInt,
      AsmString = !if(Is2Addr,
                      OpcodeStr#"\t{$src2, $dst|$dst, $src2}",
                      OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}") in {
    let isCommutable = 1 in
    def rr : PDI<0xCF, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), "",
                 [(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1, RC:$src2)))]>,
             Sched<[sched]>, T8PD;

    def rm : PDI<0xCF, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, X86MemOp:$src2), "",
                 [(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1,
                                       (MemOpFrag addr:$src2))))]>,
             Sched<[sched.Folded, sched.ReadAfterFold]>, T8PD;
  }
}

multiclass GF2P8AFFINE_rmi<bits<8> Op, string OpStr, ValueType OpVT,
                           SDNode OpNode, RegisterClass RC, PatFrag MemOpFrag,
                           X86MemOperand X86MemOp, X86FoldableSchedWrite sched,
                           bit Is2Addr = 0> {
  let AsmString = !if(Is2Addr,
                      OpStr#"\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                      OpStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}") in {
    def rri : Ii8<Op, MRMSrcReg, (outs RC:$dst),
                  (ins RC:$src1, RC:$src2, u8imm:$src3), "",
                  [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))],
                  SSEPackedInt>, Sched<[sched]>;
    def rmi : Ii8<Op, MRMSrcMem, (outs RC:$dst),
                  (ins RC:$src1, X86MemOp:$src2, u8imm:$src3), "",
                  [(set RC:$dst, (OpVT (OpNode RC:$src1,
                                        (MemOpFrag addr:$src2),
                                        timm:$src3)))], SSEPackedInt>,
                  Sched<[sched.Folded, sched.ReadAfterFold]>;
  }
}

multiclass GF2P8AFFINE_common<bits<8> Op, string OpStr, SDNode OpNode> {
  let Constraints = "$src1 = $dst",
      Predicates  = [HasGFNI, UseSSE2] in
  defm NAME : GF2P8AFFINE_rmi<Op, OpStr, v16i8, OpNode,
                              VR128, load, i128mem, SchedWriteVecIMul.XMM, 1>;
  let Predicates = [HasGFNI, HasAVX, NoVLX] in {
    defm V#NAME : GF2P8AFFINE_rmi<Op, "v"#OpStr, v16i8, OpNode, VR128,
                                  load, i128mem, SchedWriteVecIMul.XMM>,
                                  VEX_4V, VEX_W;
    defm V#NAME#Y : GF2P8AFFINE_rmi<Op, "v"#OpStr, v32i8, OpNode, VR256,
                                    load, i256mem, SchedWriteVecIMul.YMM>,
                                    VEX_4V, VEX_L, VEX_W;
  }
}

// GF2P8MULB
let Constraints = "$src1 = $dst",
    Predicates  = [HasGFNI, UseSSE2] in
defm GF2P8MULB : GF2P8MULB_rm<"gf2p8mulb", v16i8, VR128, memop,
                              i128mem, SchedWriteVecALU.XMM, 1>;
let Predicates = [HasGFNI, HasAVX, NoVLX] in {
  defm VGF2P8MULB  : GF2P8MULB_rm<"vgf2p8mulb", v16i8, VR128, load,
                                  i128mem, SchedWriteVecALU.XMM>, VEX_4V;
  defm VGF2P8MULBY : GF2P8MULB_rm<"vgf2p8mulb", v32i8, VR256, load,
                                  i256mem, SchedWriteVecALU.YMM>, VEX_4V, VEX_L;
}
// GF2P8AFFINEINVQB, GF2P8AFFINEQB
let isCommutable = 0 in {
  defm GF2P8AFFINEINVQB : GF2P8AFFINE_common<0xCF, "gf2p8affineinvqb",
                                             X86GF2P8affineinvqb>, TAPD;
  defm GF2P8AFFINEQB    : GF2P8AFFINE_common<0xCE, "gf2p8affineqb",
                                             X86GF2P8affineqb>, TAPD;
}
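
// Illustrative only: GF2P8MULB multiplies byte lanes as elements of GF(2^8)
// reduced by the polynomial x^8 + x^4 + x^3 + x + 1 (0x11B, the AES field),
// and GF2P8AFFINEQB computes per byte dst = A*x + b, where A is an 8x8 bit
// matrix taken from the second source and b is the immediate byte; the INV
// form first replaces x with its GF(2^8) multiplicative inverse (0 -> 0).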

// AVX-IFMA
let Predicates = [HasAVXIFMA, NoVLX_Or_NoIFMA], Constraints = "$src1 = $dst",
    checkVEXPredicate = 1 in
multiclass avx_ifma_rm<bits<8> opc, string OpcodeStr, SDNode OpNode> {
  // NOTE: The SDNode has the multiply operands first, with the add last.
  // This enables commuted load patterns to be autogenerated by tablegen.
  let isCommutable = 1 in {
    def rr : AVX8I<opc, MRMSrcReg, (outs VR128:$dst),
                   (ins VR128:$src1, VR128:$src2, VR128:$src3),
                   !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                   [(set VR128:$dst, (v2i64 (OpNode VR128:$src2,
                                             VR128:$src3, VR128:$src1)))]>,
                   VEX_4V, Sched<[SchedWriteVecIMul.XMM]>;
  }
  def rm : AVX8I<opc, MRMSrcMem, (outs VR128:$dst),
                 (ins VR128:$src1, VR128:$src2, i128mem:$src3),
                 !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                 [(set VR128:$dst, (v2i64 (OpNode VR128:$src2,
                                           (loadv2i64 addr:$src3), VR128:$src1)))]>,
                 VEX_4V, Sched<[SchedWriteVecIMul.XMM]>;
  let isCommutable = 1 in {
    def Yrr : AVX8I<opc, MRMSrcReg, (outs VR256:$dst),
                    (ins VR256:$src1, VR256:$src2, VR256:$src3),
                    !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                    [(set VR256:$dst, (v4i64 (OpNode VR256:$src2,
                                              VR256:$src3, VR256:$src1)))]>,
                    VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.YMM]>;
  }
  def Yrm : AVX8I<opc, MRMSrcMem, (outs VR256:$dst),
                  (ins VR256:$src1, VR256:$src2, i256mem:$src3),
                  !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                  [(set VR256:$dst, (v4i64 (OpNode VR256:$src2,
                                            (loadv4i64 addr:$src3), VR256:$src1)))]>,
                  VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.YMM]>;
}

defm VPMADD52HUQ : avx_ifma_rm<0xb5, "vpmadd52huq", x86vpmadd52h>, VEX_W, ExplicitVEXPrefix;
defm VPMADD52LUQ : avx_ifma_rm<0xb4, "vpmadd52luq", x86vpmadd52l>, VEX_W, ExplicitVEXPrefix;

// AVX-VNNI-INT8
let Constraints = "$src1 = $dst" in
multiclass avx_dotprod_rm<bits<8> Opc, string OpcodeStr, ValueType OpVT,
                          RegisterClass RC, PatFrag MemOpFrag,
                          X86MemOperand X86memop, SDNode OpNode,
                          X86FoldableSchedWrite Sched,
                          bit IsCommutable> {
  let isCommutable = IsCommutable in
  def rr : I<Opc, MRMSrcReg, (outs RC:$dst),
             (ins RC:$src1, RC:$src2, RC:$src3),
             !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
             [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, RC:$src3)))]>,
             VEX_4V, Sched<[Sched]>;
  def rm : I<Opc, MRMSrcMem, (outs RC:$dst),
             (ins RC:$src1, RC:$src2, X86memop:$src3),
             !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
             [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2,
                                   (MemOpFrag addr:$src3))))]>,
             VEX_4V, Sched<[Sched.Folded, Sched.ReadAfterFold]>;
}

let Predicates = [HasAVXVNNIINT8] in {
  defm VPDPBSSD   : avx_dotprod_rm<0x50, "vpdpbssd",  v4i32, VR128, loadv4i32,
                                   i128mem, X86vpdpbssd, SchedWriteVecIMul.XMM,
                                   1>, T8XD;
  defm VPDPBSSDY  : avx_dotprod_rm<0x50, "vpdpbssd",  v8i32, VR256, loadv8i32,
                                   i256mem, X86vpdpbssd, SchedWriteVecIMul.YMM,
                                   1>, VEX_L, T8XD;
  defm VPDPBUUD   : avx_dotprod_rm<0x50, "vpdpbuud",  v4i32, VR128, loadv4i32,
                                   i128mem, X86vpdpbuud, SchedWriteVecIMul.XMM,
                                   1>, T8PS;
  defm VPDPBUUDY  : avx_dotprod_rm<0x50, "vpdpbuud",  v8i32, VR256, loadv8i32,
                                   i256mem, X86vpdpbuud, SchedWriteVecIMul.YMM,
                                   1>, VEX_L, T8PS;
  defm VPDPBSSDS  : avx_dotprod_rm<0x51, "vpdpbssds", v4i32, VR128, loadv4i32,
                                   i128mem, X86vpdpbssds, SchedWriteVecIMul.XMM,
                                   1>, T8XD;
  defm VPDPBSSDSY : avx_dotprod_rm<0x51, "vpdpbssds", v8i32, VR256, loadv8i32,
                                   i256mem, X86vpdpbssds, SchedWriteVecIMul.YMM,
                                   1>, VEX_L, T8XD;
  defm VPDPBUUDS  : avx_dotprod_rm<0x51, "vpdpbuuds", v4i32, VR128, loadv4i32,
                                   i128mem, X86vpdpbuuds, SchedWriteVecIMul.XMM,
                                   1>, T8PS;
  defm VPDPBUUDSY : avx_dotprod_rm<0x51, "vpdpbuuds", v8i32, VR256, loadv8i32,
                                   i256mem, X86vpdpbuuds, SchedWriteVecIMul.YMM,
                                   1>, VEX_L, T8PS;
  defm VPDPBSUD   : avx_dotprod_rm<0x50, "vpdpbsud",  v4i32, VR128, loadv4i32,
                                   i128mem, X86vpdpbsud, SchedWriteVecIMul.XMM,
                                   0>, T8XS;
  defm VPDPBSUDY  : avx_dotprod_rm<0x50, "vpdpbsud",  v8i32, VR256, loadv8i32,
                                   i256mem, X86vpdpbsud, SchedWriteVecIMul.YMM,
                                   0>, VEX_L, T8XS;
  defm VPDPBSUDS  : avx_dotprod_rm<0x51, "vpdpbsuds", v4i32, VR128, loadv4i32,
                                   i128mem, X86vpdpbsuds, SchedWriteVecIMul.XMM,
                                   0>, T8XS;
  defm VPDPBSUDSY : avx_dotprod_rm<0x51, "vpdpbsuds", v8i32, VR256, loadv8i32,
                                   i256mem, X86vpdpbsuds, SchedWriteVecIMul.YMM,
                                   0>, VEX_L, T8XS;
}

// AVX-NE-CONVERT
multiclass AVX_NE_CONVERT_BASE<bits<8> Opcode, string OpcodeStr,
                               X86MemOperand MemOp128, X86MemOperand MemOp256> {
  def rm : I<Opcode, MRMSrcMem, (outs VR128:$dst), (ins MemOp128:$src),
             !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
             [(set VR128:$dst,
               (!cast<Intrinsic>("int_x86_"#OpcodeStr#"128") addr:$src))]>,
             Sched<[WriteCvtPH2PS]>, VEX;
  def Yrm : I<Opcode, MRMSrcMem, (outs VR256:$dst), (ins MemOp256:$src),
              !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
              [(set VR256:$dst,
                (!cast<Intrinsic>("int_x86_"#OpcodeStr#"256") addr:$src))]>,
              Sched<[WriteCvtPH2PSY]>, VEX, VEX_L;
}

multiclass VCVTNEPS2BF16_BASE {
  def rr : I<0x72, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
             "vcvtneps2bf16\t{$src, $dst|$dst, $src}",
             [(set VR128:$dst, (int_x86_vcvtneps2bf16128 VR128:$src))]>,
             Sched<[WriteCvtPH2PS]>;
  def rm : I<0x72, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
             "vcvtneps2bf16{x}\t{$src, $dst|$dst, $src}",
             [(set VR128:$dst, (int_x86_vcvtneps2bf16128 (loadv4f32 addr:$src)))]>,
             Sched<[WriteCvtPH2PS]>;
  def Yrr : I<0x72, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
              "vcvtneps2bf16\t{$src, $dst|$dst, $src}",
              [(set VR128:$dst, (int_x86_vcvtneps2bf16256 VR256:$src))]>,
              Sched<[WriteCvtPH2PSY]>, VEX_L;
  def Yrm : I<0x72, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
              "vcvtneps2bf16{y}\t{$src, $dst|$dst, $src}",
              [(set VR128:$dst, (int_x86_vcvtneps2bf16256 (loadv8f32 addr:$src)))]>,
              Sched<[WriteCvtPH2PSY]>, VEX_L;
}

let Predicates = [HasAVXNECONVERT] in {
  defm VBCSTNEBF162PS : AVX_NE_CONVERT_BASE<0xb1, "vbcstnebf162ps", f16mem,
                                            f16mem>, T8XS;
  defm VBCSTNESH2PS   : AVX_NE_CONVERT_BASE<0xb1, "vbcstnesh2ps", f16mem,
                                            f16mem>, T8PD;
  defm VCVTNEEBF162PS : AVX_NE_CONVERT_BASE<0xb0, "vcvtneebf162ps", f128mem,
                                            f256mem>, T8XS;
  defm VCVTNEEPH2PS   : AVX_NE_CONVERT_BASE<0xb0, "vcvtneeph2ps", f128mem,
                                            f256mem>, T8PD;
  defm VCVTNEOBF162PS : AVX_NE_CONVERT_BASE<0xb0, "vcvtneobf162ps", f128mem,
                                            f256mem>, T8XD;
  defm VCVTNEOPH2PS   : AVX_NE_CONVERT_BASE<0xb0, "vcvtneoph2ps", f128mem,
                                            f256mem>, T8PS;
  let checkVEXPredicate = 1 in
  defm VCVTNEPS2BF16 : VCVTNEPS2BF16_BASE, VEX, T8XS, ExplicitVEXPrefix;
}

def : InstAlias<"vcvtneps2bf16x\t{$src, $dst|$dst, $src}",
                (VCVTNEPS2BF16rr VR128:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtneps2bf16y\t{$src, $dst|$dst, $src}",
                (VCVTNEPS2BF16Yrr VR128:$dst, VR256:$src), 0, "att">;
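
// Illustrative only: vcvtneps2bf16 narrows each f32 lane to bf16 (keeping the
// top 16 bits after rounding, with no exception reporting, per the AVX-NE-CONVERT
// ISA description), so both the 128-bit and 256-bit sources produce an XMM
// result; the memory forms print explicit {x}/{y} suffixes to disambiguate the
// shared mnemonic, and the InstAliases above let AT&T-syntax assembly use the
// same suffixed spellings for the register forms.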