//===-- X86InstrSSE.td - SSE Instruction Set ---------------*- tablegen -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file describes the X86 SSE instruction set, defining the instructions,
// and properties of the instructions which are needed for code generation,
// machine code emission, and analysis.
//
//===----------------------------------------------------------------------===//

// Itinerary pair for a reg-reg / reg-mem instruction form, plus the
// InstrSchedModel write type used by the newer scheduling models.
class OpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm> {
  InstrItinClass rr = arg_rr;
  InstrItinClass rm = arg_rm;
  // InstrSchedModel info.
  X86FoldableSchedWrite Sched = WriteFAdd;
}

// Pairs single-precision (s) and double-precision (d) itineraries.
class SizeItins<OpndItins arg_s, OpndItins arg_d> {
  OpndItins s = arg_s;
  OpndItins d = arg_d;
}

// Like OpndItins, with an extra itinerary for shift-by-immediate forms.
class ShiftOpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm,
                     InstrItinClass arg_ri> {
  InstrItinClass rr = arg_rr;
  InstrItinClass rm = arg_rm;
  InstrItinClass ri = arg_ri;
}

// scalar
let Sched = WriteFAdd in {
def SSE_ALU_F32S : OpndItins<
  IIC_SSE_ALU_F32S_RR, IIC_SSE_ALU_F32S_RM
>;

def SSE_ALU_F64S : OpndItins<
  IIC_SSE_ALU_F64S_RR, IIC_SSE_ALU_F64S_RM
>;
}

def SSE_ALU_ITINS_S : SizeItins<
  SSE_ALU_F32S, SSE_ALU_F64S
>;

let Sched = WriteFMul in {
def SSE_MUL_F32S : OpndItins<
  // Fixed copy/paste typo: the RM itinerary previously named the F64S class.
  IIC_SSE_MUL_F32S_RR, IIC_SSE_MUL_F32S_RM
>;

def SSE_MUL_F64S : OpndItins<
  IIC_SSE_MUL_F64S_RR, IIC_SSE_MUL_F64S_RM
>;
}

def SSE_MUL_ITINS_S : SizeItins<
  SSE_MUL_F32S, SSE_MUL_F64S
>;

let Sched = WriteFDiv in {
def SSE_DIV_F32S : OpndItins<
  // Fixed copy/paste typo: the RM itinerary previously named the F64S class.
  IIC_SSE_DIV_F32S_RR, IIC_SSE_DIV_F32S_RM
>;

def SSE_DIV_F64S : OpndItins<
  IIC_SSE_DIV_F64S_RR, IIC_SSE_DIV_F64S_RM
>;
}

def SSE_DIV_ITINS_S : SizeItins<
  SSE_DIV_F32S, SSE_DIV_F64S
>;

// parallel
let Sched = WriteFAdd in {
def SSE_ALU_F32P : OpndItins<
  IIC_SSE_ALU_F32P_RR, IIC_SSE_ALU_F32P_RM
>;

def SSE_ALU_F64P : OpndItins<
  IIC_SSE_ALU_F64P_RR, IIC_SSE_ALU_F64P_RM
>;
}

def SSE_ALU_ITINS_P : SizeItins<
  SSE_ALU_F32P, SSE_ALU_F64P
>;

let Sched = WriteFMul in {
def SSE_MUL_F32P : OpndItins<
  // Fixed copy/paste typo: the RM itinerary previously named the F64P class.
  IIC_SSE_MUL_F32P_RR, IIC_SSE_MUL_F32P_RM
>;

def SSE_MUL_F64P : OpndItins<
  IIC_SSE_MUL_F64P_RR, IIC_SSE_MUL_F64P_RM
>;
}

def SSE_MUL_ITINS_P : SizeItins<
  SSE_MUL_F32P, SSE_MUL_F64P
>;

let Sched = WriteFDiv in {
def SSE_DIV_F32P : OpndItins<
  // Fixed copy/paste typo: the RM itinerary previously named the F64P class.
  IIC_SSE_DIV_F32P_RR, IIC_SSE_DIV_F32P_RM
>;

def SSE_DIV_F64P : OpndItins<
  IIC_SSE_DIV_F64P_RR, IIC_SSE_DIV_F64P_RM
>;
}

def SSE_DIV_ITINS_P : SizeItins<
  SSE_DIV_F32P, SSE_DIV_F64P
>;

let Sched = WriteVecLogic in
def SSE_VEC_BIT_ITINS_P : OpndItins<
  IIC_SSE_BIT_P_RR, IIC_SSE_BIT_P_RM
>;

def SSE_BIT_ITINS_P : OpndItins<
  IIC_SSE_BIT_P_RR, IIC_SSE_BIT_P_RM
>;

let Sched = WriteVecALU in {
def SSE_INTALU_ITINS_P : OpndItins<
  IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
>;

def SSE_INTALUQ_ITINS_P : OpndItins<
  IIC_SSE_INTALUQ_P_RR, IIC_SSE_INTALUQ_P_RM
>;
}

let Sched = WriteVecIMul in
def SSE_INTMUL_ITINS_P : OpndItins<
  IIC_SSE_INTMUL_P_RR, IIC_SSE_INTMUL_P_RM
>;

def SSE_INTSHIFT_ITINS_P : ShiftOpndItins<
  IIC_SSE_INTSH_P_RR, IIC_SSE_INTSH_P_RM, IIC_SSE_INTSH_P_RI
>;

def SSE_MOVA_ITINS : OpndItins<
  IIC_SSE_MOVA_P_RR, IIC_SSE_MOVA_P_RM
>;

def SSE_MOVU_ITINS : OpndItins<
  IIC_SSE_MOVU_P_RR, IIC_SSE_MOVU_P_RM
>;

def SSE_DPPD_ITINS : OpndItins<
  IIC_SSE_DPPD_RR, IIC_SSE_DPPD_RM
>;

def SSE_DPPS_ITINS : OpndItins<
  // Fixed copy/paste typo: the RM itinerary previously named the DPPD class.
  IIC_SSE_DPPS_RR, IIC_SSE_DPPS_RM
>;

def DEFAULT_ITINS : OpndItins<
  IIC_ALU_NONMEM, IIC_ALU_MEM
>;

def SSE_EXTRACT_ITINS : OpndItins<
  IIC_SSE_EXTRACTPS_RR, IIC_SSE_EXTRACTPS_RM
>;

def SSE_INSERT_ITINS : OpndItins<
  IIC_SSE_INSERTPS_RR, IIC_SSE_INSERTPS_RM
>;

let Sched = WriteMPSAD in
def SSE_MPSADBW_ITINS : OpndItins<
  IIC_SSE_MPSADBW_RR, IIC_SSE_MPSADBW_RM
>;

let Sched = WriteVecIMul in
def SSE_PMULLD_ITINS : OpndItins<
  IIC_SSE_PMULLD_RR, IIC_SSE_PMULLD_RM
>;

// Definitions for backward compatibility.
// The instructions mapped on these definitions uses a different itinerary
// than the actual scheduling model.
let Sched = WriteShuffle in
def DEFAULT_ITINS_SHUFFLESCHED : OpndItins<
  IIC_ALU_NONMEM, IIC_ALU_MEM
>;

let Sched = WriteVecIMul in
def DEFAULT_ITINS_VECIMULSCHED : OpndItins<
  IIC_ALU_NONMEM, IIC_ALU_MEM
>;

let Sched = WriteShuffle in
def SSE_INTALU_ITINS_SHUFF_P : OpndItins<
  IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
>;

let Sched = WriteMPSAD in
def DEFAULT_ITINS_MPSADSCHED : OpndItins<
  IIC_ALU_NONMEM, IIC_ALU_MEM
>;

let Sched = WriteFBlend in
def DEFAULT_ITINS_FBLENDSCHED : OpndItins<
  IIC_ALU_NONMEM, IIC_ALU_MEM
>;

let Sched = WriteBlend in
def DEFAULT_ITINS_BLENDSCHED : OpndItins<
  IIC_ALU_NONMEM, IIC_ALU_MEM
>;

let Sched = WriteVarBlend in
def DEFAULT_ITINS_VARBLENDSCHED : OpndItins<
  IIC_ALU_NONMEM, IIC_ALU_MEM
>;

let Sched = WriteFBlend in
def SSE_INTALU_ITINS_FBLEND_P : OpndItins<
  IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
>;

let Sched = WriteBlend in
def SSE_INTALU_ITINS_BLEND_P : OpndItins<
  IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
>;

//===----------------------------------------------------------------------===//
// SSE 1 & 2 Instructions Classes
//===----------------------------------------------------------------------===//

/// sse12_fp_scalar - SSE 1 & 2 scalar instructions class.
/// Instantiates the rr and rm forms of a two-operand scalar FP op; Is2Addr
/// selects the SSE (destructive two-address) vs AVX (three-operand) asm form.
multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           RegisterClass RC, X86MemOperand x86memop,
                           OpndItins itins,
                           bit Is2Addr = 1> {
  let isCommutable = 1 in {
    def rr : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpNode RC:$src1, RC:$src2))], itins.rr>,
       Sched<[itins.Sched]>;
  }
  def rm : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
     !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
     [(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))], itins.rm>,
     Sched<[itins.Sched.Folded, ReadAfterLd]>;
}

/// sse12_fp_scalar_int - SSE 1 & 2 scalar instructions intrinsics class.
/// The intrinsic name is assembled as int_x86_sse<SSEVer>_<OpcodeStr><FPSizeStr>.
multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr, RegisterClass RC,
                               string asm, string SSEVer, string FPSizeStr,
                               Operand memopr, ComplexPattern mem_cpat,
                               OpndItins itins,
                               bit Is2Addr = 1> {
let isCodeGenOnly = 1 in {
  def rr_Int : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (!cast<Intrinsic>(
                 !strconcat("int_x86_sse", SSEVer, "_", OpcodeStr, FPSizeStr))
             RC:$src1, RC:$src2))], itins.rr>,
       Sched<[itins.Sched]>;
  def rm_Int : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, memopr:$src2),
       !if(Is2Addr,
           !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (!cast<Intrinsic>(!strconcat("int_x86_sse",
                                          SSEVer, "_", OpcodeStr, FPSizeStr))
             RC:$src1, mem_cpat:$src2))], itins.rm>,
       Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
}

/// sse12_fp_packed - SSE 1 & 2 packed instructions class.
multiclass sse12_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           RegisterClass RC, ValueType vt,
                           X86MemOperand x86memop, PatFrag mem_frag,
                           Domain d, OpndItins itins, bit Is2Addr = 1> {
  let isCommutable = 1 in
    def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], itins.rr, d>,
       Sched<[itins.Sched]>;
  let mayLoad = 1 in
    def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))],
       itins.rm, d>,
       Sched<[itins.Sched.Folded, ReadAfterLd]>;
}

/// sse12_fp_packed_logical_rm - SSE 1 & 2 packed instructions class.
/// Patterns are passed in explicitly; no itinerary is attached.
multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d,
                                      string OpcodeStr, X86MemOperand x86memop,
                                      list<dag> pat_rr, list<dag> pat_rm,
                                      bit Is2Addr = 1> {
  let isCommutable = 1, hasSideEffects = 0 in
    def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       pat_rr, NoItinerary, d>,
       Sched<[WriteVecLogic]>;
  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
     !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
     pat_rm, NoItinerary, d>,
     Sched<[WriteVecLogicLd, ReadAfterLd]>;
}

//===----------------------------------------------------------------------===//
// Non-instruction patterns
//===----------------------------------------------------------------------===//
// A vector extract of the first f32/f64 position is a subregister copy
def : Pat<(f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
          (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32)>;
def : Pat<(f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
          (COPY_TO_REGCLASS (v2f64 VR128:$src), FR64)>;

// A 128-bit subvector extract from the first 256-bit vector position
// is a subregister copy that needs no instruction.
def : Pat<(v4i32 (extract_subvector (v8i32 VR256:$src), (iPTR 0))),
          (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm))>;
def : Pat<(v4f32 (extract_subvector (v8f32 VR256:$src), (iPTR 0))),
          (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm))>;

def : Pat<(v2i64 (extract_subvector (v4i64 VR256:$src), (iPTR 0))),
          (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm))>;
def : Pat<(v2f64 (extract_subvector (v4f64 VR256:$src), (iPTR 0))),
          (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm))>;

def : Pat<(v8i16 (extract_subvector (v16i16 VR256:$src), (iPTR 0))),
          (v8i16 (EXTRACT_SUBREG (v16i16 VR256:$src), sub_xmm))>;
def : Pat<(v16i8 (extract_subvector (v32i8 VR256:$src), (iPTR 0))),
          (v16i8 (EXTRACT_SUBREG (v32i8 VR256:$src), sub_xmm))>;

// A 128-bit subvector insert to the first 256-bit vector position
// is a subregister copy that needs no instruction.
let AddedComplexity = 25 in { // to give priority over vinsertf128rm
def : Pat<(insert_subvector undef, (v2i64 VR128:$src), (iPTR 0)),
          (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
def : Pat<(insert_subvector undef, (v2f64 VR128:$src), (iPTR 0)),
          (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
def : Pat<(insert_subvector undef, (v4i32 VR128:$src), (iPTR 0)),
          (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
def : Pat<(insert_subvector undef, (v4f32 VR128:$src), (iPTR 0)),
          (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
def : Pat<(insert_subvector undef, (v8i16 VR128:$src), (iPTR 0)),
          (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
def : Pat<(insert_subvector undef, (v16i8 VR128:$src), (iPTR 0)),
          (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
}

// Implicitly promote a 32-bit scalar to a vector.
def : Pat<(v4f32 (scalar_to_vector FR32:$src)),
          (COPY_TO_REGCLASS FR32:$src, VR128)>;
def : Pat<(v8f32 (scalar_to_vector FR32:$src)),
          (COPY_TO_REGCLASS FR32:$src, VR128)>;
// Implicitly promote a 64-bit scalar to a vector.
def : Pat<(v2f64 (scalar_to_vector FR64:$src)),
          (COPY_TO_REGCLASS FR64:$src, VR128)>;
def : Pat<(v4f64 (scalar_to_vector FR64:$src)),
          (COPY_TO_REGCLASS FR64:$src, VR128)>;

// Bitcasts between 128-bit vector types. Return the original type since
// no instruction is needed for the conversion
let Predicates = [HasSSE2] in {
  def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>;
  def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>;
  def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>;
  def : Pat<(v2i64 (bitconvert (v2f64 VR128:$src))), (v2i64 VR128:$src)>;
  def : Pat<(v2i64 (bitconvert (v4f32 VR128:$src))), (v2i64 VR128:$src)>;
  def : Pat<(v4i32 (bitconvert (v2i64 VR128:$src))), (v4i32 VR128:$src)>;
  def : Pat<(v4i32 (bitconvert (v8i16 VR128:$src))), (v4i32 VR128:$src)>;
  def : Pat<(v4i32 (bitconvert (v16i8 VR128:$src))), (v4i32 VR128:$src)>;
  def : Pat<(v4i32 (bitconvert (v2f64 VR128:$src))), (v4i32 VR128:$src)>;
  def : Pat<(v4i32 (bitconvert (v4f32 VR128:$src))), (v4i32 VR128:$src)>;
  def : Pat<(v8i16 (bitconvert (v2i64 VR128:$src))), (v8i16 VR128:$src)>;
  def : Pat<(v8i16 (bitconvert (v4i32 VR128:$src))), (v8i16 VR128:$src)>;
  def : Pat<(v8i16 (bitconvert (v16i8 VR128:$src))), (v8i16 VR128:$src)>;
  def : Pat<(v8i16 (bitconvert (v2f64 VR128:$src))), (v8i16 VR128:$src)>;
  def : Pat<(v8i16 (bitconvert (v4f32 VR128:$src))), (v8i16 VR128:$src)>;
  def : Pat<(v16i8 (bitconvert (v2i64 VR128:$src))), (v16i8 VR128:$src)>;
  def : Pat<(v16i8 (bitconvert (v4i32 VR128:$src))), (v16i8 VR128:$src)>;
  def : Pat<(v16i8 (bitconvert (v8i16 VR128:$src))), (v16i8 VR128:$src)>;
  def : Pat<(v16i8 (bitconvert (v2f64 VR128:$src))), (v16i8 VR128:$src)>;
  def : Pat<(v16i8 (bitconvert (v4f32 VR128:$src))), (v16i8 VR128:$src)>;
  def : Pat<(v4f32 (bitconvert (v2i64 VR128:$src))), (v4f32 VR128:$src)>;
  def : Pat<(v4f32 (bitconvert (v4i32 VR128:$src))), (v4f32 VR128:$src)>;
  def : Pat<(v4f32 (bitconvert (v8i16 VR128:$src))), (v4f32 VR128:$src)>;
  def : Pat<(v4f32 (bitconvert (v16i8 VR128:$src))), (v4f32 VR128:$src)>;
  def : Pat<(v4f32 (bitconvert (v2f64 VR128:$src))), (v4f32 VR128:$src)>;
  def : Pat<(v2f64 (bitconvert (v2i64 VR128:$src))), (v2f64 VR128:$src)>;
  def : Pat<(v2f64 (bitconvert (v4i32 VR128:$src))), (v2f64 VR128:$src)>;
  def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>;
  def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>;
  def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>;
}

// Bitcasts between 256-bit vector types. Return the original type since
// no instruction is needed for the conversion
let Predicates = [HasAVX] in {
  def : Pat<(v4f64  (bitconvert (v8f32 VR256:$src))),  (v4f64 VR256:$src)>;
  def : Pat<(v4f64  (bitconvert (v8i32 VR256:$src))),  (v4f64 VR256:$src)>;
  def : Pat<(v4f64  (bitconvert (v4i64 VR256:$src))),  (v4f64 VR256:$src)>;
  def : Pat<(v4f64  (bitconvert (v16i16 VR256:$src))), (v4f64 VR256:$src)>;
  def : Pat<(v4f64  (bitconvert (v32i8 VR256:$src))),  (v4f64 VR256:$src)>;
  def : Pat<(v8f32  (bitconvert (v8i32 VR256:$src))),  (v8f32 VR256:$src)>;
  def : Pat<(v8f32  (bitconvert (v4i64 VR256:$src))),  (v8f32 VR256:$src)>;
  def : Pat<(v8f32  (bitconvert (v4f64 VR256:$src))),  (v8f32 VR256:$src)>;
  def : Pat<(v8f32  (bitconvert (v32i8 VR256:$src))),  (v8f32 VR256:$src)>;
  def : Pat<(v8f32  (bitconvert (v16i16 VR256:$src))), (v8f32 VR256:$src)>;
  def : Pat<(v4i64  (bitconvert (v8f32 VR256:$src))),  (v4i64 VR256:$src)>;
  def : Pat<(v4i64  (bitconvert (v8i32 VR256:$src))),  (v4i64 VR256:$src)>;
  def : Pat<(v4i64  (bitconvert (v4f64 VR256:$src))),  (v4i64 VR256:$src)>;
  def : Pat<(v4i64  (bitconvert (v32i8 VR256:$src))),  (v4i64 VR256:$src)>;
  def : Pat<(v4i64  (bitconvert (v16i16 VR256:$src))), (v4i64 VR256:$src)>;
  def : Pat<(v32i8  (bitconvert (v4f64 VR256:$src))),  (v32i8 VR256:$src)>;
  def : Pat<(v32i8  (bitconvert (v4i64 VR256:$src))),  (v32i8 VR256:$src)>;
  def : Pat<(v32i8  (bitconvert (v8f32 VR256:$src))),  (v32i8 VR256:$src)>;
  def : Pat<(v32i8  (bitconvert (v8i32 VR256:$src))),  (v32i8 VR256:$src)>;
  def : Pat<(v32i8  (bitconvert (v16i16 VR256:$src))), (v32i8 VR256:$src)>;
  def : Pat<(v8i32  (bitconvert (v32i8 VR256:$src))),  (v8i32 VR256:$src)>;
  def : Pat<(v8i32  (bitconvert (v16i16 VR256:$src))), (v8i32 VR256:$src)>;
  def : Pat<(v8i32  (bitconvert (v8f32 VR256:$src))),  (v8i32 VR256:$src)>;
  def : Pat<(v8i32  (bitconvert (v4i64 VR256:$src))),  (v8i32 VR256:$src)>;
  def : Pat<(v8i32  (bitconvert (v4f64 VR256:$src))),  (v8i32 VR256:$src)>;
  def : Pat<(v16i16 (bitconvert (v8f32 VR256:$src))),  (v16i16 VR256:$src)>;
  def : Pat<(v16i16 (bitconvert (v8i32 VR256:$src))),  (v16i16 VR256:$src)>;
  def : Pat<(v16i16 (bitconvert (v4i64 VR256:$src))),  (v16i16 VR256:$src)>;
  def : Pat<(v16i16 (bitconvert (v4f64 VR256:$src))),  (v16i16 VR256:$src)>;
  def : Pat<(v16i16 (bitconvert (v32i8 VR256:$src))),  (v16i16 VR256:$src)>;
}

// Alias instructions that map fld0 to xorps for sse or vxorps for avx.
// This is expanded by ExpandPostRAPseudos.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, SchedRW = [WriteZero] in {
  def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "",
                   [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1]>;
  def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "",
                   [(set FR64:$dst, fpimm0)]>, Requires<[HasSSE2]>;
}

//===----------------------------------------------------------------------===//
// AVX & SSE - Zero/One Vectors
//===----------------------------------------------------------------------===//

// Alias instruction that maps zero vector to pxor / xorp* for sse.
// This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then
// swizzled by ExecutionDepsFix to pxor.
// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-zeros value if folding it would be beneficial.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, SchedRW = [WriteZero] in {
def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
               [(set VR128:$dst, (v4f32 immAllZerosV))]>;
}

// All other 128-bit zero vectors reuse the same pseudo.
def : Pat<(v2f64 immAllZerosV), (V_SET0)>;
def : Pat<(v4i32 immAllZerosV), (V_SET0)>;
def : Pat<(v2i64 immAllZerosV), (V_SET0)>;
def : Pat<(v8i16 immAllZerosV), (V_SET0)>;
def : Pat<(v16i8 immAllZerosV), (V_SET0)>;


// The same as done above but for AVX. The 256-bit AVX1 ISA doesn't support PI,
// and doesn't need it because on sandy bridge the register is set to zero
// at the rename stage without using any execution unit, so SET0PSY
// and SET0PDY can be used for vector int instructions without penalty
// NOTE(review): SET0PSY/SET0PDY are historical names; the pseudo below is
// AVX_SET0 — confirm and update the comment's references.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, Predicates = [HasAVX], SchedRW = [WriteZero] in {
def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "",
                 [(set VR256:$dst, (v8f32 immAllZerosV))]>;
}

let Predicates = [HasAVX] in
  def : Pat<(v4f64 immAllZerosV), (AVX_SET0)>;

// 256-bit integer zero vectors require AVX2 forms of the integer xor.
let Predicates = [HasAVX2] in {
  def : Pat<(v4i64 immAllZerosV), (AVX_SET0)>;
  def : Pat<(v8i32 immAllZerosV), (AVX_SET0)>;
  def : Pat<(v16i16 immAllZerosV), (AVX_SET0)>;
  def : Pat<(v32i8 immAllZerosV), (AVX_SET0)>;
}

// AVX1 has no support for 256-bit integer instructions, but since the 128-bit
// VPXOR instruction writes zero to its upper part, it's safe to build zeros.
let Predicates = [HasAVX1Only] in {
def : Pat<(v32i8 immAllZerosV), (SUBREG_TO_REG (i8 0), (V_SET0), sub_xmm)>;
def : Pat<(bc_v32i8 (v8f32 immAllZerosV)),
          (SUBREG_TO_REG (i8 0), (V_SET0), sub_xmm)>;

def : Pat<(v16i16 immAllZerosV), (SUBREG_TO_REG (i16 0), (V_SET0), sub_xmm)>;
def : Pat<(bc_v16i16 (v8f32 immAllZerosV)),
          (SUBREG_TO_REG (i16 0), (V_SET0), sub_xmm)>;

def : Pat<(v8i32 immAllZerosV), (SUBREG_TO_REG (i32 0), (V_SET0), sub_xmm)>;
def : Pat<(bc_v8i32 (v8f32 immAllZerosV)),
          (SUBREG_TO_REG (i32 0), (V_SET0), sub_xmm)>;

def : Pat<(v4i64 immAllZerosV), (SUBREG_TO_REG (i64 0), (V_SET0), sub_xmm)>;
def : Pat<(bc_v4i64 (v8f32 immAllZerosV)),
          (SUBREG_TO_REG (i64 0), (V_SET0), sub_xmm)>;
}

// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-ones value if folding it would be beneficial.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, SchedRW = [WriteZero] in {
  def V_SETALLONES : I<0, Pseudo, (outs VR128:$dst), (ins), "",
                       [(set VR128:$dst, (v4i32 immAllOnesV))]>;
  let Predicates = [HasAVX2] in
  def AVX2_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "",
                          [(set VR256:$dst, (v8i32 immAllOnesV))]>;
}


//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move FP Scalar Instructions
//
// Move Instructions. Register-to-register movss/movsd is not used for FR32/64
// register copies because it's a partial register update; Register-to-register
// movss/movsd is not modeled as an INSERT_SUBREG because INSERT_SUBREG requires
// that the insert be implementable in terms of a copy, and, as just mentioned,
// we don't use movss/movsd for copies.
//===----------------------------------------------------------------------===//

// Shared rr forms of movss/movsd; asm_opr carries the 2-address (SSE) or
// 3-operand (AVX) operand string.
multiclass sse12_move_rr<RegisterClass RC, SDNode OpNode, ValueType vt,
                         X86MemOperand x86memop, string base_opc,
                         string asm_opr> {
  def rr : SI<0x10, MRMSrcReg, (outs VR128:$dst),
              (ins VR128:$src1, RC:$src2),
              !strconcat(base_opc, asm_opr),
              [(set VR128:$dst, (vt (OpNode VR128:$src1,
                                            (scalar_to_vector RC:$src2))))],
              IIC_SSE_MOV_S_RR>, Sched<[WriteFShuffle]>;

  // For the disassembler
  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
  def rr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
                  (ins VR128:$src1, RC:$src2),
                  !strconcat(base_opc, asm_opr),
                  [], IIC_SSE_MOV_S_RR>, Sched<[WriteFShuffle]>;
}

// Instantiates both the AVX (V-prefixed, three-operand) and SSE
// (two-address) rr and mr forms of a scalar move.
multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
                      X86MemOperand x86memop, string OpcodeStr> {
  // AVX
  defm V#NAME : sse12_move_rr<RC, OpNode, vt, x86memop, OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}">,
                              VEX_4V, VEX_LIG;

  def V#NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR>,
                     VEX, VEX_LIG, Sched<[WriteStore]>;
  // SSE1 & 2
  let Constraints = "$src1 = $dst" in {
    defm NAME : sse12_move_rr<RC, OpNode, vt, x86memop, OpcodeStr,
                              "\t{$src2, $dst|$dst, $src2}">;
  }

  def NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                   [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR>,
                   Sched<[WriteStore]>;
}

// Loading from memory automatically zeroing upper bits.
// AVX and SSE scalar-load forms; the scalar load zeroes the upper bits of
// the destination register.
multiclass sse12_move_rm<RegisterClass RC, X86MemOperand x86memop,
                         PatFrag mem_pat, string OpcodeStr> {
  def V#NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(set RC:$dst, (mem_pat addr:$src))],
                     IIC_SSE_MOV_S_RM>, VEX, VEX_LIG, Sched<[WriteLoad]>;
  def NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                   [(set RC:$dst, (mem_pat addr:$src))],
                   IIC_SSE_MOV_S_RM>, Sched<[WriteLoad]>;
}

defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss">, XS;
defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd">, XD;

let canFoldAsLoad = 1, isReMaterializable = 1 in {
  defm MOVSS : sse12_move_rm<FR32, f32mem, loadf32, "movss">, XS;

  let AddedComplexity = 20 in
    defm MOVSD : sse12_move_rm<FR64, f64mem, loadf64, "movsd">, XD;
}

// Patterns
let Predicates = [UseAVX] in {
  let AddedComplexity = 20 in {
  // MOVSSrm zeros the high parts of the register; represent this
  // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
            (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
  def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
            (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
  def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
            (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;

  // MOVSDrm zeros the high parts of the register; represent this
  // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzload addr:$src)),
            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;

  // Represent the same patterns above but in the form they appear for
  // 256-bit types
  def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
                   (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
            (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
  def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
                   (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
            (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
  def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
                   (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
            (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
  }
  def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
                   (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))),
            (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_xmm)>;

  // Extract and store.
  def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
                   addr:$dst),
            (VMOVSSmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32))>;
  def : Pat<(store (f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
                   addr:$dst),
            (VMOVSDmr addr:$dst, (COPY_TO_REGCLASS (v2f64 VR128:$src), FR64))>;

  // Shuffle with VMOVSS
  def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
            (VMOVSSrr (v4i32 VR128:$src1),
                      (COPY_TO_REGCLASS (v4i32 VR128:$src2), FR32))>;
  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
            (VMOVSSrr (v4f32 VR128:$src1),
                      (COPY_TO_REGCLASS (v4f32 VR128:$src2), FR32))>;

  // 256-bit variants
  def : Pat<(v8i32 (X86Movss VR256:$src1, VR256:$src2)),
            (SUBREG_TO_REG (i32 0),
              (VMOVSSrr (EXTRACT_SUBREG (v8i32 VR256:$src1), sub_xmm),
                        (EXTRACT_SUBREG (v8i32 VR256:$src2), sub_xmm)),
              sub_xmm)>;
  def : Pat<(v8f32 (X86Movss VR256:$src1, VR256:$src2)),
            (SUBREG_TO_REG (i32 0),
              (VMOVSSrr (EXTRACT_SUBREG (v8f32 VR256:$src1), sub_xmm),
                        (EXTRACT_SUBREG (v8f32 VR256:$src2), sub_xmm)),
              sub_xmm)>;

  // Shuffle with VMOVSD
  def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;

  // 256-bit variants
  def : Pat<(v4i64 (X86Movsd VR256:$src1, VR256:$src2)),
            (SUBREG_TO_REG (i32 0),
              (VMOVSDrr (EXTRACT_SUBREG (v4i64 VR256:$src1), sub_xmm),
                        (EXTRACT_SUBREG (v4i64 VR256:$src2), sub_xmm)),
              sub_xmm)>;
  def : Pat<(v4f64 (X86Movsd VR256:$src1, VR256:$src2)),
            (SUBREG_TO_REG (i32 0),
              (VMOVSDrr (EXTRACT_SUBREG (v4f64 VR256:$src1), sub_xmm),
                        (EXTRACT_SUBREG (v4f64 VR256:$src2), sub_xmm)),
              sub_xmm)>;

  // FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem
  // is during lowering, where it's not possible to recognize the fold cause
  // it has two uses through a bitcast. One use disappears at isel time and the
  // fold opportunity reappears.
  def : Pat<(v2f64 (X86Movlpd VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v2i64 (X86Movlpd VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
}

let Predicates = [UseSSE1] in {
  let Predicates = [NoSSE41], AddedComplexity = 15 in {
  // Move scalar to XMM zero-extended, zeroing a VR128 then do a
  // MOVSS to the lower bits.
  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
            (MOVSSrr (v4f32 (V_SET0)), FR32:$src)>;
  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
            (MOVSSrr (v4f32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
            (MOVSSrr (v4i32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>;
  }

  let AddedComplexity = 20 in {
  // MOVSSrm already zeros the high parts of the register.
  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
            (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
  def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
            (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
  def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
            (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
  }

  // Extract and store.
  def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
                   addr:$dst),
            (MOVSSmr addr:$dst, (COPY_TO_REGCLASS VR128:$src, FR32))>;

  // Shuffle with MOVSS
  def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
            (MOVSSrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR32))>;
  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
            (MOVSSrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR32))>;
}

let Predicates = [UseSSE2] in {
  let Predicates = [NoSSE41], AddedComplexity = 15 in {
  // Move scalar to XMM zero-extended, zeroing a VR128 then do a
  // MOVSD to the lower bits.
  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
            (MOVSDrr (v2f64 (V_SET0)), FR64:$src)>;
  }

  let AddedComplexity = 20 in {
  // MOVSDrm already zeros the high parts of the register.
  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzload addr:$src)),
            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
  }

  // Extract and store.
775 def : Pat<(store (f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))), 776 addr:$dst), 777 (MOVSDmr addr:$dst, (COPY_TO_REGCLASS VR128:$src, FR64))>; 778 779 // Shuffle with MOVSD 780 def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)), 781 (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; 782 def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)), 783 (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; 784 def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)), 785 (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; 786 def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)), 787 (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; 788 789 // FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem 790 // is during lowering, where it's not possible to recognize the fold cause 791 // it has two uses through a bitcast. One use disappears at isel time and the 792 // fold opportunity reappears. 793 def : Pat<(v2f64 (X86Movlpd VR128:$src1, VR128:$src2)), 794 (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; 795 def : Pat<(v2i64 (X86Movlpd VR128:$src1, VR128:$src2)), 796 (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; 797 def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)), 798 (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; 799 def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)), 800 (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; 801} 802 803//===----------------------------------------------------------------------===// 804// SSE 1 & 2 - Move Aligned/Unaligned FP Instructions 805//===----------------------------------------------------------------------===// 806 807multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC, 808 X86MemOperand x86memop, PatFrag ld_frag, 809 string asm, Domain d, 810 OpndItins itins, 811 bit IsReMaterializable = 1> { 812let hasSideEffects = 0 in 813 def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), 814 
!strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], itins.rr, d>, 815 Sched<[WriteFShuffle]>; 816let canFoldAsLoad = 1, isReMaterializable = IsReMaterializable in 817 def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), 818 !strconcat(asm, "\t{$src, $dst|$dst, $src}"), 819 [(set RC:$dst, (ld_frag addr:$src))], itins.rm, d>, 820 Sched<[WriteLoad]>; 821} 822 823let Predicates = [HasAVX, NoVLX] in { 824defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, 825 "movaps", SSEPackedSingle, SSE_MOVA_ITINS>, 826 PS, VEX; 827defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, 828 "movapd", SSEPackedDouble, SSE_MOVA_ITINS>, 829 PD, VEX; 830defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, 831 "movups", SSEPackedSingle, SSE_MOVU_ITINS>, 832 PS, VEX; 833defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, 834 "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>, 835 PD, VEX; 836 837defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32, 838 "movaps", SSEPackedSingle, SSE_MOVA_ITINS>, 839 PS, VEX, VEX_L; 840defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64, 841 "movapd", SSEPackedDouble, SSE_MOVA_ITINS>, 842 PD, VEX, VEX_L; 843defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32, 844 "movups", SSEPackedSingle, SSE_MOVU_ITINS>, 845 PS, VEX, VEX_L; 846defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64, 847 "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>, 848 PD, VEX, VEX_L; 849} 850 851let Predicates = [UseSSE1] in { 852defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, 853 "movaps", SSEPackedSingle, SSE_MOVA_ITINS>, 854 PS; 855defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, 856 "movups", SSEPackedSingle, SSE_MOVU_ITINS>, 857 PS; 858} 859let Predicates = [UseSSE2] in { 860defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, 861 "movapd", SSEPackedDouble, SSE_MOVA_ITINS>, 862 PD; 863defm MOVUPD : 
sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, 864 "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>, 865 PD; 866} 867 868let SchedRW = [WriteStore], Predicates = [HasAVX, NoVLX] in { 869def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), 870 "movaps\t{$src, $dst|$dst, $src}", 871 [(alignedstore (v4f32 VR128:$src), addr:$dst)], 872 IIC_SSE_MOVA_P_MR>, VEX; 873def VMOVAPDmr : VPDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), 874 "movapd\t{$src, $dst|$dst, $src}", 875 [(alignedstore (v2f64 VR128:$src), addr:$dst)], 876 IIC_SSE_MOVA_P_MR>, VEX; 877def VMOVUPSmr : VPSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), 878 "movups\t{$src, $dst|$dst, $src}", 879 [(store (v4f32 VR128:$src), addr:$dst)], 880 IIC_SSE_MOVU_P_MR>, VEX; 881def VMOVUPDmr : VPDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), 882 "movupd\t{$src, $dst|$dst, $src}", 883 [(store (v2f64 VR128:$src), addr:$dst)], 884 IIC_SSE_MOVU_P_MR>, VEX; 885def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), 886 "movaps\t{$src, $dst|$dst, $src}", 887 [(alignedstore256 (v8f32 VR256:$src), addr:$dst)], 888 IIC_SSE_MOVA_P_MR>, VEX, VEX_L; 889def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), 890 "movapd\t{$src, $dst|$dst, $src}", 891 [(alignedstore256 (v4f64 VR256:$src), addr:$dst)], 892 IIC_SSE_MOVA_P_MR>, VEX, VEX_L; 893def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), 894 "movups\t{$src, $dst|$dst, $src}", 895 [(store (v8f32 VR256:$src), addr:$dst)], 896 IIC_SSE_MOVU_P_MR>, VEX, VEX_L; 897def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), 898 "movupd\t{$src, $dst|$dst, $src}", 899 [(store (v4f64 VR256:$src), addr:$dst)], 900 IIC_SSE_MOVU_P_MR>, VEX, VEX_L; 901} // SchedRW 902 903// For disassembler 904let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, 905 SchedRW = [WriteFShuffle] in { 906 def VMOVAPSrr_REV : VPSI<0x29, 
MRMDestReg, (outs VR128:$dst), 907 (ins VR128:$src), 908 "movaps\t{$src, $dst|$dst, $src}", [], 909 IIC_SSE_MOVA_P_RR>, VEX; 910 def VMOVAPDrr_REV : VPDI<0x29, MRMDestReg, (outs VR128:$dst), 911 (ins VR128:$src), 912 "movapd\t{$src, $dst|$dst, $src}", [], 913 IIC_SSE_MOVA_P_RR>, VEX; 914 def VMOVUPSrr_REV : VPSI<0x11, MRMDestReg, (outs VR128:$dst), 915 (ins VR128:$src), 916 "movups\t{$src, $dst|$dst, $src}", [], 917 IIC_SSE_MOVU_P_RR>, VEX; 918 def VMOVUPDrr_REV : VPDI<0x11, MRMDestReg, (outs VR128:$dst), 919 (ins VR128:$src), 920 "movupd\t{$src, $dst|$dst, $src}", [], 921 IIC_SSE_MOVU_P_RR>, VEX; 922 def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst), 923 (ins VR256:$src), 924 "movaps\t{$src, $dst|$dst, $src}", [], 925 IIC_SSE_MOVA_P_RR>, VEX, VEX_L; 926 def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst), 927 (ins VR256:$src), 928 "movapd\t{$src, $dst|$dst, $src}", [], 929 IIC_SSE_MOVA_P_RR>, VEX, VEX_L; 930 def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst), 931 (ins VR256:$src), 932 "movups\t{$src, $dst|$dst, $src}", [], 933 IIC_SSE_MOVU_P_RR>, VEX, VEX_L; 934 def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst), 935 (ins VR256:$src), 936 "movupd\t{$src, $dst|$dst, $src}", [], 937 IIC_SSE_MOVU_P_RR>, VEX, VEX_L; 938} 939 940let Predicates = [HasAVX] in { 941def : Pat<(v8i32 (X86vzmovl 942 (insert_subvector undef, (v4i32 VR128:$src), (iPTR 0)))), 943 (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>; 944def : Pat<(v4i64 (X86vzmovl 945 (insert_subvector undef, (v2i64 VR128:$src), (iPTR 0)))), 946 (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>; 947def : Pat<(v8f32 (X86vzmovl 948 (insert_subvector undef, (v4f32 VR128:$src), (iPTR 0)))), 949 (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>; 950def : Pat<(v4f64 (X86vzmovl 951 (insert_subvector undef, (v2f64 VR128:$src), (iPTR 0)))), 952 (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>; 953} 954 955 956def : Pat<(int_x86_avx_storeu_ps_256 
addr:$dst, VR256:$src), 957 (VMOVUPSYmr addr:$dst, VR256:$src)>; 958def : Pat<(int_x86_avx_storeu_pd_256 addr:$dst, VR256:$src), 959 (VMOVUPDYmr addr:$dst, VR256:$src)>; 960 961let SchedRW = [WriteStore] in { 962def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), 963 "movaps\t{$src, $dst|$dst, $src}", 964 [(alignedstore (v4f32 VR128:$src), addr:$dst)], 965 IIC_SSE_MOVA_P_MR>; 966def MOVAPDmr : PDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), 967 "movapd\t{$src, $dst|$dst, $src}", 968 [(alignedstore (v2f64 VR128:$src), addr:$dst)], 969 IIC_SSE_MOVA_P_MR>; 970def MOVUPSmr : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), 971 "movups\t{$src, $dst|$dst, $src}", 972 [(store (v4f32 VR128:$src), addr:$dst)], 973 IIC_SSE_MOVU_P_MR>; 974def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), 975 "movupd\t{$src, $dst|$dst, $src}", 976 [(store (v2f64 VR128:$src), addr:$dst)], 977 IIC_SSE_MOVU_P_MR>; 978} // SchedRW 979 980// For disassembler 981let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, 982 SchedRW = [WriteFShuffle] in { 983 def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), 984 "movaps\t{$src, $dst|$dst, $src}", [], 985 IIC_SSE_MOVA_P_RR>; 986 def MOVAPDrr_REV : PDI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), 987 "movapd\t{$src, $dst|$dst, $src}", [], 988 IIC_SSE_MOVA_P_RR>; 989 def MOVUPSrr_REV : PSI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), 990 "movups\t{$src, $dst|$dst, $src}", [], 991 IIC_SSE_MOVU_P_RR>; 992 def MOVUPDrr_REV : PDI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), 993 "movupd\t{$src, $dst|$dst, $src}", [], 994 IIC_SSE_MOVU_P_RR>; 995} 996 997let Predicates = [HasAVX] in { 998 def : Pat<(int_x86_sse_storeu_ps addr:$dst, VR128:$src), 999 (VMOVUPSmr addr:$dst, VR128:$src)>; 1000 def : Pat<(int_x86_sse2_storeu_pd addr:$dst, VR128:$src), 1001 (VMOVUPDmr addr:$dst, VR128:$src)>; 1002} 1003 1004let Predicates = 
[UseSSE1] in 1005 def : Pat<(int_x86_sse_storeu_ps addr:$dst, VR128:$src), 1006 (MOVUPSmr addr:$dst, VR128:$src)>; 1007let Predicates = [UseSSE2] in 1008 def : Pat<(int_x86_sse2_storeu_pd addr:$dst, VR128:$src), 1009 (MOVUPDmr addr:$dst, VR128:$src)>; 1010 1011// Use vmovaps/vmovups for AVX integer load/store. 1012let Predicates = [HasAVX, NoVLX] in { 1013 // 128-bit load/store 1014 def : Pat<(alignedloadv2i64 addr:$src), 1015 (VMOVAPSrm addr:$src)>; 1016 def : Pat<(loadv2i64 addr:$src), 1017 (VMOVUPSrm addr:$src)>; 1018 1019 def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst), 1020 (VMOVAPSmr addr:$dst, VR128:$src)>; 1021 def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst), 1022 (VMOVAPSmr addr:$dst, VR128:$src)>; 1023 def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst), 1024 (VMOVAPSmr addr:$dst, VR128:$src)>; 1025 def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst), 1026 (VMOVAPSmr addr:$dst, VR128:$src)>; 1027 def : Pat<(store (v2i64 VR128:$src), addr:$dst), 1028 (VMOVUPSmr addr:$dst, VR128:$src)>; 1029 def : Pat<(store (v4i32 VR128:$src), addr:$dst), 1030 (VMOVUPSmr addr:$dst, VR128:$src)>; 1031 def : Pat<(store (v8i16 VR128:$src), addr:$dst), 1032 (VMOVUPSmr addr:$dst, VR128:$src)>; 1033 def : Pat<(store (v16i8 VR128:$src), addr:$dst), 1034 (VMOVUPSmr addr:$dst, VR128:$src)>; 1035 1036 // 256-bit load/store 1037 def : Pat<(alignedloadv4i64 addr:$src), 1038 (VMOVAPSYrm addr:$src)>; 1039 def : Pat<(loadv4i64 addr:$src), 1040 (VMOVUPSYrm addr:$src)>; 1041 def : Pat<(alignedstore256 (v4i64 VR256:$src), addr:$dst), 1042 (VMOVAPSYmr addr:$dst, VR256:$src)>; 1043 def : Pat<(alignedstore256 (v8i32 VR256:$src), addr:$dst), 1044 (VMOVAPSYmr addr:$dst, VR256:$src)>; 1045 def : Pat<(alignedstore256 (v16i16 VR256:$src), addr:$dst), 1046 (VMOVAPSYmr addr:$dst, VR256:$src)>; 1047 def : Pat<(alignedstore256 (v32i8 VR256:$src), addr:$dst), 1048 (VMOVAPSYmr addr:$dst, VR256:$src)>; 1049 def : Pat<(store (v4i64 VR256:$src), addr:$dst), 1050 (VMOVUPSYmr addr:$dst, 
VR256:$src)>; 1051 def : Pat<(store (v8i32 VR256:$src), addr:$dst), 1052 (VMOVUPSYmr addr:$dst, VR256:$src)>; 1053 def : Pat<(store (v16i16 VR256:$src), addr:$dst), 1054 (VMOVUPSYmr addr:$dst, VR256:$src)>; 1055 def : Pat<(store (v32i8 VR256:$src), addr:$dst), 1056 (VMOVUPSYmr addr:$dst, VR256:$src)>; 1057 1058 // Special patterns for storing subvector extracts of lower 128-bits 1059 // Its cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr 1060 def : Pat<(alignedstore (v2f64 (extract_subvector 1061 (v4f64 VR256:$src), (iPTR 0))), addr:$dst), 1062 (VMOVAPDmr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; 1063 def : Pat<(alignedstore (v4f32 (extract_subvector 1064 (v8f32 VR256:$src), (iPTR 0))), addr:$dst), 1065 (VMOVAPSmr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; 1066 def : Pat<(alignedstore (v2i64 (extract_subvector 1067 (v4i64 VR256:$src), (iPTR 0))), addr:$dst), 1068 (VMOVAPDmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; 1069 def : Pat<(alignedstore (v4i32 (extract_subvector 1070 (v8i32 VR256:$src), (iPTR 0))), addr:$dst), 1071 (VMOVAPSmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; 1072 def : Pat<(alignedstore (v8i16 (extract_subvector 1073 (v16i16 VR256:$src), (iPTR 0))), addr:$dst), 1074 (VMOVAPSmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; 1075 def : Pat<(alignedstore (v16i8 (extract_subvector 1076 (v32i8 VR256:$src), (iPTR 0))), addr:$dst), 1077 (VMOVAPSmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; 1078 1079 def : Pat<(store (v2f64 (extract_subvector 1080 (v4f64 VR256:$src), (iPTR 0))), addr:$dst), 1081 (VMOVUPDmr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; 1082 def : Pat<(store (v4f32 (extract_subvector 1083 (v8f32 VR256:$src), (iPTR 0))), addr:$dst), 1084 (VMOVUPSmr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; 1085 def : Pat<(store (v2i64 (extract_subvector 1086 (v4i64 VR256:$src), (iPTR 0))), addr:$dst), 1087 (VMOVUPDmr addr:$dst, 
(v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; 1088 def : Pat<(store (v4i32 (extract_subvector 1089 (v8i32 VR256:$src), (iPTR 0))), addr:$dst), 1090 (VMOVUPSmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; 1091 def : Pat<(store (v8i16 (extract_subvector 1092 (v16i16 VR256:$src), (iPTR 0))), addr:$dst), 1093 (VMOVUPSmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; 1094 def : Pat<(store (v16i8 (extract_subvector 1095 (v32i8 VR256:$src), (iPTR 0))), addr:$dst), 1096 (VMOVUPSmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; 1097} 1098 1099// Use movaps / movups for SSE integer load / store (one byte shorter). 1100// The instructions selected below are then converted to MOVDQA/MOVDQU 1101// during the SSE domain pass. 1102let Predicates = [UseSSE1] in { 1103 def : Pat<(alignedloadv2i64 addr:$src), 1104 (MOVAPSrm addr:$src)>; 1105 def : Pat<(loadv2i64 addr:$src), 1106 (MOVUPSrm addr:$src)>; 1107 1108 def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst), 1109 (MOVAPSmr addr:$dst, VR128:$src)>; 1110 def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst), 1111 (MOVAPSmr addr:$dst, VR128:$src)>; 1112 def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst), 1113 (MOVAPSmr addr:$dst, VR128:$src)>; 1114 def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst), 1115 (MOVAPSmr addr:$dst, VR128:$src)>; 1116 def : Pat<(store (v2i64 VR128:$src), addr:$dst), 1117 (MOVUPSmr addr:$dst, VR128:$src)>; 1118 def : Pat<(store (v4i32 VR128:$src), addr:$dst), 1119 (MOVUPSmr addr:$dst, VR128:$src)>; 1120 def : Pat<(store (v8i16 VR128:$src), addr:$dst), 1121 (MOVUPSmr addr:$dst, VR128:$src)>; 1122 def : Pat<(store (v16i8 VR128:$src), addr:$dst), 1123 (MOVUPSmr addr:$dst, VR128:$src)>; 1124} 1125 1126// Alias instruction to load FR32 or FR64 from f128mem using movaps. Upper 1127// bits are disregarded. FIXME: Set encoding to pseudo! 
// Re-materializable aliases that load a scalar FR32/FR64 from a 128-bit
// memory operand via the full movaps/movapd encodings; the upper bits of the
// destination are disregarded (see the FIXME above about using a pseudo).
// isCodeGenOnly keeps these out of the assembler/disassembler tables.
let canFoldAsLoad = 1, isReMaterializable = 1, SchedRW = [WriteLoad] in {
let isCodeGenOnly = 1 in {
  // VEX-encoded (AVX) forms.
  def FsVMOVAPSrm : VPSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src),
                         "movaps\t{$src, $dst|$dst, $src}",
                         [(set FR32:$dst, (alignedloadfsf32 addr:$src))],
                         IIC_SSE_MOVA_P_RM>, VEX;
  def FsVMOVAPDrm : VPDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src),
                         "movapd\t{$src, $dst|$dst, $src}",
                         [(set FR64:$dst, (alignedloadfsf64 addr:$src))],
                         IIC_SSE_MOVA_P_RM>, VEX;
  // Legacy SSE forms.
  def FsMOVAPSrm : PSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src),
                       "movaps\t{$src, $dst|$dst, $src}",
                       [(set FR32:$dst, (alignedloadfsf32 addr:$src))],
                       IIC_SSE_MOVA_P_RM>;
  def FsMOVAPDrm : PDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src),
                       "movapd\t{$src, $dst|$dst, $src}",
                       [(set FR64:$dst, (alignedloadfsf64 addr:$src))],
                       IIC_SSE_MOVA_P_RM>;
}
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Low packed FP Instructions
//===----------------------------------------------------------------------===//

// Memory forms shared by MOVLP and MOVHP: a "...ps" variant that folds a
// 64-bit load (bitcast to v4f32) into one half of a v4f32 vector, and a
// "...pd" variant doing the same for v2f64.  psnode/pdnode select the
// shuffle node each variant implements; asm_opr supplies the operand string
// (three-operand for VEX, tied two-operand for legacy SSE).
multiclass sse12_mov_hilo_packed_base<bits<8>opc, SDNode psnode, SDNode pdnode,
                                      string base_opc, string asm_opr,
                                      InstrItinClass itin> {
  def PSrm : PI<opc, MRMSrcMem,
         (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
         !strconcat(base_opc, "s", asm_opr),
     [(set VR128:$dst,
       (psnode VR128:$src1,
              (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))))],
              itin, SSEPackedSingle>, PS,
     Sched<[WriteFShuffleLd, ReadAfterLd]>;

  def PDrm : PI<opc, MRMSrcMem,
         (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
         !strconcat(base_opc, "d", asm_opr),
     [(set VR128:$dst, (v2f64 (pdnode VR128:$src1,
                              (scalar_to_vector (loadf64 addr:$src2)))))],
              itin, SSEPackedDouble>, PD,
     Sched<[WriteFShuffleLd, ReadAfterLd]>;

}

// Instantiates both the VEX three-operand form (prefixed "V" via V#NAME)
// and the legacy form, which ties $src1 to $dst.
multiclass sse12_mov_hilo_packed<bits<8>opc, SDNode psnode, SDNode pdnode,
                                 string base_opc, InstrItinClass itin> {
  defm V#NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc,
                                    "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                                    itin>, VEX_4V;

let Constraints = "$src1 = $dst" in
  defm NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc,
                                    "\t{$src2, $dst|$dst, $src2}",
                                    itin>;
}

let AddedComplexity = 20 in {
  defm MOVL : sse12_mov_hilo_packed<0x12, X86Movlps, X86Movlpd, "movlp",
                                    IIC_SSE_MOV_LH>;
}

// Store forms: write the low 64 bits of the source XMM register to memory.
let SchedRW = [WriteStore] in {
def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                     "movlps\t{$src, $dst|$dst, $src}",
                     [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128:$src)),
                                   (iPTR 0))), addr:$dst)],
                     IIC_SSE_MOV_LH>, VEX;
def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                     "movlpd\t{$src, $dst|$dst, $src}",
                     [(store (f64 (vector_extract (v2f64 VR128:$src),
                                   (iPTR 0))), addr:$dst)],
                     IIC_SSE_MOV_LH>, VEX;
def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movlps\t{$src, $dst|$dst, $src}",
                   [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128:$src)),
                                 (iPTR 0))), addr:$dst)],
                   IIC_SSE_MOV_LH>;
def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movlpd\t{$src, $dst|$dst, $src}",
                   [(store (f64 (vector_extract (v2f64 VR128:$src),
                                 (iPTR 0))), addr:$dst)],
                   IIC_SSE_MOV_LH>;
} // SchedRW

// Load-folding and store-folding patterns for the VEX-encoded forms.
let Predicates = [HasAVX] in {
  // Shuffle with VMOVLPS
  def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))),
            (VMOVLPSrm VR128:$src1, addr:$src2)>;
  def : Pat<(v4i32 (X86Movlps VR128:$src1, (load addr:$src2))),
            (VMOVLPSrm VR128:$src1, addr:$src2)>;

  // Shuffle with VMOVLPD
  def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))),
            (VMOVLPDrm VR128:$src1, addr:$src2)>;
  def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))),
            (VMOVLPDrm VR128:$src1, addr:$src2)>;
  def : Pat<(v2f64 (X86Movsd VR128:$src1,
                             (v2f64 (scalar_to_vector (loadf64 addr:$src2))))),
            (VMOVLPDrm VR128:$src1, addr:$src2)>;

  // Store patterns
  def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)),
                   addr:$src1),
            (VMOVLPSmr addr:$src1, VR128:$src2)>;
  def : Pat<(store (v4i32 (X86Movlps
                   (bc_v4i32 (loadv2i64 addr:$src1)), VR128:$src2)),
                   addr:$src1),
            (VMOVLPSmr addr:$src1, VR128:$src2)>;
  def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128:$src2)),
                   addr:$src1),
            (VMOVLPDmr addr:$src1, VR128:$src2)>;
  def : Pat<(store (v2i64 (X86Movlpd (load addr:$src1), VR128:$src2)),
                   addr:$src1),
            (VMOVLPDmr addr:$src1, VR128:$src2)>;
}

// Legacy SSE1 counterparts of the patterns above.
let Predicates = [UseSSE1] in {
  // (store (vector_shuffle (load addr), v2, <4, 5, 2, 3>), addr) using MOVLPS
  def : Pat<(store (i64 (vector_extract (bc_v2i64 (v4f32 VR128:$src2)),
                                 (iPTR 0))), addr:$src1),
            (MOVLPSmr addr:$src1, VR128:$src2)>;

  // Shuffle with MOVLPS
  def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))),
            (MOVLPSrm VR128:$src1, addr:$src2)>;
  def : Pat<(v4i32 (X86Movlps VR128:$src1, (load addr:$src2))),
            (MOVLPSrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86Movlps VR128:$src1,
                 (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
            (MOVLPSrm VR128:$src1, addr:$src2)>;

  // Store patterns
  def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)),
                   addr:$src1),
            (MOVLPSmr addr:$src1, VR128:$src2)>;
  def : Pat<(store (v4i32 (X86Movlps
                   (bc_v4i32 (loadv2i64 addr:$src1)), VR128:$src2)),
                   addr:$src1),
            (MOVLPSmr addr:$src1, VR128:$src2)>;
}

// Legacy SSE2 counterparts for the double-precision forms.
let Predicates = [UseSSE2] in {
  // Shuffle with MOVLPD
  def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))),
            (MOVLPDrm VR128:$src1, addr:$src2)>;
  def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))),
            (MOVLPDrm VR128:$src1, addr:$src2)>;
  def : Pat<(v2f64 (X86Movsd VR128:$src1,
                             (v2f64 (scalar_to_vector (loadf64 addr:$src2))))),
            (MOVLPDrm VR128:$src1, addr:$src2)>;

  // Store patterns
  def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128:$src2)),
                   addr:$src1),
            (MOVLPDmr addr:$src1, VR128:$src2)>;
  def : Pat<(store (v2i64 (X86Movlpd (load addr:$src1), VR128:$src2)),
                   addr:$src1),
            (MOVLPDmr addr:$src1, VR128:$src2)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Hi packed FP Instructions
//===----------------------------------------------------------------------===//

let AddedComplexity = 20 in {
  defm MOVH : sse12_mov_hilo_packed<0x16, X86Movlhps, X86Movlhpd, "movhp",
                                    IIC_SSE_MOV_LH>;
}

let SchedRW = [WriteStore] in {
// v2f64 extract element 1 is always custom lowered to unpack high to low
// and extract element 0 so the non-store version isn't too horrible.
// Store forms: write the high 64 bits of the source XMM register to memory.
// The pattern extracts element 0 of an unpack-high so that the value being
// stored is the register's upper half.
def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movhps\t{$src, $dst|$dst, $src}",
                   [(store (f64 (vector_extract
                                 (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)),
                                            (bc_v2f64 (v4f32 VR128:$src))),
                                 (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX;
def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movhpd\t{$src, $dst|$dst, $src}",
                   [(store (f64 (vector_extract
                                 (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
                                 (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX;
def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movhps\t{$src, $dst|$dst, $src}",
                   [(store (f64 (vector_extract
                                 (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)),
                                            (bc_v2f64 (v4f32 VR128:$src))),
                                 (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>;
def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movhpd\t{$src, $dst|$dst, $src}",
                   [(store (f64 (vector_extract
                                 (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
                                 (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>;
} // SchedRW

let Predicates = [HasAVX] in {
  // VMOVHPS patterns
  def : Pat<(X86Movlhps VR128:$src1,
                 (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
            (VMOVHPSrm VR128:$src1, addr:$src2)>;
  // Bitcast the zero-extending load to v4f32 (not v4i32) so this matches the
  // same typed form as the equivalent UseSSE1 MOVHPS pattern.
  def : Pat<(X86Movlhps VR128:$src1,
                 (bc_v4f32 (v2i64 (X86vzload addr:$src2)))),
            (VMOVHPSrm VR128:$src1, addr:$src2)>;

  // VMOVHPD patterns

  // FIXME: Instead of X86Unpckl, there should be a X86Movlhpd here, the problem
  // is during lowering, where it's not possible to recognize the load fold
  // cause it has two uses through a bitcast. One use disappears at isel time
  // and the fold opportunity reappears.
  def : Pat<(v2f64 (X86Unpckl VR128:$src1,
                 (scalar_to_vector (loadf64 addr:$src2)))),
            (VMOVHPDrm VR128:$src1, addr:$src2)>;
  // Also handle an i64 load because that may get selected as a faster way to
  // load the data.
  def : Pat<(v2f64 (X86Unpckl VR128:$src1,
                 (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
            (VMOVHPDrm VR128:$src1, addr:$src2)>;

  def : Pat<(store (f64 (vector_extract
                          (v2f64 (X86VPermilpi VR128:$src, (i8 1))),
                          (iPTR 0))), addr:$dst),
            (VMOVHPDmr addr:$dst, VR128:$src)>;
}

let Predicates = [UseSSE1] in {
  // MOVHPS patterns
  def : Pat<(X86Movlhps VR128:$src1,
                 (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
            (MOVHPSrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86Movlhps VR128:$src1,
                 (bc_v4f32 (v2i64 (X86vzload addr:$src2)))),
            (MOVHPSrm VR128:$src1, addr:$src2)>;
}

let Predicates = [UseSSE2] in {
  // MOVHPD patterns

  // FIXME: Instead of X86Unpckl, there should be a X86Movlhpd here, the problem
  // is during lowering, where it's not possible to recognize the load fold
  // cause it has two uses through a bitcast. One use disappears at isel time
  // and the fold opportunity reappears.
  def : Pat<(v2f64 (X86Unpckl VR128:$src1,
                 (scalar_to_vector (loadf64 addr:$src2)))),
            (MOVHPDrm VR128:$src1, addr:$src2)>;
  // Also handle an i64 load because that may get selected as a faster way to
  // load the data.
  def : Pat<(v2f64 (X86Unpckl VR128:$src1,
                 (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
            (MOVHPDrm VR128:$src1, addr:$src2)>;

  // Store the high half via MOVHPD instead of shuffling it down first.
  def : Pat<(store (f64 (vector_extract
                          (v2f64 (X86Shufp VR128:$src, VR128:$src, (i8 1))),
                          (iPTR 0))), addr:$dst),
            (MOVHPDmr addr:$dst, VR128:$src)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Low to High and High to Low packed FP Instructions
//===----------------------------------------------------------------------===//

// Register-to-register MOVLHPS/MOVHLPS; the VEX forms take three operands,
// the legacy forms tie $src1 to $dst.
let AddedComplexity = 20, Predicates = [UseAVX] in {
  def VMOVLHPSrr : VPSI<0x16, MRMSrcReg, (outs VR128:$dst),
                                       (ins VR128:$src1, VR128:$src2),
                      "movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))],
                        IIC_SSE_MOV_LH>,
                      VEX_4V, Sched<[WriteFShuffle]>;
  def VMOVHLPSrr : VPSI<0x12, MRMSrcReg, (outs VR128:$dst),
                                       (ins VR128:$src1, VR128:$src2),
                      "movhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))],
                        IIC_SSE_MOV_LH>,
                      VEX_4V, Sched<[WriteFShuffle]>;
}
let Constraints = "$src1 = $dst", AddedComplexity = 20 in {
  def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst),
                                     (ins VR128:$src1, VR128:$src2),
                      "movlhps\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))],
                        IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>;
  def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst),
                                     (ins VR128:$src1, VR128:$src2),
                      "movhlps\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))],
                        IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>;
}

// Integer-typed shuffles map onto the same FP instructions.
let Predicates = [UseAVX] in {
  // MOVLHPS patterns
  def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)),
            (VMOVLHPSrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)),
            (VMOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>;

  // MOVHLPS patterns
  def : Pat<(v4i32 (X86Movhlps VR128:$src1, VR128:$src2)),
            (VMOVHLPSrr VR128:$src1, VR128:$src2)>;
}

let Predicates = [UseSSE1] in {
  // MOVLHPS patterns
  def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)),
            (MOVLHPSrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)),
            (MOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>;

  // MOVHLPS patterns
  def : Pat<(v4i32 (X86Movhlps VR128:$src1, VR128:$src2)),
            (MOVHLPSrr VR128:$src1, VR128:$src2)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Conversion Instructions
//===----------------------------------------------------------------------===//

// Itineraries for the conversion instructions.
// NOTE(review): SSE_CVT_PD has no Sched override and so inherits the
// OpndItins default (WriteFAdd), unlike the other CVT itineraries below
// which override to WriteCvt* -- possibly an oversight; confirm intent.
def SSE_CVT_PD : OpndItins<
  IIC_SSE_CVT_PD_RR, IIC_SSE_CVT_PD_RM
>;

let Sched = WriteCvtI2F in
def SSE_CVT_PS : OpndItins<
  IIC_SSE_CVT_PS_RR, IIC_SSE_CVT_PS_RM
>;

let Sched = WriteCvtI2F in
def SSE_CVT_Scalar : OpndItins<
  IIC_SSE_CVT_Scalar_RR, IIC_SSE_CVT_Scalar_RM
>;

let Sched = WriteCvtF2I in
def SSE_CVT_SS2SI_32 : OpndItins<
  IIC_SSE_CVT_SS2SI32_RR, IIC_SSE_CVT_SS2SI32_RM
>;

let Sched = WriteCvtF2I in
def SSE_CVT_SS2SI_64 : OpndItins<
  IIC_SSE_CVT_SS2SI64_RR, IIC_SSE_CVT_SS2SI64_RM
>;

let Sched = WriteCvtF2I in
def SSE_CVT_SD2SI : OpndItins<
  IIC_SSE_CVT_SD2SI_RR, IIC_SSE_CVT_SD2SI_RM
>;

// Scalar conversions with a pattern: rr converts a register, rm folds a
// load of the source type (ld_frag) into the conversion.
multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                     SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag,
                     string asm, OpndItins itins> {
  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
                        [(set DstRC:$dst, (OpNode SrcRC:$src))],
                        itins.rr>, Sched<[itins.Sched]>;
  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
                        [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))],
                        itins.rm>, Sched<[itins.Sched.Folded]>;
}

// Packed conversions with no pattern (selected elsewhere); hasSideEffects = 0
// because the instructions have no modelled behavior here.
multiclass sse12_cvt_p<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                       X86MemOperand x86memop, string asm, Domain d,
                       OpndItins itins> {
let hasSideEffects = 0 in {
  def rr : I<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
             [], itins.rr, d>, Sched<[itins.Sched]>;
  let mayLoad = 1 in
  def rm : I<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
             [], itins.rm, d>, Sched<[itins.Sched.Folded]>;
}
}

// AVX three-operand scalar conversions; $src1 supplies the upper elements
// of the destination.  No patterns: selected via the Pat<>s that follow.
multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                          X86MemOperand x86memop, string asm> {
let hasSideEffects = 0, Predicates = [UseAVX] in {
  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src),
              !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
              Sched<[WriteCvtI2F]>;
  let mayLoad = 1 in
  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
              (ins DstRC:$src1, x86memop:$src),
              !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
              Sched<[WriteCvtI2FLd, ReadAfterLd]>;
} // hasSideEffects = 0
}

// Truncating FP-to-signed-integer conversions, VEX-encoded.
let Predicates = [UseAVX] in {
defm VCVTTSS2SI   : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
                                "cvttss2si\t{$src, $dst|$dst, $src}",
                                SSE_CVT_SS2SI_32>,
                                XS, VEX, VEX_LIG;
defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
                                "cvttss2si\t{$src, $dst|$dst, $src}",
                                SSE_CVT_SS2SI_64>,
                                XS, VEX, VEX_W, VEX_LIG;
defm VCVTTSD2SI   : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
                                "cvttsd2si\t{$src, $dst|$dst, $src}",
                                SSE_CVT_SD2SI>,
                                XD, VEX, VEX_LIG;
defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
                                "cvttsd2si\t{$src, $dst|$dst, $src}",
                                SSE_CVT_SD2SI>,
                                XD, VEX, VEX_W, VEX_LIG;

// Explicit {l}/{q} suffixed aliases for assembly disambiguation.
def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SIrr GR32:$dst, FR32:$src), 0>;
def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SIrm GR32:$dst, f32mem:$src), 0>;
def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SIrr GR32:$dst, FR64:$src), 0>;
def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SIrm GR32:$dst, f64mem:$src), 0>;
def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SI64rr GR64:$dst, FR32:$src), 0>;
def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SI64rm GR64:$dst, f32mem:$src), 0>;
def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SI64rr GR64:$dst, FR64:$src), 0>;
def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SI64rm GR64:$dst, f64mem:$src), 0>;
}
// The assembler can recognize rr 64-bit instructions by seeing a rxx
// register, but the same isn't true when only using memory operands,
// provide other assembly "l" and "q" forms to address this explicitly
// where appropriate to do so.
1556defm VCVTSI2SS : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss{l}">, 1557 XS, VEX_4V, VEX_LIG; 1558defm VCVTSI2SS64 : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss{q}">, 1559 XS, VEX_4V, VEX_W, VEX_LIG; 1560defm VCVTSI2SD : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd{l}">, 1561 XD, VEX_4V, VEX_LIG; 1562defm VCVTSI2SD64 : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd{q}">, 1563 XD, VEX_4V, VEX_W, VEX_LIG; 1564 1565let Predicates = [UseAVX] in { 1566 def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}", 1567 (VCVTSI2SSrm FR64:$dst, FR64:$src1, i32mem:$src), 0>; 1568 def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}", 1569 (VCVTSI2SDrm FR64:$dst, FR64:$src1, i32mem:$src), 0>; 1570 1571 def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))), 1572 (VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>; 1573 def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))), 1574 (VCVTSI2SS64rm (f32 (IMPLICIT_DEF)), addr:$src)>; 1575 def : Pat<(f64 (sint_to_fp (loadi32 addr:$src))), 1576 (VCVTSI2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>; 1577 def : Pat<(f64 (sint_to_fp (loadi64 addr:$src))), 1578 (VCVTSI2SD64rm (f64 (IMPLICIT_DEF)), addr:$src)>; 1579 1580 def : Pat<(f32 (sint_to_fp GR32:$src)), 1581 (VCVTSI2SSrr (f32 (IMPLICIT_DEF)), GR32:$src)>; 1582 def : Pat<(f32 (sint_to_fp GR64:$src)), 1583 (VCVTSI2SS64rr (f32 (IMPLICIT_DEF)), GR64:$src)>; 1584 def : Pat<(f64 (sint_to_fp GR32:$src)), 1585 (VCVTSI2SDrr (f64 (IMPLICIT_DEF)), GR32:$src)>; 1586 def : Pat<(f64 (sint_to_fp GR64:$src)), 1587 (VCVTSI2SD64rr (f64 (IMPLICIT_DEF)), GR64:$src)>; 1588} 1589 1590defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32, 1591 "cvttss2si\t{$src, $dst|$dst, $src}", 1592 SSE_CVT_SS2SI_32>, XS; 1593defm CVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32, 1594 "cvttss2si\t{$src, $dst|$dst, $src}", 1595 SSE_CVT_SS2SI_64>, XS, REX_W; 1596defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64, 1597 
"cvttsd2si\t{$src, $dst|$dst, $src}", 1598 SSE_CVT_SD2SI>, XD; 1599defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64, 1600 "cvttsd2si\t{$src, $dst|$dst, $src}", 1601 SSE_CVT_SD2SI>, XD, REX_W; 1602defm CVTSI2SS : sse12_cvt_s<0x2A, GR32, FR32, sint_to_fp, i32mem, loadi32, 1603 "cvtsi2ss{l}\t{$src, $dst|$dst, $src}", 1604 SSE_CVT_Scalar>, XS; 1605defm CVTSI2SS64 : sse12_cvt_s<0x2A, GR64, FR32, sint_to_fp, i64mem, loadi64, 1606 "cvtsi2ss{q}\t{$src, $dst|$dst, $src}", 1607 SSE_CVT_Scalar>, XS, REX_W; 1608defm CVTSI2SD : sse12_cvt_s<0x2A, GR32, FR64, sint_to_fp, i32mem, loadi32, 1609 "cvtsi2sd{l}\t{$src, $dst|$dst, $src}", 1610 SSE_CVT_Scalar>, XD; 1611defm CVTSI2SD64 : sse12_cvt_s<0x2A, GR64, FR64, sint_to_fp, i64mem, loadi64, 1612 "cvtsi2sd{q}\t{$src, $dst|$dst, $src}", 1613 SSE_CVT_Scalar>, XD, REX_W; 1614 1615def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}", 1616 (CVTTSS2SIrr GR32:$dst, FR32:$src), 0>; 1617def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}", 1618 (CVTTSS2SIrm GR32:$dst, f32mem:$src), 0>; 1619def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}", 1620 (CVTTSD2SIrr GR32:$dst, FR64:$src), 0>; 1621def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}", 1622 (CVTTSD2SIrm GR32:$dst, f64mem:$src), 0>; 1623def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}", 1624 (CVTTSS2SI64rr GR64:$dst, FR32:$src), 0>; 1625def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}", 1626 (CVTTSS2SI64rm GR64:$dst, f32mem:$src), 0>; 1627def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}", 1628 (CVTTSD2SI64rr GR64:$dst, FR64:$src), 0>; 1629def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}", 1630 (CVTTSD2SI64rm GR64:$dst, f64mem:$src), 0>; 1631 1632def : InstAlias<"cvtsi2ss\t{$src, $dst|$dst, $src}", 1633 (CVTSI2SSrm FR64:$dst, i32mem:$src), 0>; 1634def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}", 1635 (CVTSI2SDrm FR64:$dst, i32mem:$src), 0>; 1636 1637// Conversion Instructions Intrinsics - Match intrinsics which expect MM 
1638// and/or XMM operand(s). 1639 1640multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, 1641 Intrinsic Int, Operand memop, ComplexPattern mem_cpat, 1642 string asm, OpndItins itins> { 1643 def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), 1644 !strconcat(asm, "\t{$src, $dst|$dst, $src}"), 1645 [(set DstRC:$dst, (Int SrcRC:$src))], itins.rr>, 1646 Sched<[itins.Sched]>; 1647 def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src), 1648 !strconcat(asm, "\t{$src, $dst|$dst, $src}"), 1649 [(set DstRC:$dst, (Int mem_cpat:$src))], itins.rm>, 1650 Sched<[itins.Sched.Folded]>; 1651} 1652 1653multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC, 1654 RegisterClass DstRC, Intrinsic Int, X86MemOperand x86memop, 1655 PatFrag ld_frag, string asm, OpndItins itins, 1656 bit Is2Addr = 1> { 1657 def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src2), 1658 !if(Is2Addr, 1659 !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"), 1660 !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 1661 [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))], 1662 itins.rr>, Sched<[itins.Sched]>; 1663 def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), 1664 (ins DstRC:$src1, x86memop:$src2), 1665 !if(Is2Addr, 1666 !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"), 1667 !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 1668 [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))], 1669 itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>; 1670} 1671 1672let Predicates = [UseAVX] in { 1673defm VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, 1674 int_x86_sse2_cvtsd2si, sdmem, sse_load_f64, "cvtsd2si", 1675 SSE_CVT_SD2SI>, XD, VEX, VEX_LIG; 1676defm VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, 1677 int_x86_sse2_cvtsd2si64, sdmem, sse_load_f64, "cvtsd2si", 1678 SSE_CVT_SD2SI>, XD, VEX, VEX_W, VEX_LIG; 1679} 1680defm CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse2_cvtsd2si, 1681 sdmem, sse_load_f64, 
"cvtsd2si", SSE_CVT_SD2SI>, XD; 1682defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse2_cvtsd2si64, 1683 sdmem, sse_load_f64, "cvtsd2si", SSE_CVT_SD2SI>, XD, REX_W; 1684 1685 1686let isCodeGenOnly = 1 in { 1687 let Predicates = [UseAVX] in { 1688 defm Int_VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128, 1689 int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss{l}", 1690 SSE_CVT_Scalar, 0>, XS, VEX_4V; 1691 defm Int_VCVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128, 1692 int_x86_sse_cvtsi642ss, i64mem, loadi64, "cvtsi2ss{q}", 1693 SSE_CVT_Scalar, 0>, XS, VEX_4V, 1694 VEX_W; 1695 defm Int_VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128, 1696 int_x86_sse2_cvtsi2sd, i32mem, loadi32, "cvtsi2sd{l}", 1697 SSE_CVT_Scalar, 0>, XD, VEX_4V; 1698 defm Int_VCVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128, 1699 int_x86_sse2_cvtsi642sd, i64mem, loadi64, "cvtsi2sd{q}", 1700 SSE_CVT_Scalar, 0>, XD, 1701 VEX_4V, VEX_W; 1702 } 1703 let Constraints = "$src1 = $dst" in { 1704 defm Int_CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128, 1705 int_x86_sse_cvtsi2ss, i32mem, loadi32, 1706 "cvtsi2ss{l}", SSE_CVT_Scalar>, XS; 1707 defm Int_CVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128, 1708 int_x86_sse_cvtsi642ss, i64mem, loadi64, 1709 "cvtsi2ss{q}", SSE_CVT_Scalar>, XS, REX_W; 1710 defm Int_CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128, 1711 int_x86_sse2_cvtsi2sd, i32mem, loadi32, 1712 "cvtsi2sd{l}", SSE_CVT_Scalar>, XD; 1713 defm Int_CVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128, 1714 int_x86_sse2_cvtsi642sd, i64mem, loadi64, 1715 "cvtsi2sd{q}", SSE_CVT_Scalar>, XD, REX_W; 1716 } 1717} // isCodeGenOnly = 1 1718 1719/// SSE 1 Only 1720 1721// Aliases for intrinsics 1722let isCodeGenOnly = 1 in { 1723let Predicates = [UseAVX] in { 1724defm Int_VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si, 1725 ssmem, sse_load_f32, "cvttss2si", 1726 SSE_CVT_SS2SI_32>, XS, VEX; 1727defm Int_VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, 1728 
int_x86_sse_cvttss2si64, ssmem, sse_load_f32, 1729 "cvttss2si", SSE_CVT_SS2SI_64>, 1730 XS, VEX, VEX_W; 1731defm Int_VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si, 1732 sdmem, sse_load_f64, "cvttsd2si", 1733 SSE_CVT_SD2SI>, XD, VEX; 1734defm Int_VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, 1735 int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64, 1736 "cvttsd2si", SSE_CVT_SD2SI>, 1737 XD, VEX, VEX_W; 1738} 1739defm Int_CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si, 1740 ssmem, sse_load_f32, "cvttss2si", 1741 SSE_CVT_SS2SI_32>, XS; 1742defm Int_CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, 1743 int_x86_sse_cvttss2si64, ssmem, sse_load_f32, 1744 "cvttss2si", SSE_CVT_SS2SI_64>, XS, REX_W; 1745defm Int_CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si, 1746 sdmem, sse_load_f64, "cvttsd2si", 1747 SSE_CVT_SD2SI>, XD; 1748defm Int_CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, 1749 int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64, 1750 "cvttsd2si", SSE_CVT_SD2SI>, XD, REX_W; 1751} // isCodeGenOnly = 1 1752 1753let Predicates = [UseAVX] in { 1754defm VCVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si, 1755 ssmem, sse_load_f32, "cvtss2si", 1756 SSE_CVT_SS2SI_32>, XS, VEX, VEX_LIG; 1757defm VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64, 1758 ssmem, sse_load_f32, "cvtss2si", 1759 SSE_CVT_SS2SI_64>, XS, VEX, VEX_W, VEX_LIG; 1760} 1761defm CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si, 1762 ssmem, sse_load_f32, "cvtss2si", 1763 SSE_CVT_SS2SI_32>, XS; 1764defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64, 1765 ssmem, sse_load_f32, "cvtss2si", 1766 SSE_CVT_SS2SI_64>, XS, REX_W; 1767 1768defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, i128mem, 1769 "vcvtdq2ps\t{$src, $dst|$dst, $src}", 1770 SSEPackedSingle, SSE_CVT_PS>, 1771 PS, VEX, Requires<[HasAVX]>; 1772defm VCVTDQ2PSY : sse12_cvt_p<0x5B, VR256, VR256, i256mem, 1773 
"vcvtdq2ps\t{$src, $dst|$dst, $src}", 1774 SSEPackedSingle, SSE_CVT_PS>, 1775 PS, VEX, VEX_L, Requires<[HasAVX]>; 1776 1777defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, i128mem, 1778 "cvtdq2ps\t{$src, $dst|$dst, $src}", 1779 SSEPackedSingle, SSE_CVT_PS>, 1780 PS, Requires<[UseSSE2]>; 1781 1782let Predicates = [UseAVX] in { 1783def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}", 1784 (VCVTSS2SIrr GR32:$dst, VR128:$src), 0>; 1785def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}", 1786 (VCVTSS2SIrm GR32:$dst, ssmem:$src), 0>; 1787def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}", 1788 (VCVTSD2SIrr GR32:$dst, VR128:$src), 0>; 1789def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}", 1790 (VCVTSD2SIrm GR32:$dst, sdmem:$src), 0>; 1791def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}", 1792 (VCVTSS2SI64rr GR64:$dst, VR128:$src), 0>; 1793def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}", 1794 (VCVTSS2SI64rm GR64:$dst, ssmem:$src), 0>; 1795def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}", 1796 (VCVTSD2SI64rr GR64:$dst, VR128:$src), 0>; 1797def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}", 1798 (VCVTSD2SI64rm GR64:$dst, sdmem:$src), 0>; 1799} 1800 1801def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}", 1802 (CVTSS2SIrr GR32:$dst, VR128:$src), 0>; 1803def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}", 1804 (CVTSS2SIrm GR32:$dst, ssmem:$src), 0>; 1805def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}", 1806 (CVTSD2SIrr GR32:$dst, VR128:$src), 0>; 1807def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}", 1808 (CVTSD2SIrm GR32:$dst, sdmem:$src), 0>; 1809def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}", 1810 (CVTSS2SI64rr GR64:$dst, VR128:$src), 0>; 1811def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}", 1812 (CVTSS2SI64rm GR64:$dst, ssmem:$src), 0>; 1813def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}", 1814 (CVTSD2SI64rr GR64:$dst, VR128:$src), 0>; 1815def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, 
$src}",
                (CVTSD2SI64rm GR64:$dst, sdmem:$src), 0>;
// FIX: added the trailing ", 0" (EmitAlias = 0) above — every other alias in
// this group suppresses printing, and without it the printer would prefer
// this memory-form alias over the canonical mnemonic.

/// SSE 2 Only

// Convert scalar double to scalar single.
// The AVX forms take a pass-through operand ($src1) for the upper bits of
// the destination; they carry no patterns (selected via the Pat below).
let hasSideEffects = 0, Predicates = [UseAVX] in {
def VCVTSD2SSrr  : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
                        (ins FR64:$src1, FR64:$src2),
                        "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
                        IIC_SSE_CVT_Scalar_RR>, VEX_4V, VEX_LIG,
                        Sched<[WriteCvtF2F]>;
let mayLoad = 1 in
def VCVTSD2SSrm  : I<0x5A, MRMSrcMem, (outs FR32:$dst),
                     (ins FR64:$src1, f64mem:$src2),
                     "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     [], IIC_SSE_CVT_Scalar_RM>,
                     XD, Requires<[HasAVX, OptForSize]>, VEX_4V, VEX_LIG,
                     Sched<[WriteCvtF2FLd, ReadAfterLd]>;
}

// Round f64 -> f32 using the AVX form; duplicate the source as the
// pass-through operand since only the low element matters.
def : Pat<(f32 (fround FR64:$src)), (VCVTSD2SSrr FR64:$src, FR64:$src)>,
          Requires<[UseAVX]>;

def CVTSD2SSrr  : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
                      "cvtsd2ss\t{$src, $dst|$dst, $src}",
                      [(set FR32:$dst, (fround FR64:$src))],
                      IIC_SSE_CVT_Scalar_RR>, Sched<[WriteCvtF2F]>;
def CVTSD2SSrm  : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
                    "cvtsd2ss\t{$src, $dst|$dst, $src}",
                    [(set FR32:$dst, (fround (loadf64 addr:$src)))],
                    IIC_SSE_CVT_Scalar_RM>,
                    XD,
                    Requires<[UseSSE2, OptForSize]>, Sched<[WriteCvtF2FLd]>;

// Intrinsic forms operating on whole XMM registers.
let isCodeGenOnly = 1 in {
def Int_VCVTSD2SSrr: I<0x5A, MRMSrcReg,
                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                       "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       [(set VR128:$dst,
                         (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))],
                       IIC_SSE_CVT_Scalar_RR>, XD, VEX_4V, Requires<[UseAVX]>,
                       Sched<[WriteCvtF2F]>;
// FIX: this form takes a memory operand (sdmem), so the encoding form must be
// MRMSrcMem, not MRMSrcReg (matches Int_VCVTSS2SDrm below).
def Int_VCVTSD2SSrm: I<0x5A, MRMSrcMem,
                       (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
                       "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       [(set VR128:$dst, (int_x86_sse2_cvtsd2ss
                                          VR128:$src1, sse_load_f64:$src2))],
                       IIC_SSE_CVT_Scalar_RM>, XD, VEX_4V, Requires<[UseAVX]>,
                       Sched<[WriteCvtF2FLd, ReadAfterLd]>;

let Constraints = "$src1 = $dst" in {
def Int_CVTSD2SSrr: I<0x5A, MRMSrcReg,
                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                      "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                        (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))],
                      IIC_SSE_CVT_Scalar_RR>, XD, Requires<[UseSSE2]>,
                      Sched<[WriteCvtF2F]>;
// FIX: memory form — must be MRMSrcMem, not MRMSrcReg (was a copy/paste of
// the register form above).
def Int_CVTSD2SSrm: I<0x5A, MRMSrcMem,
                      (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
                      "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst, (int_x86_sse2_cvtsd2ss
                                         VR128:$src1, sse_load_f64:$src2))],
                      IIC_SSE_CVT_Scalar_RM>, XD, Requires<[UseSSE2]>,
                      Sched<[WriteCvtF2FLd, ReadAfterLd]>;
}
} // isCodeGenOnly = 1

// Convert scalar single to scalar double
// SSE2 instructions with XS prefix
let hasSideEffects = 0, Predicates = [UseAVX] in {
def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
                    (ins FR32:$src1, FR32:$src2),
                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [], IIC_SSE_CVT_Scalar_RR>,
                    XS, Requires<[HasAVX]>, VEX_4V, VEX_LIG,
                    Sched<[WriteCvtF2F]>;
let mayLoad = 1 in
def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
                    (ins FR32:$src1, f32mem:$src2),
                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [], IIC_SSE_CVT_Scalar_RM>,
                    XS, VEX_4V, VEX_LIG, Requires<[HasAVX, OptForSize]>,
                    Sched<[WriteCvtF2FLd, ReadAfterLd]>;
}

def : Pat<(f64 (fextend FR32:$src)),
          (VCVTSS2SDrr FR32:$src, FR32:$src)>, Requires<[UseAVX]>;
def : Pat<(fextend (loadf32 addr:$src)),
          (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX]>;

// extloadf32: fold the load when optimizing for size, otherwise load with
// VMOVSS first so the convert reads a register (avoids a partial update).
def : Pat<(extloadf32 addr:$src),
          (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>,
          Requires<[UseAVX, OptForSize]>;
def : Pat<(extloadf32 addr:$src),
          (VCVTSS2SDrr (f32 (IMPLICIT_DEF)), (VMOVSSrm addr:$src))>,
          Requires<[UseAVX, OptForSpeed]>;

def CVTSS2SDrr :
I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src), 1915 "cvtss2sd\t{$src, $dst|$dst, $src}", 1916 [(set FR64:$dst, (fextend FR32:$src))], 1917 IIC_SSE_CVT_Scalar_RR>, XS, 1918 Requires<[UseSSE2]>, Sched<[WriteCvtF2F]>; 1919def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src), 1920 "cvtss2sd\t{$src, $dst|$dst, $src}", 1921 [(set FR64:$dst, (extloadf32 addr:$src))], 1922 IIC_SSE_CVT_Scalar_RM>, XS, 1923 Requires<[UseSSE2, OptForSize]>, Sched<[WriteCvtF2FLd]>; 1924 1925// extload f32 -> f64. This matches load+fextend because we have a hack in 1926// the isel (PreprocessForFPConvert) that can introduce loads after dag 1927// combine. 1928// Since these loads aren't folded into the fextend, we have to match it 1929// explicitly here. 1930def : Pat<(fextend (loadf32 addr:$src)), 1931 (CVTSS2SDrm addr:$src)>, Requires<[UseSSE2]>; 1932def : Pat<(extloadf32 addr:$src), 1933 (CVTSS2SDrr (MOVSSrm addr:$src))>, Requires<[UseSSE2, OptForSpeed]>; 1934 1935let isCodeGenOnly = 1 in { 1936def Int_VCVTSS2SDrr: I<0x5A, MRMSrcReg, 1937 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), 1938 "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", 1939 [(set VR128:$dst, 1940 (int_x86_sse2_cvtss2sd VR128:$src1, VR128:$src2))], 1941 IIC_SSE_CVT_Scalar_RR>, XS, VEX_4V, Requires<[UseAVX]>, 1942 Sched<[WriteCvtF2F]>; 1943def Int_VCVTSS2SDrm: I<0x5A, MRMSrcMem, 1944 (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2), 1945 "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", 1946 [(set VR128:$dst, 1947 (int_x86_sse2_cvtss2sd VR128:$src1, sse_load_f32:$src2))], 1948 IIC_SSE_CVT_Scalar_RM>, XS, VEX_4V, Requires<[UseAVX]>, 1949 Sched<[WriteCvtF2FLd, ReadAfterLd]>; 1950let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix 1951def Int_CVTSS2SDrr: I<0x5A, MRMSrcReg, 1952 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), 1953 "cvtss2sd\t{$src2, $dst|$dst, $src2}", 1954 [(set VR128:$dst, 1955 (int_x86_sse2_cvtss2sd VR128:$src1, VR128:$src2))], 1956 
IIC_SSE_CVT_Scalar_RR>, XS, Requires<[UseSSE2]>, 1957 Sched<[WriteCvtF2F]>; 1958def Int_CVTSS2SDrm: I<0x5A, MRMSrcMem, 1959 (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2), 1960 "cvtss2sd\t{$src2, $dst|$dst, $src2}", 1961 [(set VR128:$dst, 1962 (int_x86_sse2_cvtss2sd VR128:$src1, sse_load_f32:$src2))], 1963 IIC_SSE_CVT_Scalar_RM>, XS, Requires<[UseSSE2]>, 1964 Sched<[WriteCvtF2FLd, ReadAfterLd]>; 1965} 1966} // isCodeGenOnly = 1 1967 1968// Convert packed single/double fp to doubleword 1969def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1970 "cvtps2dq\t{$src, $dst|$dst, $src}", 1971 [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))], 1972 IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>; 1973def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 1974 "cvtps2dq\t{$src, $dst|$dst, $src}", 1975 [(set VR128:$dst, 1976 (int_x86_sse2_cvtps2dq (loadv4f32 addr:$src)))], 1977 IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>; 1978def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), 1979 "cvtps2dq\t{$src, $dst|$dst, $src}", 1980 [(set VR256:$dst, 1981 (int_x86_avx_cvt_ps2dq_256 VR256:$src))], 1982 IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>; 1983def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), 1984 "cvtps2dq\t{$src, $dst|$dst, $src}", 1985 [(set VR256:$dst, 1986 (int_x86_avx_cvt_ps2dq_256 (loadv8f32 addr:$src)))], 1987 IIC_SSE_CVT_PS_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>; 1988def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1989 "cvtps2dq\t{$src, $dst|$dst, $src}", 1990 [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))], 1991 IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>; 1992def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 1993 "cvtps2dq\t{$src, $dst|$dst, $src}", 1994 [(set VR128:$dst, 1995 (int_x86_sse2_cvtps2dq (memopv4f32 addr:$src)))], 1996 IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>; 1997 1998 
1999// Convert Packed Double FP to Packed DW Integers 2000let Predicates = [HasAVX] in { 2001// The assembler can recognize rr 256-bit instructions by seeing a ymm 2002// register, but the same isn't true when using memory operands instead. 2003// Provide other assembly rr and rm forms to address this explicitly. 2004def VCVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 2005 "vcvtpd2dq\t{$src, $dst|$dst, $src}", 2006 [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))]>, 2007 VEX, Sched<[WriteCvtF2I]>; 2008 2009// XMM only 2010def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}", 2011 (VCVTPD2DQrr VR128:$dst, VR128:$src), 0>; 2012def VCVTPD2DQXrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 2013 "vcvtpd2dqx\t{$src, $dst|$dst, $src}", 2014 [(set VR128:$dst, 2015 (int_x86_sse2_cvtpd2dq (loadv2f64 addr:$src)))]>, VEX, 2016 Sched<[WriteCvtF2ILd]>; 2017 2018// YMM only 2019def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), 2020 "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}", 2021 [(set VR128:$dst, 2022 (int_x86_avx_cvt_pd2dq_256 VR256:$src))]>, VEX, VEX_L, 2023 Sched<[WriteCvtF2I]>; 2024def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), 2025 "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}", 2026 [(set VR128:$dst, 2027 (int_x86_avx_cvt_pd2dq_256 (loadv4f64 addr:$src)))]>, 2028 VEX, VEX_L, Sched<[WriteCvtF2ILd]>; 2029def : InstAlias<"vcvtpd2dq\t{$src, $dst|$dst, $src}", 2030 (VCVTPD2DQYrr VR128:$dst, VR256:$src), 0>; 2031} 2032 2033def CVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 2034 "cvtpd2dq\t{$src, $dst|$dst, $src}", 2035 [(set VR128:$dst, 2036 (int_x86_sse2_cvtpd2dq (memopv2f64 addr:$src)))], 2037 IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtF2ILd]>; 2038def CVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 2039 "cvtpd2dq\t{$src, $dst|$dst, $src}", 2040 [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))], 2041 IIC_SSE_CVT_PD_RR>, 
Sched<[WriteCvtF2I]>; 2042 2043// Convert with truncation packed single/double fp to doubleword 2044// SSE2 packed instructions with XS prefix 2045def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 2046 "cvttps2dq\t{$src, $dst|$dst, $src}", 2047 [(set VR128:$dst, 2048 (int_x86_sse2_cvttps2dq VR128:$src))], 2049 IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>; 2050def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 2051 "cvttps2dq\t{$src, $dst|$dst, $src}", 2052 [(set VR128:$dst, (int_x86_sse2_cvttps2dq 2053 (loadv4f32 addr:$src)))], 2054 IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>; 2055def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), 2056 "cvttps2dq\t{$src, $dst|$dst, $src}", 2057 [(set VR256:$dst, 2058 (int_x86_avx_cvtt_ps2dq_256 VR256:$src))], 2059 IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>; 2060def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), 2061 "cvttps2dq\t{$src, $dst|$dst, $src}", 2062 [(set VR256:$dst, (int_x86_avx_cvtt_ps2dq_256 2063 (loadv8f32 addr:$src)))], 2064 IIC_SSE_CVT_PS_RM>, VEX, VEX_L, 2065 Sched<[WriteCvtF2ILd]>; 2066 2067def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 2068 "cvttps2dq\t{$src, $dst|$dst, $src}", 2069 [(set VR128:$dst, (int_x86_sse2_cvttps2dq VR128:$src))], 2070 IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>; 2071def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 2072 "cvttps2dq\t{$src, $dst|$dst, $src}", 2073 [(set VR128:$dst, 2074 (int_x86_sse2_cvttps2dq (memopv4f32 addr:$src)))], 2075 IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>; 2076 2077let Predicates = [HasAVX] in { 2078 def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))), 2079 (VCVTDQ2PSrr VR128:$src)>; 2080 def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))), 2081 (VCVTDQ2PSrm addr:$src)>; 2082 2083 def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src), 2084 (VCVTDQ2PSrr VR128:$src)>; 2085 def 
: Pat<(int_x86_sse2_cvtdq2ps (bc_v4i32 (loadv2i64 addr:$src))), 2086 (VCVTDQ2PSrm addr:$src)>; 2087 2088 def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))), 2089 (VCVTTPS2DQrr VR128:$src)>; 2090 def : Pat<(v4i32 (fp_to_sint (loadv4f32 addr:$src))), 2091 (VCVTTPS2DQrm addr:$src)>; 2092 2093 def : Pat<(v8f32 (sint_to_fp (v8i32 VR256:$src))), 2094 (VCVTDQ2PSYrr VR256:$src)>; 2095 def : Pat<(v8f32 (sint_to_fp (bc_v8i32 (loadv4i64 addr:$src)))), 2096 (VCVTDQ2PSYrm addr:$src)>; 2097 2098 def : Pat<(v8i32 (fp_to_sint (v8f32 VR256:$src))), 2099 (VCVTTPS2DQYrr VR256:$src)>; 2100 def : Pat<(v8i32 (fp_to_sint (loadv8f32 addr:$src))), 2101 (VCVTTPS2DQYrm addr:$src)>; 2102} 2103 2104let Predicates = [UseSSE2] in { 2105 def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))), 2106 (CVTDQ2PSrr VR128:$src)>; 2107 def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (memopv2i64 addr:$src)))), 2108 (CVTDQ2PSrm addr:$src)>; 2109 2110 def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src), 2111 (CVTDQ2PSrr VR128:$src)>; 2112 def : Pat<(int_x86_sse2_cvtdq2ps (bc_v4i32 (memopv2i64 addr:$src))), 2113 (CVTDQ2PSrm addr:$src)>; 2114 2115 def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))), 2116 (CVTTPS2DQrr VR128:$src)>; 2117 def : Pat<(v4i32 (fp_to_sint (memopv4f32 addr:$src))), 2118 (CVTTPS2DQrm addr:$src)>; 2119} 2120 2121def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 2122 "cvttpd2dq\t{$src, $dst|$dst, $src}", 2123 [(set VR128:$dst, 2124 (int_x86_sse2_cvttpd2dq VR128:$src))], 2125 IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2I]>; 2126 2127// The assembler can recognize rr 256-bit instructions by seeing a ymm 2128// register, but the same isn't true when using memory operands instead. 2129// Provide other assembly rr and rm forms to address this explicitly. 
2130 2131// XMM only 2132def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}", 2133 (VCVTTPD2DQrr VR128:$dst, VR128:$src), 0>; 2134def VCVTTPD2DQXrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 2135 "cvttpd2dqx\t{$src, $dst|$dst, $src}", 2136 [(set VR128:$dst, (int_x86_sse2_cvttpd2dq 2137 (loadv2f64 addr:$src)))], 2138 IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2ILd]>; 2139 2140// YMM only 2141def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), 2142 "cvttpd2dq{y}\t{$src, $dst|$dst, $src}", 2143 [(set VR128:$dst, 2144 (int_x86_avx_cvtt_pd2dq_256 VR256:$src))], 2145 IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>; 2146def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), 2147 "cvttpd2dq{y}\t{$src, $dst|$dst, $src}", 2148 [(set VR128:$dst, 2149 (int_x86_avx_cvtt_pd2dq_256 (loadv4f64 addr:$src)))], 2150 IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>; 2151def : InstAlias<"vcvttpd2dq\t{$src, $dst|$dst, $src}", 2152 (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0>; 2153 2154let Predicates = [HasAVX] in { 2155 def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))), 2156 (VCVTTPD2DQYrr VR256:$src)>; 2157 def : Pat<(v4i32 (fp_to_sint (loadv4f64 addr:$src))), 2158 (VCVTTPD2DQYrm addr:$src)>; 2159} // Predicates = [HasAVX] 2160 2161def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 2162 "cvttpd2dq\t{$src, $dst|$dst, $src}", 2163 [(set VR128:$dst, (int_x86_sse2_cvttpd2dq VR128:$src))], 2164 IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2I]>; 2165def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src), 2166 "cvttpd2dq\t{$src, $dst|$dst, $src}", 2167 [(set VR128:$dst, (int_x86_sse2_cvttpd2dq 2168 (memopv2f64 addr:$src)))], 2169 IIC_SSE_CVT_PD_RM>, 2170 Sched<[WriteCvtF2ILd]>; 2171 2172// Convert packed single to packed double 2173let Predicates = [HasAVX] in { 2174 // SSE2 instructions without OpSize prefix 2175def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins 
VR128:$src), 2176 "vcvtps2pd\t{$src, $dst|$dst, $src}", 2177 [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))], 2178 IIC_SSE_CVT_PD_RR>, PS, VEX, Sched<[WriteCvtF2F]>; 2179def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), 2180 "vcvtps2pd\t{$src, $dst|$dst, $src}", 2181 [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))], 2182 IIC_SSE_CVT_PD_RM>, PS, VEX, Sched<[WriteCvtF2FLd]>; 2183def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), 2184 "vcvtps2pd\t{$src, $dst|$dst, $src}", 2185 [(set VR256:$dst, 2186 (int_x86_avx_cvt_ps2_pd_256 VR128:$src))], 2187 IIC_SSE_CVT_PD_RR>, PS, VEX, VEX_L, Sched<[WriteCvtF2F]>; 2188def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src), 2189 "vcvtps2pd\t{$src, $dst|$dst, $src}", 2190 [(set VR256:$dst, 2191 (int_x86_avx_cvt_ps2_pd_256 (loadv4f32 addr:$src)))], 2192 IIC_SSE_CVT_PD_RM>, PS, VEX, VEX_L, Sched<[WriteCvtF2FLd]>; 2193} 2194 2195let Predicates = [UseSSE2] in { 2196def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 2197 "cvtps2pd\t{$src, $dst|$dst, $src}", 2198 [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))], 2199 IIC_SSE_CVT_PD_RR>, PS, Sched<[WriteCvtF2F]>; 2200def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), 2201 "cvtps2pd\t{$src, $dst|$dst, $src}", 2202 [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))], 2203 IIC_SSE_CVT_PD_RM>, PS, Sched<[WriteCvtF2FLd]>; 2204} 2205 2206// Convert Packed DW Integers to Packed Double FP 2207let Predicates = [HasAVX] in { 2208let hasSideEffects = 0, mayLoad = 1 in 2209def VCVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), 2210 "vcvtdq2pd\t{$src, $dst|$dst, $src}", 2211 []>, VEX, Sched<[WriteCvtI2FLd]>; 2212def VCVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 2213 "vcvtdq2pd\t{$src, $dst|$dst, $src}", 2214 [(set VR128:$dst, 2215 (int_x86_sse2_cvtdq2pd VR128:$src))]>, VEX, 2216 Sched<[WriteCvtI2F]>; 2217def VCVTDQ2PDYrm 
: S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src), 2218 "vcvtdq2pd\t{$src, $dst|$dst, $src}", 2219 [(set VR256:$dst, 2220 (int_x86_avx_cvtdq2_pd_256 2221 (bitconvert (loadv2i64 addr:$src))))]>, VEX, VEX_L, 2222 Sched<[WriteCvtI2FLd]>; 2223def VCVTDQ2PDYrr : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), 2224 "vcvtdq2pd\t{$src, $dst|$dst, $src}", 2225 [(set VR256:$dst, 2226 (int_x86_avx_cvtdq2_pd_256 VR128:$src))]>, VEX, VEX_L, 2227 Sched<[WriteCvtI2F]>; 2228} 2229 2230let hasSideEffects = 0, mayLoad = 1 in 2231def CVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), 2232 "cvtdq2pd\t{$src, $dst|$dst, $src}", [], 2233 IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtI2FLd]>; 2234def CVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 2235 "cvtdq2pd\t{$src, $dst|$dst, $src}", 2236 [(set VR128:$dst, (int_x86_sse2_cvtdq2pd VR128:$src))], 2237 IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtI2F]>; 2238 2239// AVX 256-bit register conversion intrinsics 2240let Predicates = [HasAVX] in { 2241 def : Pat<(v4f64 (sint_to_fp (v4i32 VR128:$src))), 2242 (VCVTDQ2PDYrr VR128:$src)>; 2243 def : Pat<(v4f64 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))), 2244 (VCVTDQ2PDYrm addr:$src)>; 2245} // Predicates = [HasAVX] 2246 2247// Convert packed double to packed single 2248// The assembler can recognize rr 256-bit instructions by seeing a ymm 2249// register, but the same isn't true when using memory operands instead. 2250// Provide other assembly rr and rm forms to address this explicitly. 
def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvtpd2ps\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))],
                       IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2F]>;

// XMM only
def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
                (VCVTPD2PSrr VR128:$dst, VR128:$src), 0>;
def VCVTPD2PSXrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                        "cvtpd2psx\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (int_x86_sse2_cvtpd2ps (loadv2f64 addr:$src)))],
                        IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2FLd]>;

// YMM only
def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
                        "cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (int_x86_avx_cvt_pd2_ps_256 VR256:$src))],
                        IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2F]>;
def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                        "cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (int_x86_avx_cvt_pd2_ps_256 (loadv4f64 addr:$src)))],
                        IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2FLd]>;
def : InstAlias<"vcvtpd2ps\t{$src, $dst|$dst, $src}",
                (VCVTPD2PSYrr VR128:$dst, VR256:$src), 0>;

// SSE2 forms.  Note: the SSE form uses memopv2f64 (alignment-checked) where
// the VEX form above uses loadv2f64, matching the rest of this file.
def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     "cvtpd2ps\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))],
                     IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2F]>;
def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                     "cvtpd2ps\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
                       (int_x86_sse2_cvtpd2ps (memopv2f64 addr:$src)))],
                     IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtF2FLd]>;


// AVX 256-bit register conversion intrinsics
// FIXME: Migrate SSE conversion intrinsics matching to use patterns as below
// whenever possible to avoid declaring two versions of each one.
2293let Predicates = [HasAVX] in { 2294 def : Pat<(int_x86_avx_cvtdq2_ps_256 VR256:$src), 2295 (VCVTDQ2PSYrr VR256:$src)>; 2296 def : Pat<(int_x86_avx_cvtdq2_ps_256 (bitconvert (loadv4i64 addr:$src))), 2297 (VCVTDQ2PSYrm addr:$src)>; 2298 2299 // Match fround and fextend for 128/256-bit conversions 2300 def : Pat<(v4f32 (X86vfpround (v2f64 VR128:$src))), 2301 (VCVTPD2PSrr VR128:$src)>; 2302 def : Pat<(v4f32 (X86vfpround (loadv2f64 addr:$src))), 2303 (VCVTPD2PSXrm addr:$src)>; 2304 def : Pat<(v4f32 (fround (v4f64 VR256:$src))), 2305 (VCVTPD2PSYrr VR256:$src)>; 2306 def : Pat<(v4f32 (fround (loadv4f64 addr:$src))), 2307 (VCVTPD2PSYrm addr:$src)>; 2308 2309 def : Pat<(v2f64 (X86vfpext (v4f32 VR128:$src))), 2310 (VCVTPS2PDrr VR128:$src)>; 2311 def : Pat<(v4f64 (fextend (v4f32 VR128:$src))), 2312 (VCVTPS2PDYrr VR128:$src)>; 2313 def : Pat<(v4f64 (extloadv4f32 addr:$src)), 2314 (VCVTPS2PDYrm addr:$src)>; 2315} 2316 2317let Predicates = [UseSSE2] in { 2318 // Match fround and fextend for 128 conversions 2319 def : Pat<(v4f32 (X86vfpround (v2f64 VR128:$src))), 2320 (CVTPD2PSrr VR128:$src)>; 2321 def : Pat<(v4f32 (X86vfpround (memopv2f64 addr:$src))), 2322 (CVTPD2PSrm addr:$src)>; 2323 2324 def : Pat<(v2f64 (X86vfpext (v4f32 VR128:$src))), 2325 (CVTPS2PDrr VR128:$src)>; 2326} 2327 2328//===----------------------------------------------------------------------===// 2329// SSE 1 & 2 - Compare Instructions 2330//===----------------------------------------------------------------------===// 2331 2332// sse12_cmp_scalar - sse 1 & 2 compare scalar instructions 2333multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop, 2334 Operand CC, SDNode OpNode, ValueType VT, 2335 PatFrag ld_frag, string asm, string asm_alt, 2336 OpndItins itins, ImmLeaf immLeaf> { 2337 def rr : SIi8<0xC2, MRMSrcReg, 2338 (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm, 2339 [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, immLeaf:$cc))], 2340 itins.rr>, Sched<[itins.Sched]>; 2341 def rm : 
SIi8<0xC2, MRMSrcMem, 2342 (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm, 2343 [(set RC:$dst, (OpNode (VT RC:$src1), 2344 (ld_frag addr:$src2), immLeaf:$cc))], 2345 itins.rm>, 2346 Sched<[itins.Sched.Folded, ReadAfterLd]>; 2347 2348 // Accept explicit immediate argument form instead of comparison code. 2349 let isAsmParserOnly = 1, hasSideEffects = 0 in { 2350 def rr_alt : SIi8<0xC2, MRMSrcReg, (outs RC:$dst), 2351 (ins RC:$src1, RC:$src2, i8imm:$cc), asm_alt, [], 2352 IIC_SSE_ALU_F32S_RR>, Sched<[itins.Sched]>; 2353 let mayLoad = 1 in 2354 def rm_alt : SIi8<0xC2, MRMSrcMem, (outs RC:$dst), 2355 (ins RC:$src1, x86memop:$src2, i8imm:$cc), asm_alt, [], 2356 IIC_SSE_ALU_F32S_RM>, 2357 Sched<[itins.Sched.Folded, ReadAfterLd]>; 2358 } 2359} 2360 2361defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, AVXCC, X86cmps, f32, loadf32, 2362 "cmp${cc}ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2363 "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", 2364 SSE_ALU_F32S, i8immZExt5>, XS, VEX_4V, VEX_LIG; 2365defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, AVXCC, X86cmps, f64, loadf64, 2366 "cmp${cc}sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2367 "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", 2368 SSE_ALU_F32S, i8immZExt5>, // same latency as 32 bit compare 2369 XD, VEX_4V, VEX_LIG; 2370 2371let Constraints = "$src1 = $dst" in { 2372 defm CMPSS : sse12_cmp_scalar<FR32, f32mem, SSECC, X86cmps, f32, loadf32, 2373 "cmp${cc}ss\t{$src2, $dst|$dst, $src2}", 2374 "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}", SSE_ALU_F32S, 2375 i8immZExt3>, XS; 2376 defm CMPSD : sse12_cmp_scalar<FR64, f64mem, SSECC, X86cmps, f64, loadf64, 2377 "cmp${cc}sd\t{$src2, $dst|$dst, $src2}", 2378 "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}", 2379 SSE_ALU_F64S, i8immZExt3>, XD; 2380} 2381 2382multiclass sse12_cmp_scalar_int<X86MemOperand x86memop, Operand CC, 2383 Intrinsic Int, string asm, OpndItins itins, 2384 ImmLeaf immLeaf> { 2385 def rr : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst), 
2386 (ins VR128:$src1, VR128:$src, CC:$cc), asm, 2387 [(set VR128:$dst, (Int VR128:$src1, 2388 VR128:$src, immLeaf:$cc))], 2389 itins.rr>, 2390 Sched<[itins.Sched]>; 2391 def rm : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst), 2392 (ins VR128:$src1, x86memop:$src, CC:$cc), asm, 2393 [(set VR128:$dst, (Int VR128:$src1, 2394 (load addr:$src), immLeaf:$cc))], 2395 itins.rm>, 2396 Sched<[itins.Sched.Folded, ReadAfterLd]>; 2397} 2398 2399let isCodeGenOnly = 1 in { 2400 // Aliases to match intrinsics which expect XMM operand(s). 2401 defm Int_VCMPSS : sse12_cmp_scalar_int<f32mem, AVXCC, int_x86_sse_cmp_ss, 2402 "cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}", 2403 SSE_ALU_F32S, i8immZExt5>, 2404 XS, VEX_4V; 2405 defm Int_VCMPSD : sse12_cmp_scalar_int<f64mem, AVXCC, int_x86_sse2_cmp_sd, 2406 "cmp${cc}sd\t{$src, $src1, $dst|$dst, $src1, $src}", 2407 SSE_ALU_F32S, i8immZExt5>, // same latency as f32 2408 XD, VEX_4V; 2409 let Constraints = "$src1 = $dst" in { 2410 defm Int_CMPSS : sse12_cmp_scalar_int<f32mem, SSECC, int_x86_sse_cmp_ss, 2411 "cmp${cc}ss\t{$src, $dst|$dst, $src}", 2412 SSE_ALU_F32S, i8immZExt3>, XS; 2413 defm Int_CMPSD : sse12_cmp_scalar_int<f64mem, SSECC, int_x86_sse2_cmp_sd, 2414 "cmp${cc}sd\t{$src, $dst|$dst, $src}", 2415 SSE_ALU_F64S, i8immZExt3>, 2416 XD; 2417} 2418} 2419 2420 2421// sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS 2422multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode, 2423 ValueType vt, X86MemOperand x86memop, 2424 PatFrag ld_frag, string OpcodeStr> { 2425 def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2), 2426 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), 2427 [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))], 2428 IIC_SSE_COMIS_RR>, 2429 Sched<[WriteFAdd]>; 2430 def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2), 2431 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), 2432 [(set EFLAGS, (OpNode (vt RC:$src1), 2433 (ld_frag addr:$src2)))], 2434 IIC_SSE_COMIS_RM>, 
2435 Sched<[WriteFAddLd, ReadAfterLd]>; 2436} 2437 2438let Defs = [EFLAGS] in { 2439 defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32, 2440 "ucomiss">, PS, VEX, VEX_LIG; 2441 defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64, 2442 "ucomisd">, PD, VEX, VEX_LIG; 2443 let Pattern = []<dag> in { 2444 defm VCOMISS : sse12_ord_cmp<0x2F, VR128, undef, v4f32, f128mem, load, 2445 "comiss">, PS, VEX, VEX_LIG; 2446 defm VCOMISD : sse12_ord_cmp<0x2F, VR128, undef, v2f64, f128mem, load, 2447 "comisd">, PD, VEX, VEX_LIG; 2448 } 2449 2450 let isCodeGenOnly = 1 in { 2451 defm Int_VUCOMISS : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem, 2452 load, "ucomiss">, PS, VEX; 2453 defm Int_VUCOMISD : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem, 2454 load, "ucomisd">, PD, VEX; 2455 2456 defm Int_VCOMISS : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem, 2457 load, "comiss">, PS, VEX; 2458 defm Int_VCOMISD : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem, 2459 load, "comisd">, PD, VEX; 2460 } 2461 defm UCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32, 2462 "ucomiss">, PS; 2463 defm UCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64, 2464 "ucomisd">, PD; 2465 2466 let Pattern = []<dag> in { 2467 defm COMISS : sse12_ord_cmp<0x2F, VR128, undef, v4f32, f128mem, load, 2468 "comiss">, PS; 2469 defm COMISD : sse12_ord_cmp<0x2F, VR128, undef, v2f64, f128mem, load, 2470 "comisd">, PD; 2471 } 2472 2473 let isCodeGenOnly = 1 in { 2474 defm Int_UCOMISS : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem, 2475 load, "ucomiss">, PS; 2476 defm Int_UCOMISD : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem, 2477 load, "ucomisd">, PD; 2478 2479 defm Int_COMISS : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem, load, 2480 "comiss">, PS; 2481 defm Int_COMISD : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem, load, 2482 "comisd">, PD; 2483 } 2484} // Defs = [EFLAGS] 2485 2486// sse12_cmp_packed - sse 1 & 2 
compare packed instructions 2487multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop, 2488 Operand CC, Intrinsic Int, string asm, 2489 string asm_alt, Domain d, ImmLeaf immLeaf, 2490 OpndItins itins = SSE_ALU_F32P> { 2491 def rri : PIi8<0xC2, MRMSrcReg, 2492 (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm, 2493 [(set RC:$dst, (Int RC:$src1, RC:$src2, immLeaf:$cc))], 2494 itins.rr, d>, 2495 Sched<[WriteFAdd]>; 2496 def rmi : PIi8<0xC2, MRMSrcMem, 2497 (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm, 2498 [(set RC:$dst, (Int RC:$src1, (memop addr:$src2), immLeaf:$cc))], 2499 itins.rm, d>, 2500 Sched<[WriteFAddLd, ReadAfterLd]>; 2501 2502 // Accept explicit immediate argument form instead of comparison code. 2503 let isAsmParserOnly = 1, hasSideEffects = 0 in { 2504 def rri_alt : PIi8<0xC2, MRMSrcReg, 2505 (outs RC:$dst), (ins RC:$src1, RC:$src2, i8imm:$cc), 2506 asm_alt, [], itins.rr, d>, Sched<[WriteFAdd]>; 2507 def rmi_alt : PIi8<0xC2, MRMSrcMem, 2508 (outs RC:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$cc), 2509 asm_alt, [], itins.rm, d>, 2510 Sched<[WriteFAddLd, ReadAfterLd]>; 2511 } 2512} 2513 2514defm VCMPPS : sse12_cmp_packed<VR128, f128mem, AVXCC, int_x86_sse_cmp_ps, 2515 "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2516 "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", 2517 SSEPackedSingle, i8immZExt5>, PS, VEX_4V; 2518defm VCMPPD : sse12_cmp_packed<VR128, f128mem, AVXCC, int_x86_sse2_cmp_pd, 2519 "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2520 "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", 2521 SSEPackedDouble, i8immZExt5>, PD, VEX_4V; 2522defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, AVXCC, int_x86_avx_cmp_ps_256, 2523 "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2524 "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", 2525 SSEPackedSingle, i8immZExt5>, PS, VEX_4V, VEX_L; 2526defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, AVXCC, int_x86_avx_cmp_pd_256, 2527 
"cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2528 "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", 2529 SSEPackedDouble, i8immZExt5>, PD, VEX_4V, VEX_L; 2530let Constraints = "$src1 = $dst" in { 2531 defm CMPPS : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse_cmp_ps, 2532 "cmp${cc}ps\t{$src2, $dst|$dst, $src2}", 2533 "cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}", 2534 SSEPackedSingle, i8immZExt5, SSE_ALU_F32P>, PS; 2535 defm CMPPD : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse2_cmp_pd, 2536 "cmp${cc}pd\t{$src2, $dst|$dst, $src2}", 2537 "cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}", 2538 SSEPackedDouble, i8immZExt5, SSE_ALU_F64P>, PD; 2539} 2540 2541let Predicates = [HasAVX] in { 2542def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)), 2543 (VCMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>; 2544def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), (memop addr:$src2), imm:$cc)), 2545 (VCMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>; 2546def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)), 2547 (VCMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>; 2548def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), (memop addr:$src2), imm:$cc)), 2549 (VCMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>; 2550 2551def : Pat<(v8i32 (X86cmpp (v8f32 VR256:$src1), VR256:$src2, imm:$cc)), 2552 (VCMPPSYrri (v8f32 VR256:$src1), (v8f32 VR256:$src2), imm:$cc)>; 2553def : Pat<(v8i32 (X86cmpp (v8f32 VR256:$src1), (memop addr:$src2), imm:$cc)), 2554 (VCMPPSYrmi (v8f32 VR256:$src1), addr:$src2, imm:$cc)>; 2555def : Pat<(v4i64 (X86cmpp (v4f64 VR256:$src1), VR256:$src2, imm:$cc)), 2556 (VCMPPDYrri VR256:$src1, VR256:$src2, imm:$cc)>; 2557def : Pat<(v4i64 (X86cmpp (v4f64 VR256:$src1), (memop addr:$src2), imm:$cc)), 2558 (VCMPPDYrmi VR256:$src1, addr:$src2, imm:$cc)>; 2559} 2560 2561let Predicates = [UseSSE1] in { 2562def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)), 2563 (CMPPSrri (v4f32 VR128:$src1), (v4f32 
VR128:$src2), imm:$cc)>; 2564def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), (memop addr:$src2), imm:$cc)), 2565 (CMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>; 2566} 2567 2568let Predicates = [UseSSE2] in { 2569def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)), 2570 (CMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>; 2571def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), (memop addr:$src2), imm:$cc)), 2572 (CMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>; 2573} 2574 2575//===----------------------------------------------------------------------===// 2576// SSE 1 & 2 - Shuffle Instructions 2577//===----------------------------------------------------------------------===// 2578 2579/// sse12_shuffle - sse 1 & 2 fp shuffle instructions 2580multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop, 2581 ValueType vt, string asm, PatFrag mem_frag, 2582 Domain d> { 2583 def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst), 2584 (ins RC:$src1, x86memop:$src2, i8imm:$src3), asm, 2585 [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2), 2586 (i8 imm:$src3))))], IIC_SSE_SHUFP, d>, 2587 Sched<[WriteFShuffleLd, ReadAfterLd]>; 2588 def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst), 2589 (ins RC:$src1, RC:$src2, i8imm:$src3), asm, 2590 [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2, 2591 (i8 imm:$src3))))], IIC_SSE_SHUFP, d>, 2592 Sched<[WriteFShuffle]>; 2593} 2594 2595defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32, 2596 "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", 2597 loadv4f32, SSEPackedSingle>, PS, VEX_4V; 2598defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32, 2599 "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", 2600 loadv8f32, SSEPackedSingle>, PS, VEX_4V, VEX_L; 2601defm VSHUFPD : sse12_shuffle<VR128, f128mem, v2f64, 2602 "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", 2603 loadv2f64, SSEPackedDouble>, PD, VEX_4V; 2604defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64, 2605 "shufpd\t{$src3, 
$src2, $src1, $dst|$dst, $src1, $src2, $src3}", 2606 loadv4f64, SSEPackedDouble>, PD, VEX_4V, VEX_L; 2607 2608let Constraints = "$src1 = $dst" in { 2609 defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32, 2610 "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}", 2611 memopv4f32, SSEPackedSingle>, PS; 2612 defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64, 2613 "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}", 2614 memopv2f64, SSEPackedDouble>, PD; 2615} 2616 2617let Predicates = [HasAVX] in { 2618 def : Pat<(v4i32 (X86Shufp VR128:$src1, 2619 (bc_v4i32 (loadv2i64 addr:$src2)), (i8 imm:$imm))), 2620 (VSHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>; 2621 def : Pat<(v4i32 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))), 2622 (VSHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>; 2623 2624 def : Pat<(v2i64 (X86Shufp VR128:$src1, 2625 (loadv2i64 addr:$src2), (i8 imm:$imm))), 2626 (VSHUFPDrmi VR128:$src1, addr:$src2, imm:$imm)>; 2627 def : Pat<(v2i64 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))), 2628 (VSHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>; 2629 2630 // 256-bit patterns 2631 def : Pat<(v8i32 (X86Shufp VR256:$src1, VR256:$src2, (i8 imm:$imm))), 2632 (VSHUFPSYrri VR256:$src1, VR256:$src2, imm:$imm)>; 2633 def : Pat<(v8i32 (X86Shufp VR256:$src1, 2634 (bc_v8i32 (loadv4i64 addr:$src2)), (i8 imm:$imm))), 2635 (VSHUFPSYrmi VR256:$src1, addr:$src2, imm:$imm)>; 2636 2637 def : Pat<(v4i64 (X86Shufp VR256:$src1, VR256:$src2, (i8 imm:$imm))), 2638 (VSHUFPDYrri VR256:$src1, VR256:$src2, imm:$imm)>; 2639 def : Pat<(v4i64 (X86Shufp VR256:$src1, 2640 (loadv4i64 addr:$src2), (i8 imm:$imm))), 2641 (VSHUFPDYrmi VR256:$src1, addr:$src2, imm:$imm)>; 2642} 2643 2644let Predicates = [UseSSE1] in { 2645 def : Pat<(v4i32 (X86Shufp VR128:$src1, 2646 (bc_v4i32 (memopv2i64 addr:$src2)), (i8 imm:$imm))), 2647 (SHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>; 2648 def : Pat<(v4i32 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))), 2649 (SHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>; 2650} 
2651 2652let Predicates = [UseSSE2] in { 2653 // Generic SHUFPD patterns 2654 def : Pat<(v2i64 (X86Shufp VR128:$src1, 2655 (memopv2i64 addr:$src2), (i8 imm:$imm))), 2656 (SHUFPDrmi VR128:$src1, addr:$src2, imm:$imm)>; 2657 def : Pat<(v2i64 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))), 2658 (SHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>; 2659} 2660 2661//===----------------------------------------------------------------------===// 2662// SSE 1 & 2 - Unpack FP Instructions 2663//===----------------------------------------------------------------------===// 2664 2665/// sse12_unpack_interleave - sse 1 & 2 fp unpack and interleave 2666multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt, 2667 PatFrag mem_frag, RegisterClass RC, 2668 X86MemOperand x86memop, string asm, 2669 Domain d> { 2670 def rr : PI<opc, MRMSrcReg, 2671 (outs RC:$dst), (ins RC:$src1, RC:$src2), 2672 asm, [(set RC:$dst, 2673 (vt (OpNode RC:$src1, RC:$src2)))], 2674 IIC_SSE_UNPCK, d>, Sched<[WriteFShuffle]>; 2675 def rm : PI<opc, MRMSrcMem, 2676 (outs RC:$dst), (ins RC:$src1, x86memop:$src2), 2677 asm, [(set RC:$dst, 2678 (vt (OpNode RC:$src1, 2679 (mem_frag addr:$src2))))], 2680 IIC_SSE_UNPCK, d>, 2681 Sched<[WriteFShuffleLd, ReadAfterLd]>; 2682} 2683 2684defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, loadv4f32, 2685 VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2686 SSEPackedSingle>, PS, VEX_4V; 2687defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, loadv2f64, 2688 VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2689 SSEPackedDouble>, PD, VEX_4V; 2690defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, loadv4f32, 2691 VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2692 SSEPackedSingle>, PS, VEX_4V; 2693defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, loadv2f64, 2694 VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2695 
SSEPackedDouble>, PD, VEX_4V; 2696 2697defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, loadv8f32, 2698 VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2699 SSEPackedSingle>, PS, VEX_4V, VEX_L; 2700defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, loadv4f64, 2701 VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2702 SSEPackedDouble>, PD, VEX_4V, VEX_L; 2703defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, loadv8f32, 2704 VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2705 SSEPackedSingle>, PS, VEX_4V, VEX_L; 2706defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, loadv4f64, 2707 VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2708 SSEPackedDouble>, PD, VEX_4V, VEX_L; 2709 2710let Constraints = "$src1 = $dst" in { 2711 defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memopv4f32, 2712 VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}", 2713 SSEPackedSingle>, PS; 2714 defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memopv2f64, 2715 VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}", 2716 SSEPackedDouble>, PD; 2717 defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memopv4f32, 2718 VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}", 2719 SSEPackedSingle>, PS; 2720 defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memopv2f64, 2721 VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}", 2722 SSEPackedDouble>, PD; 2723} // Constraints = "$src1 = $dst" 2724 2725let Predicates = [HasAVX1Only] in { 2726 def : Pat<(v8i32 (X86Unpckl VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))), 2727 (VUNPCKLPSYrm VR256:$src1, addr:$src2)>; 2728 def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)), 2729 (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>; 2730 def : Pat<(v8i32 (X86Unpckh VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))), 2731 (VUNPCKHPSYrm VR256:$src1, addr:$src2)>; 2732 def : 
Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)), 2733 (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>; 2734 2735 def : Pat<(v4i64 (X86Unpckl VR256:$src1, (loadv4i64 addr:$src2))), 2736 (VUNPCKLPDYrm VR256:$src1, addr:$src2)>; 2737 def : Pat<(v4i64 (X86Unpckl VR256:$src1, VR256:$src2)), 2738 (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>; 2739 def : Pat<(v4i64 (X86Unpckh VR256:$src1, (loadv4i64 addr:$src2))), 2740 (VUNPCKHPDYrm VR256:$src1, addr:$src2)>; 2741 def : Pat<(v4i64 (X86Unpckh VR256:$src1, VR256:$src2)), 2742 (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>; 2743} 2744 2745let Predicates = [HasAVX] in { 2746 // FIXME: Instead of X86Movddup, there should be a X86Unpckl here, the 2747 // problem is during lowering, where it's not possible to recognize the load 2748 // fold cause it has two uses through a bitcast. One use disappears at isel 2749 // time and the fold opportunity reappears. 2750 def : Pat<(v2f64 (X86Movddup VR128:$src)), 2751 (VUNPCKLPDrr VR128:$src, VR128:$src)>; 2752} 2753 2754let Predicates = [UseSSE2] in { 2755 // FIXME: Instead of X86Movddup, there should be a X86Unpckl here, the 2756 // problem is during lowering, where it's not possible to recognize the load 2757 // fold cause it has two uses through a bitcast. One use disappears at isel 2758 // time and the fold opportunity reappears. 
2759 def : Pat<(v2f64 (X86Movddup VR128:$src)), 2760 (UNPCKLPDrr VR128:$src, VR128:$src)>; 2761} 2762 2763//===----------------------------------------------------------------------===// 2764// SSE 1 & 2 - Extract Floating-Point Sign mask 2765//===----------------------------------------------------------------------===// 2766 2767/// sse12_extr_sign_mask - sse 1 & 2 unpack and interleave 2768multiclass sse12_extr_sign_mask<RegisterClass RC, Intrinsic Int, string asm, 2769 Domain d> { 2770 def rr : PI<0x50, MRMSrcReg, (outs GR32orGR64:$dst), (ins RC:$src), 2771 !strconcat(asm, "\t{$src, $dst|$dst, $src}"), 2772 [(set GR32orGR64:$dst, (Int RC:$src))], IIC_SSE_MOVMSK, d>, 2773 Sched<[WriteVecLogic]>; 2774} 2775 2776let Predicates = [HasAVX] in { 2777 defm VMOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps, 2778 "movmskps", SSEPackedSingle>, PS, VEX; 2779 defm VMOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd, 2780 "movmskpd", SSEPackedDouble>, PD, VEX; 2781 defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_ps_256, 2782 "movmskps", SSEPackedSingle>, PS, 2783 VEX, VEX_L; 2784 defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_pd_256, 2785 "movmskpd", SSEPackedDouble>, PD, 2786 VEX, VEX_L; 2787 2788 def : Pat<(i32 (X86fgetsign FR32:$src)), 2789 (VMOVMSKPSrr (COPY_TO_REGCLASS FR32:$src, VR128))>; 2790 def : Pat<(i64 (X86fgetsign FR32:$src)), 2791 (SUBREG_TO_REG (i64 0), 2792 (VMOVMSKPSrr (COPY_TO_REGCLASS FR32:$src, VR128)), sub_32bit)>; 2793 def : Pat<(i32 (X86fgetsign FR64:$src)), 2794 (VMOVMSKPDrr (COPY_TO_REGCLASS FR64:$src, VR128))>; 2795 def : Pat<(i64 (X86fgetsign FR64:$src)), 2796 (SUBREG_TO_REG (i64 0), 2797 (VMOVMSKPDrr (COPY_TO_REGCLASS FR64:$src, VR128)), sub_32bit)>; 2798} 2799 2800defm MOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps, "movmskps", 2801 SSEPackedSingle>, PS; 2802defm MOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd, "movmskpd", 2803 SSEPackedDouble>, PD; 2804 
2805def : Pat<(i32 (X86fgetsign FR32:$src)), 2806 (MOVMSKPSrr (COPY_TO_REGCLASS FR32:$src, VR128))>, 2807 Requires<[UseSSE1]>; 2808def : Pat<(i64 (X86fgetsign FR32:$src)), 2809 (SUBREG_TO_REG (i64 0), 2810 (MOVMSKPSrr (COPY_TO_REGCLASS FR32:$src, VR128)), sub_32bit)>, 2811 Requires<[UseSSE1]>; 2812def : Pat<(i32 (X86fgetsign FR64:$src)), 2813 (MOVMSKPDrr (COPY_TO_REGCLASS FR64:$src, VR128))>, 2814 Requires<[UseSSE2]>; 2815def : Pat<(i64 (X86fgetsign FR64:$src)), 2816 (SUBREG_TO_REG (i64 0), 2817 (MOVMSKPDrr (COPY_TO_REGCLASS FR64:$src, VR128)), sub_32bit)>, 2818 Requires<[UseSSE2]>; 2819 2820//===---------------------------------------------------------------------===// 2821// SSE2 - Packed Integer Logical Instructions 2822//===---------------------------------------------------------------------===// 2823 2824let ExeDomain = SSEPackedInt in { // SSE integer instructions 2825 2826/// PDI_binop_rm - Simple SSE2 binary operator. 2827multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, 2828 ValueType OpVT, RegisterClass RC, PatFrag memop_frag, 2829 X86MemOperand x86memop, OpndItins itins, 2830 bit IsCommutable, bit Is2Addr> { 2831 let isCommutable = IsCommutable in 2832 def rr : PDI<opc, MRMSrcReg, (outs RC:$dst), 2833 (ins RC:$src1, RC:$src2), 2834 !if(Is2Addr, 2835 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 2836 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 2837 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))], itins.rr>, 2838 Sched<[itins.Sched]>; 2839 def rm : PDI<opc, MRMSrcMem, (outs RC:$dst), 2840 (ins RC:$src1, x86memop:$src2), 2841 !if(Is2Addr, 2842 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 2843 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 2844 [(set RC:$dst, (OpVT (OpNode RC:$src1, 2845 (bitconvert (memop_frag addr:$src2)))))], 2846 itins.rm>, 2847 Sched<[itins.Sched.Folded, ReadAfterLd]>; 2848} 2849} // ExeDomain = SSEPackedInt 2850 2851multiclass PDI_binop_all<bits<8> 
opc, string OpcodeStr, SDNode Opcode, 2852 ValueType OpVT128, ValueType OpVT256, 2853 OpndItins itins, bit IsCommutable = 0> { 2854let Predicates = [HasAVX, NoVLX] in 2855 defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128, 2856 VR128, loadv2i64, i128mem, itins, IsCommutable, 0>, VEX_4V; 2857 2858let Constraints = "$src1 = $dst" in 2859 defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128, 2860 memopv2i64, i128mem, itins, IsCommutable, 1>; 2861 2862let Predicates = [HasAVX2, NoVLX] in 2863 defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, 2864 OpVT256, VR256, loadv4i64, i256mem, itins, 2865 IsCommutable, 0>, VEX_4V, VEX_L; 2866} 2867 2868// These are ordered here for pattern ordering requirements with the fp versions 2869 2870defm PAND : PDI_binop_all<0xDB, "pand", and, v2i64, v4i64, 2871 SSE_VEC_BIT_ITINS_P, 1>; 2872defm POR : PDI_binop_all<0xEB, "por", or, v2i64, v4i64, 2873 SSE_VEC_BIT_ITINS_P, 1>; 2874defm PXOR : PDI_binop_all<0xEF, "pxor", xor, v2i64, v4i64, 2875 SSE_VEC_BIT_ITINS_P, 1>; 2876defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64, 2877 SSE_VEC_BIT_ITINS_P, 0>; 2878 2879//===----------------------------------------------------------------------===// 2880// SSE 1 & 2 - Logical Instructions 2881//===----------------------------------------------------------------------===// 2882 2883/// sse12_fp_alias_pack_logical - SSE 1 & 2 aliased packed FP logical ops 2884/// 2885multiclass sse12_fp_alias_pack_logical<bits<8> opc, string OpcodeStr, 2886 SDNode OpNode, OpndItins itins> { 2887 defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, 2888 FR32, f32, f128mem, memopfsf32, SSEPackedSingle, itins, 0>, 2889 PS, VEX_4V; 2890 2891 defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, 2892 FR64, f64, f128mem, memopfsf64, SSEPackedDouble, itins, 0>, 2893 PD, VEX_4V; 2894 2895 let Constraints = "$src1 = $dst" in { 2896 defm PS : sse12_fp_packed<opc, 
!strconcat(OpcodeStr, "ps"), OpNode, FR32, 2897 f32, f128mem, memopfsf32, SSEPackedSingle, itins>, 2898 PS; 2899 2900 defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, FR64, 2901 f64, f128mem, memopfsf64, SSEPackedDouble, itins>, 2902 PD; 2903 } 2904} 2905 2906// Alias bitwise logical operations using SSE logical ops on packed FP values. 2907let isCodeGenOnly = 1 in { 2908 defm FsAND : sse12_fp_alias_pack_logical<0x54, "and", X86fand, 2909 SSE_BIT_ITINS_P>; 2910 defm FsOR : sse12_fp_alias_pack_logical<0x56, "or", X86for, 2911 SSE_BIT_ITINS_P>; 2912 defm FsXOR : sse12_fp_alias_pack_logical<0x57, "xor", X86fxor, 2913 SSE_BIT_ITINS_P>; 2914 2915 let isCommutable = 0 in 2916 defm FsANDN : sse12_fp_alias_pack_logical<0x55, "andn", X86fandn, 2917 SSE_BIT_ITINS_P>; 2918} 2919 2920/// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops 2921/// 2922multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr, 2923 SDNode OpNode> { 2924 let Predicates = [HasAVX, NoVLX] in { 2925 defm V#NAME#PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle, 2926 !strconcat(OpcodeStr, "ps"), f256mem, 2927 [(set VR256:$dst, (v4i64 (OpNode VR256:$src1, VR256:$src2)))], 2928 [(set VR256:$dst, (OpNode (bc_v4i64 (v8f32 VR256:$src1)), 2929 (loadv4i64 addr:$src2)))], 0>, PS, VEX_4V, VEX_L; 2930 2931 defm V#NAME#PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble, 2932 !strconcat(OpcodeStr, "pd"), f256mem, 2933 [(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)), 2934 (bc_v4i64 (v4f64 VR256:$src2))))], 2935 [(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)), 2936 (loadv4i64 addr:$src2)))], 0>, 2937 PD, VEX_4V, VEX_L; 2938 2939 // In AVX no need to add a pattern for 128-bit logical rr ps, because they 2940 // are all promoted to v2i64, and the patterns are covered by the int 2941 // version. This is needed in SSE only, because v2i64 isn't supported on 2942 // SSE1, but only on SSE2. 
2943 defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle, 2944 !strconcat(OpcodeStr, "ps"), f128mem, [], 2945 [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)), 2946 (loadv2i64 addr:$src2)))], 0>, PS, VEX_4V; 2947 2948 defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble, 2949 !strconcat(OpcodeStr, "pd"), f128mem, 2950 [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)), 2951 (bc_v2i64 (v2f64 VR128:$src2))))], 2952 [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)), 2953 (loadv2i64 addr:$src2)))], 0>, 2954 PD, VEX_4V; 2955 } 2956 2957 let Constraints = "$src1 = $dst" in { 2958 defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle, 2959 !strconcat(OpcodeStr, "ps"), f128mem, 2960 [(set VR128:$dst, (v2i64 (OpNode VR128:$src1, VR128:$src2)))], 2961 [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)), 2962 (memopv2i64 addr:$src2)))]>, PS; 2963 2964 defm PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble, 2965 !strconcat(OpcodeStr, "pd"), f128mem, 2966 [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)), 2967 (bc_v2i64 (v2f64 VR128:$src2))))], 2968 [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)), 2969 (memopv2i64 addr:$src2)))]>, PD; 2970 } 2971} 2972 2973defm AND : sse12_fp_packed_logical<0x54, "and", and>; 2974defm OR : sse12_fp_packed_logical<0x56, "or", or>; 2975defm XOR : sse12_fp_packed_logical<0x57, "xor", xor>; 2976let isCommutable = 0 in 2977 defm ANDN : sse12_fp_packed_logical<0x55, "andn", X86andnp>; 2978 2979// AVX1 requires type coercions in order to fold loads directly into logical 2980// operations. 
// On AVX1-only targets the v4i64 logical operations are legalized via the
// packed-FP forms (see the comment above), so to fold the 256-bit load we
// must match the integer op *after* it has been wrapped in a bitconvert
// back to v8f32, and select the corresponding packed-single VEX memory form.
let Predicates = [HasAVX1Only] in {
  def : Pat<(bc_v8f32 (and VR256:$src1, (loadv4i64 addr:$src2))),
            (VANDPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(bc_v8f32 (or VR256:$src1, (loadv4i64 addr:$src2))),
            (VORPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(bc_v8f32 (xor VR256:$src1, (loadv4i64 addr:$src2))),
            (VXORPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(bc_v8f32 (X86andnp VR256:$src1, (loadv4i64 addr:$src2))),
            (VANDNPSYrm VR256:$src1, addr:$src2)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Arithmetic Instructions
//===----------------------------------------------------------------------===//

/// basic_sse12_fp_binop_xxx - SSE 1 & 2 binops come in both scalar and
/// vector forms.
///
/// In addition, we also have a special variant of the scalar form here to
/// represent the associated intrinsic operation. This form is unlike the
/// plain scalar form, in that it takes an entire vector (instead of a scalar)
/// and leaves the top elements unmodified (therefore these cannot be commuted).
///
/// These three forms can each be reg+reg or reg+mem.
///

/// FIXME: once all 256-bit intrinsics are matched, cleanup and refactor those
/// classes below

/// Packed-FP binop forms (xxxPS/xxxPD and the 256-bit Y variants).
/// The [HasAVX, NoVLX] half emits the three-operand VEX encodings; the
/// legacy-SSE half is destructive, hence the "$src1 = $dst" tie.
multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr,
                                  SDNode OpNode, SizeItins itins> {
  let Predicates = [HasAVX, NoVLX] in {
  defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
                               VR128, v4f32, f128mem, loadv4f32,
                               SSEPackedSingle, itins.s, 0>, PS, VEX_4V;
  defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
                               VR128, v2f64, f128mem, loadv2f64,
                               SSEPackedDouble, itins.d, 0>, PD, VEX_4V;

  defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"),
                        OpNode, VR256, v8f32, f256mem, loadv8f32,
                        SSEPackedSingle, itins.s, 0>, PS, VEX_4V, VEX_L;
  defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"),
                        OpNode, VR256, v4f64, f256mem, loadv4f64,
                        SSEPackedDouble, itins.d, 0>, PD, VEX_4V, VEX_L;
  }

  // Legacy SSE encodings: two-operand, so the first source is tied to the
  // destination.
  let Constraints = "$src1 = $dst" in {
    defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128,
                              v4f32, f128mem, memopv4f32, SSEPackedSingle,
                              itins.s>, PS;
    defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128,
                              v2f64, f128mem, memopv2f64, SSEPackedDouble,
                              itins.d>, PD;
  }
}

/// Scalar binop forms (xxxSS/xxxSD) operating on FR32/FR64, again with an
/// AVX three-operand half and a tied legacy-SSE half.
multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
                                  SizeItins itins> {
  defm V#NAME#SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
                         OpNode, FR32, f32mem, itins.s, 0>, XS, VEX_4V, VEX_LIG;
  defm V#NAME#SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
                         OpNode, FR64, f64mem, itins.d, 0>, XD, VEX_4V, VEX_LIG;

  let Constraints = "$src1 = $dst" in {
    defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
                              OpNode, FR32, f32mem, itins.s>, XS;
    defm SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
                              OpNode, FR64, f64mem, itins.d>, XD;
  }
}

/// Intrinsic scalar binop forms: take a whole VR128 vector and update only
/// the low element, so they cannot be commuted (see the comment above).
/// NOTE(review): the ""/"2" and "_ss"/"_sd" string arguments presumably
/// assemble the intrinsic name inside sse12_fp_scalar_int — confirm there.
multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
                                      SizeItins itins> {
  defm V#NAME#SS : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
                   !strconcat(OpcodeStr, "ss"), "", "_ss", ssmem, sse_load_f32,
                   itins.s, 0>, XS, VEX_4V, VEX_LIG;
  defm V#NAME#SD : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
                   !strconcat(OpcodeStr, "sd"), "2", "_sd", sdmem, sse_load_f64,
                   itins.d, 0>, XD, VEX_4V, VEX_LIG;

  let Constraints = "$src1 = $dst" in {
    defm SS : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
                   !strconcat(OpcodeStr, "ss"), "", "_ss", ssmem, sse_load_f32,
                   itins.s>, XS;
    defm SD : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
                   !strconcat(OpcodeStr, "sd"), "2", "_sd", sdmem, sse_load_f64,
                   itins.d>, XD;
  }
}

// Binary Arithmetic instructions
defm ADD : basic_sse12_fp_binop_p<0x58, "add", fadd, SSE_ALU_ITINS_P>,
           basic_sse12_fp_binop_s<0x58, "add", fadd, SSE_ALU_ITINS_S>,
           basic_sse12_fp_binop_s_int<0x58, "add", SSE_ALU_ITINS_S>;
defm MUL : basic_sse12_fp_binop_p<0x59, "mul", fmul, SSE_MUL_ITINS_P>,
           basic_sse12_fp_binop_s<0x59, "mul", fmul, SSE_MUL_ITINS_S>,
           basic_sse12_fp_binop_s_int<0x59, "mul", SSE_MUL_ITINS_S>;
// SUB and DIV are trivially non-commutable; MAX and MIN are marked
// non-commutable as well because the hardware result depends on operand
// order for NaN inputs (the commutable variants are MAXC/MINC below).
let isCommutable = 0 in {
  defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", fsub, SSE_ALU_ITINS_P>,
             basic_sse12_fp_binop_s<0x5C, "sub", fsub, SSE_ALU_ITINS_S>,
             basic_sse12_fp_binop_s_int<0x5C, "sub", SSE_ALU_ITINS_S>;
  defm DIV : basic_sse12_fp_binop_p<0x5E, "div", fdiv, SSE_DIV_ITINS_P>,
             basic_sse12_fp_binop_s<0x5E, "div", fdiv, SSE_DIV_ITINS_S>,
             basic_sse12_fp_binop_s_int<0x5E, "div", SSE_DIV_ITINS_S>;
  defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SSE_ALU_ITINS_P>,
             basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SSE_ALU_ITINS_S>,
             basic_sse12_fp_binop_s_int<0x5F, "max", SSE_ALU_ITINS_S>;
  defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SSE_ALU_ITINS_P>,
             basic_sse12_fp_binop_s<0x5D, "min", X86fmin,
SSE_ALU_ITINS_S>, 3090 basic_sse12_fp_binop_s_int<0x5D, "min", SSE_ALU_ITINS_S>; 3091} 3092 3093let isCodeGenOnly = 1 in { 3094 defm MAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_P>, 3095 basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_S>; 3096 defm MINC: basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SSE_ALU_ITINS_P>, 3097 basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SSE_ALU_ITINS_S>; 3098} 3099 3100// Patterns used to select SSE scalar fp arithmetic instructions from 3101// a scalar fp operation followed by a blend. 3102// 3103// These patterns know, for example, how to select an ADDSS from a 3104// float add plus vector insert. 3105// 3106// The effect is that the backend no longer emits unnecessary vector 3107// insert instructions immediately after SSE scalar fp instructions 3108// like addss or mulss. 3109// 3110// For example, given the following code: 3111// __m128 foo(__m128 A, __m128 B) { 3112// A[0] += B[0]; 3113// return A; 3114// } 3115// 3116// previously we generated: 3117// addss %xmm0, %xmm1 3118// movss %xmm1, %xmm0 3119// 3120// we now generate: 3121// addss %xmm1, %xmm0 3122 3123let Predicates = [UseSSE1] in { 3124 def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fadd 3125 (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), 3126 FR32:$src))))), 3127 (ADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; 3128 def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fsub 3129 (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), 3130 FR32:$src))))), 3131 (SUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; 3132 def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fmul 3133 (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), 3134 FR32:$src))))), 3135 (MULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; 3136 def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fdiv 3137 (f32 (vector_extract (v4f32 
VR128:$dst), (iPTR 0))), 3138 FR32:$src))))), 3139 (DIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; 3140} 3141 3142let Predicates = [UseSSE2] in { 3143 // SSE2 patterns to select scalar double-precision fp arithmetic instructions 3144 def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fadd 3145 (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), 3146 FR64:$src))))), 3147 (ADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; 3148 def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fsub 3149 (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), 3150 FR64:$src))))), 3151 (SUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; 3152 def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fmul 3153 (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), 3154 FR64:$src))))), 3155 (MULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; 3156 def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fdiv 3157 (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), 3158 FR64:$src))))), 3159 (DIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; 3160} 3161 3162let Predicates = [UseSSE41] in { 3163 // If the subtarget has SSE4.1 but not AVX, the vector insert instruction is 3164 // lowered into a X86insertps or a X86Blendi rather than a X86Movss. When 3165 // selecting SSE scalar single-precision fp arithmetic instructions, make 3166 // sure that we correctly match them. 
3167 3168 def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector 3169 (fadd (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), 3170 FR32:$src))), (iPTR 0))), 3171 (ADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; 3172 def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector 3173 (fsub (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), 3174 FR32:$src))), (iPTR 0))), 3175 (SUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; 3176 def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector 3177 (fmul (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), 3178 FR32:$src))), (iPTR 0))), 3179 (MULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; 3180 def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector 3181 (fdiv (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), 3182 FR32:$src))), (iPTR 0))), 3183 (DIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; 3184 3185 def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fadd 3186 (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), 3187 FR32:$src))), (i8 1))), 3188 (ADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; 3189 def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fsub 3190 (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), 3191 FR32:$src))), (i8 1))), 3192 (SUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; 3193 def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fmul 3194 (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), 3195 FR32:$src))), (i8 1))), 3196 (MULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; 3197 def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fdiv 3198 (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), 3199 FR32:$src))), (i8 1))), 3200 (DIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; 3201 3202 def : Pat<(v2f64 (X86Blendi (v2f64 
VR128:$dst), (v2f64 (scalar_to_vector (fadd 3203 (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), 3204 FR64:$src))), (i8 1))), 3205 (ADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; 3206 def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fsub 3207 (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), 3208 FR64:$src))), (i8 1))), 3209 (SUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; 3210 def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fmul 3211 (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), 3212 FR64:$src))), (i8 1))), 3213 (MULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; 3214 def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fdiv 3215 (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), 3216 FR64:$src))), (i8 1))), 3217 (DIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; 3218 3219 def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fadd 3220 (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), 3221 FR64:$src))), (v2f64 VR128:$dst), (i8 2))), 3222 (ADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; 3223 def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fsub 3224 (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), 3225 FR64:$src))), (v2f64 VR128:$dst), (i8 2))), 3226 (SUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; 3227 def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fmul 3228 (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), 3229 FR64:$src))), (v2f64 VR128:$dst), (i8 2))), 3230 (MULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; 3231 def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fdiv 3232 (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), 3233 FR64:$src))), (v2f64 VR128:$dst), (i8 2))), 3234 (DIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; 3235} 3236 3237let Predicates = [HasAVX] in { 3238 // The following patterns select AVX Scalar single/double 
precision fp 3239 // arithmetic instructions. 3240 3241 def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fadd 3242 (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), 3243 FR64:$src))))), 3244 (VADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; 3245 def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fsub 3246 (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), 3247 FR64:$src))))), 3248 (VSUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; 3249 def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fmul 3250 (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), 3251 FR64:$src))))), 3252 (VMULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; 3253 def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fdiv 3254 (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), 3255 FR64:$src))))), 3256 (VDIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; 3257 def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector 3258 (fadd (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), 3259 FR32:$src))), (iPTR 0))), 3260 (VADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; 3261 def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector 3262 (fsub (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), 3263 FR32:$src))), (iPTR 0))), 3264 (VSUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; 3265 def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector 3266 (fmul (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), 3267 FR32:$src))), (iPTR 0))), 3268 (VMULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; 3269 def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector 3270 (fdiv (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), 3271 FR32:$src))), (iPTR 0))), 3272 (VDIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; 3273 3274 def : Pat<(v4f32 (X86Blendi 
(v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fadd 3275 (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), 3276 FR32:$src))), (i8 1))), 3277 (VADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; 3278 def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fsub 3279 (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), 3280 FR32:$src))), (i8 1))), 3281 (VSUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; 3282 def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fmul 3283 (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), 3284 FR32:$src))), (i8 1))), 3285 (VMULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; 3286 def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fdiv 3287 (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), 3288 FR32:$src))), (i8 1))), 3289 (VDIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; 3290 3291 def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fadd 3292 (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), 3293 FR64:$src))), (i8 1))), 3294 (VADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; 3295 def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fsub 3296 (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), 3297 FR64:$src))), (i8 1))), 3298 (VSUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; 3299 def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fmul 3300 (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), 3301 FR64:$src))), (i8 1))), 3302 (VMULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; 3303 def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fdiv 3304 (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), 3305 FR64:$src))), (i8 1))), 3306 (VDIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; 3307 3308 def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fadd 3309 (f64 (vector_extract 
(v2f64 VR128:$dst), (iPTR 0))), 3310 FR64:$src))), (v2f64 VR128:$dst), (i8 2))), 3311 (VADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; 3312 def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fsub 3313 (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), 3314 FR64:$src))), (v2f64 VR128:$dst), (i8 2))), 3315 (VSUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; 3316 def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fmul 3317 (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), 3318 FR64:$src))), (v2f64 VR128:$dst), (i8 2))), 3319 (VMULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; 3320 def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fdiv 3321 (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), 3322 FR64:$src))), (v2f64 VR128:$dst), (i8 2))), 3323 (VDIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; 3324} 3325 3326// Patterns used to select SSE scalar fp arithmetic instructions from 3327// a vector packed single/double fp operation followed by a vector insert. 3328// 3329// The effect is that the backend converts the packed fp instruction 3330// followed by a vector insert into a single SSE scalar fp instruction. 
//
// For example, given the following code:
//   __m128 foo(__m128 A, __m128 B) {
//     __m128 C = A + B;
//     return (__m128) {C[0], A[1], A[2], A[3]};
//   }
//
// previously we generated:
//   addps %xmm0, %xmm1
//   movss %xmm1, %xmm0
//
// we now generate:
//   addss %xmm1, %xmm0

// The *rr_Int instructions take full VR128 operands and only replace the low
// element of $dst, which is exactly the semantics of movss(dst, packed-op).
let Predicates = [UseSSE1] in {
  def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
                   (fadd (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
            (ADDSSrr_Int v4f32:$dst, v4f32:$src)>;
  def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
                   (fsub (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
            (SUBSSrr_Int v4f32:$dst, v4f32:$src)>;
  def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
                   (fmul (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
            (MULSSrr_Int v4f32:$dst, v4f32:$src)>;
  def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
                   (fdiv (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
            (DIVSSrr_Int v4f32:$dst, v4f32:$src)>;
}

let Predicates = [UseSSE2] in {
  // SSE2 patterns to select scalar double-precision fp arithmetic instructions
  // from a packed double-precision fp instruction plus movsd.

  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
                   (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
            (ADDSDrr_Int v2f64:$dst, v2f64:$src)>;
  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
                   (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
            (SUBSDrr_Int v2f64:$dst, v2f64:$src)>;
  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
                   (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
            (MULSDrr_Int v2f64:$dst, v2f64:$src)>;
  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
                   (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
            (DIVSDrr_Int v2f64:$dst, v2f64:$src)>;
}

let Predicates = [UseSSE41] in {
  // With SSE4.1 we may see these operations using X86Blendi rather than
  // X86Movs{s,d}.
  // Blend mask 1 selects element 0 from the second operand (the arithmetic
  // result) and the remaining elements from $dst — i.e. movss/movsd.
  def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst),
                  (fadd (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
            (ADDSSrr_Int v4f32:$dst, v4f32:$src)>;
  def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst),
                  (fsub (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
            (SUBSSrr_Int v4f32:$dst, v4f32:$src)>;
  def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst),
                  (fmul (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
            (MULSSrr_Int v4f32:$dst, v4f32:$src)>;
  def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst),
                  (fdiv (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
            (DIVSSrr_Int v4f32:$dst, v4f32:$src)>;

  def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
                  (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
            (ADDSDrr_Int v2f64:$dst, v2f64:$src)>;
  def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
                  (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
            (SUBSDrr_Int v2f64:$dst, v2f64:$src)>;
  def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
                  (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
            (MULSDrr_Int v2f64:$dst, v2f64:$src)>;
  def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
                  (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
            (DIVSDrr_Int v2f64:$dst, v2f64:$src)>;

  // Same blends with the operands commuted: mask 2 takes element 0 from the
  // first operand (the arithmetic result) and element 1 from $dst.
  def : Pat<(v2f64 (X86Blendi (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)),
                              (v2f64 VR128:$dst), (i8 2))),
            (ADDSDrr_Int v2f64:$dst, v2f64:$src)>;
  def : Pat<(v2f64 (X86Blendi (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)),
                              (v2f64 VR128:$dst), (i8 2))),
            (SUBSDrr_Int v2f64:$dst, v2f64:$src)>;
  def : Pat<(v2f64 (X86Blendi (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)),
                              (v2f64 VR128:$dst), (i8 2))),
            (MULSDrr_Int v2f64:$dst, v2f64:$src)>;
  def : Pat<(v2f64 (X86Blendi (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)),
                              (v2f64 VR128:$dst), (i8 2))),
            (DIVSDrr_Int v2f64:$dst, v2f64:$src)>;
}

let Predicates = [HasAVX] in {
  // The following patterns
select AVX Scalar single/double precision fp 3423 // arithmetic instructions from a packed single precision fp instruction 3424 // plus movss/movsd. 3425 3426 def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), 3427 (fadd (v4f32 VR128:$dst), (v4f32 VR128:$src)))), 3428 (VADDSSrr_Int v4f32:$dst, v4f32:$src)>; 3429 def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), 3430 (fsub (v4f32 VR128:$dst), (v4f32 VR128:$src)))), 3431 (VSUBSSrr_Int v4f32:$dst, v4f32:$src)>; 3432 def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), 3433 (fmul (v4f32 VR128:$dst), (v4f32 VR128:$src)))), 3434 (VMULSSrr_Int v4f32:$dst, v4f32:$src)>; 3435 def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), 3436 (fdiv (v4f32 VR128:$dst), (v4f32 VR128:$src)))), 3437 (VDIVSSrr_Int v4f32:$dst, v4f32:$src)>; 3438 def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), 3439 (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)))), 3440 (VADDSDrr_Int v2f64:$dst, v2f64:$src)>; 3441 def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), 3442 (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)))), 3443 (VSUBSDrr_Int v2f64:$dst, v2f64:$src)>; 3444 def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), 3445 (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)))), 3446 (VMULSDrr_Int v2f64:$dst, v2f64:$src)>; 3447 def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), 3448 (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)))), 3449 (VDIVSDrr_Int v2f64:$dst, v2f64:$src)>; 3450 3451 // Also handle X86Blendi-based patterns. 
3452 def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), 3453 (fadd (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), 3454 (VADDSSrr_Int v4f32:$dst, v4f32:$src)>; 3455 def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), 3456 (fsub (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), 3457 (VSUBSSrr_Int v4f32:$dst, v4f32:$src)>; 3458 def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), 3459 (fmul (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), 3460 (VMULSSrr_Int v4f32:$dst, v4f32:$src)>; 3461 def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), 3462 (fdiv (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), 3463 (VDIVSSrr_Int v4f32:$dst, v4f32:$src)>; 3464 3465 def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), 3466 (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), 3467 (VADDSDrr_Int v2f64:$dst, v2f64:$src)>; 3468 def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), 3469 (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), 3470 (VSUBSDrr_Int v2f64:$dst, v2f64:$src)>; 3471 def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), 3472 (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), 3473 (VMULSDrr_Int v2f64:$dst, v2f64:$src)>; 3474 def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), 3475 (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), 3476 (VDIVSDrr_Int v2f64:$dst, v2f64:$src)>; 3477 3478 def : Pat<(v2f64 (X86Blendi (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)), 3479 (v2f64 VR128:$dst), (i8 2))), 3480 (VADDSDrr_Int v2f64:$dst, v2f64:$src)>; 3481 def : Pat<(v2f64 (X86Blendi (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)), 3482 (v2f64 VR128:$dst), (i8 2))), 3483 (VSUBSDrr_Int v2f64:$dst, v2f64:$src)>; 3484 def : Pat<(v2f64 (X86Blendi (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)), 3485 (v2f64 VR128:$dst), (i8 2))), 3486 (VMULSDrr_Int v2f64:$dst, v2f64:$src)>; 3487 def : Pat<(v2f64 (X86Blendi (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)), 3488 (v2f64 VR128:$dst), (i8 2))), 3489 (VDIVSDrr_Int v2f64:$dst, v2f64:$src)>; 3490} 3491 3492/// Unop Arithmetic 3493/// In addition, we also have 
a special variant of the scalar form here to
/// represent the associated intrinsic operation. This form is unlike the
/// plain scalar form, in that it takes an entire vector (instead of a
/// scalar) and leaves the top elements undefined.
///
/// And, we have a special variant form for a full-vector intrinsic form.

// Itinerary pairs (register-register, register-memory) for the unary FP ops
// defined below, grouped under their InstrSchedModel write class.
let Sched = WriteFSqrt in {
def SSE_SQRTPS : OpndItins<
  IIC_SSE_SQRTPS_RR, IIC_SSE_SQRTPS_RM
>;

def SSE_SQRTSS : OpndItins<
  IIC_SSE_SQRTSS_RR, IIC_SSE_SQRTSS_RM
>;

def SSE_SQRTPD : OpndItins<
  IIC_SSE_SQRTPD_RR, IIC_SSE_SQRTPD_RM
>;

def SSE_SQRTSD : OpndItins<
  IIC_SSE_SQRTSD_RR, IIC_SSE_SQRTSD_RM
>;
}

// Reciprocal square-root estimate (rsqrtps/rsqrtss).
let Sched = WriteFRsqrt in {
def SSE_RSQRTPS : OpndItins<
  IIC_SSE_RSQRTPS_RR, IIC_SSE_RSQRTPS_RM
>;

def SSE_RSQRTSS : OpndItins<
  IIC_SSE_RSQRTSS_RR, IIC_SSE_RSQRTSS_RM
>;
}

// Reciprocal estimate (rcpps/rcpss).
let Sched = WriteFRcp in {
def SSE_RCPP : OpndItins<
  IIC_SSE_RCPP_RR, IIC_SSE_RCPP_RM
>;

def SSE_RCPS : OpndItins<
  IIC_SSE_RCPS_RR, IIC_SSE_RCPS_RM
>;
}

/// sse1_fp_unop_s - SSE1 unops in scalar form
/// For the non-AVX defs, we need $src1 to be tied to $dst because
/// the HW instructions are 2 operand / destructive.
3541multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, 3542 OpndItins itins> { 3543let Predicates = [HasAVX], hasSideEffects = 0 in { 3544 def V#NAME#SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), 3545 (ins FR32:$src1, FR32:$src2), 3546 !strconcat("v", OpcodeStr, 3547 "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 3548 []>, VEX_4V, VEX_LIG, Sched<[itins.Sched]>; 3549 let mayLoad = 1 in { 3550 def V#NAME#SSm : SSI<opc, MRMSrcMem, (outs FR32:$dst), 3551 (ins FR32:$src1,f32mem:$src2), 3552 !strconcat("v", OpcodeStr, 3553 "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 3554 []>, VEX_4V, VEX_LIG, 3555 Sched<[itins.Sched.Folded, ReadAfterLd]>; 3556 let isCodeGenOnly = 1 in 3557 def V#NAME#SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst), 3558 (ins VR128:$src1, ssmem:$src2), 3559 !strconcat("v", OpcodeStr, 3560 "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 3561 []>, VEX_4V, VEX_LIG, 3562 Sched<[itins.Sched.Folded, ReadAfterLd]>; 3563 } 3564} 3565 3566 def SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src), 3567 !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"), 3568 [(set FR32:$dst, (OpNode FR32:$src))]>, Sched<[itins.Sched]>; 3569 // For scalar unary operations, fold a load into the operation 3570 // only in OptForSize mode. It eliminates an instruction, but it also 3571 // eliminates a whole-register clobber (the load), so it introduces a 3572 // partial register update condition. 
3573 def SSm : I<opc, MRMSrcMem, (outs FR32:$dst), (ins f32mem:$src), 3574 !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"), 3575 [(set FR32:$dst, (OpNode (load addr:$src)))], itins.rm>, XS, 3576 Requires<[UseSSE1, OptForSize]>, Sched<[itins.Sched.Folded]>; 3577 let isCodeGenOnly = 1, Constraints = "$src1 = $dst" in { 3578 def SSr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst), 3579 (ins VR128:$src1, VR128:$src2), 3580 !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"), 3581 [], itins.rr>, Sched<[itins.Sched]>; 3582 let mayLoad = 1, hasSideEffects = 0 in 3583 def SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst), 3584 (ins VR128:$src1, ssmem:$src2), 3585 !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"), 3586 [], itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>; 3587 } 3588} 3589 3590/// sse1_fp_unop_p - SSE1 unops in packed form. 3591multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr, SDNode OpNode, 3592 OpndItins itins> { 3593let Predicates = [HasAVX] in { 3594 def V#NAME#PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 3595 !strconcat("v", OpcodeStr, 3596 "ps\t{$src, $dst|$dst, $src}"), 3597 [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))], 3598 itins.rr>, VEX, Sched<[itins.Sched]>; 3599 def V#NAME#PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 3600 !strconcat("v", OpcodeStr, 3601 "ps\t{$src, $dst|$dst, $src}"), 3602 [(set VR128:$dst, (OpNode (loadv4f32 addr:$src)))], 3603 itins.rm>, VEX, Sched<[itins.Sched.Folded]>; 3604 def V#NAME#PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), 3605 !strconcat("v", OpcodeStr, 3606 "ps\t{$src, $dst|$dst, $src}"), 3607 [(set VR256:$dst, (v8f32 (OpNode VR256:$src)))], 3608 itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>; 3609 def V#NAME#PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), 3610 !strconcat("v", OpcodeStr, 3611 "ps\t{$src, $dst|$dst, $src}"), 3612 [(set VR256:$dst, (OpNode (loadv8f32 addr:$src)))], 3613 itins.rm>, VEX, VEX_L, 
Sched<[itins.Sched.Folded]>;  // (continues the AVX 256-bit form begun above)
}

  // SSE (non-VEX) packed-single forms: register-register and memory-folded.
  def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
                [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))], itins.rr>,
            Sched<[itins.Sched]>;
  def PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
                [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))], itins.rm>,
            Sched<[itins.Sched.Folded]>;
}

/// sse1_fp_unop_p_int - SSE1 intrinsics unops in packed forms.
/// Emits intrinsic-selected (isCodeGenOnly) variants only:
///  - V4F32Int: 128-bit intrinsic, used for both the VEX and SSE encodings.
///  - V8F32Int: 256-bit intrinsic, used for the VEX_L (YMM) encoding.
multiclass sse1_fp_unop_p_int<bits<8> opc, string OpcodeStr,
                              Intrinsic V4F32Int, Intrinsic V8F32Int,
                              OpndItins itins> {
let isCodeGenOnly = 1 in {
let Predicates = [HasAVX] in {
  def V#NAME#PSr_Int : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                           !strconcat("v", OpcodeStr,
                                      "ps\t{$src, $dst|$dst, $src}"),
                           [(set VR128:$dst, (V4F32Int VR128:$src))],
                           itins.rr>, VEX, Sched<[itins.Sched]>;
  def V#NAME#PSm_Int : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                           !strconcat("v", OpcodeStr,
                                      "ps\t{$src, $dst|$dst, $src}"),
                           [(set VR128:$dst, (V4F32Int (loadv4f32 addr:$src)))],
                           itins.rm>, VEX, Sched<[itins.Sched.Folded]>;
  def V#NAME#PSYr_Int : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                            !strconcat("v", OpcodeStr,
                                       "ps\t{$src, $dst|$dst, $src}"),
                            [(set VR256:$dst, (V8F32Int VR256:$src))],
                            itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>;
  def V#NAME#PSYm_Int : PSI<opc, MRMSrcMem, (outs VR256:$dst),
                            (ins f256mem:$src),
                            !strconcat("v", OpcodeStr,
                                       "ps\t{$src, $dst|$dst, $src}"),
                            [(set VR256:$dst, (V8F32Int (loadv8f32 addr:$src)))],
                            itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>;
}

  // Legacy SSE encodings of the 128-bit intrinsic forms.
  def PSr_Int : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                    !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (V4F32Int VR128:$src))],
                    itins.rr>, Sched<[itins.Sched]>;
  def PSm_Int : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                    !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (V4F32Int (memopv4f32 addr:$src)))],
                    itins.rm>, Sched<[itins.Sched.Folded]>;
} // isCodeGenOnly = 1
}

/// sse2_fp_unop_s - SSE2 unops in scalar form.
/// The VEX forms are pattern-less ([]): they take two sources because the VEX
/// encoding merges the upper bits of $src1 into $dst; selection happens via
/// the explicit Pat<> definitions further below.
multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr,
                          SDNode OpNode, Intrinsic F64Int, OpndItins itins> {
let Predicates = [HasAVX], hasSideEffects = 0 in {
  def V#NAME#SDr : SDI<opc, MRMSrcReg, (outs FR64:$dst),
                       (ins FR64:$src1, FR64:$src2),
                       !strconcat("v", OpcodeStr,
                                  "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                       []>, VEX_4V, VEX_LIG, Sched<[itins.Sched]>;
  let mayLoad = 1 in {
  def V#NAME#SDm : SDI<opc, MRMSrcMem, (outs FR64:$dst),
                       (ins FR64:$src1,f64mem:$src2),
                       !strconcat("v", OpcodeStr,
                                  "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                       []>, VEX_4V, VEX_LIG,
                   Sched<[itins.Sched.Folded, ReadAfterLd]>;
  let isCodeGenOnly = 1 in
  def V#NAME#SDm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst),
                           (ins VR128:$src1, sdmem:$src2),
                           !strconcat("v", OpcodeStr,
                                      "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                           []>, VEX_4V, VEX_LIG,
                       Sched<[itins.Sched.Folded, ReadAfterLd]>;
  }
}

  def SDr : SDI<opc, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src),
                !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
                [(set FR64:$dst, (OpNode FR64:$src))], itins.rr>,
            Sched<[itins.Sched]>;
  // See the comments in sse1_fp_unop_s for why this is OptForSize.
  def SDm : I<opc, MRMSrcMem, (outs FR64:$dst), (ins f64mem:$src),
              !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
              [(set FR64:$dst, (OpNode (load addr:$src)))], itins.rm>, XD,
            Requires<[UseSSE2, OptForSize]>, Sched<[itins.Sched.Folded]>;
let isCodeGenOnly = 1 in {
  // Intrinsic (whole-XMM) forms selected directly from F64Int.
  def SDr_Int : SDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                    !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (F64Int VR128:$src))], itins.rr>,
                Sched<[itins.Sched]>;
  def SDm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst), (ins sdmem:$src),
                    !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (F64Int sse_load_f64:$src))], itins.rm>,
                Sched<[itins.Sched.Folded]>;
}
}

/// sse2_fp_unop_p - SSE2 unops in vector forms.
/// Emits VEX 128/256-bit forms (predicated on HasAVX) plus the legacy SSE
/// 128-bit forms, all selected from OpNode.
multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr,
                          SDNode OpNode, OpndItins itins> {
let Predicates = [HasAVX] in {
  def V#NAME#PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       !strconcat("v", OpcodeStr,
                                  "pd\t{$src, $dst|$dst, $src}"),
                       [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))],
                       itins.rr>, VEX, Sched<[itins.Sched]>;
  def V#NAME#PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       !strconcat("v", OpcodeStr,
                                  "pd\t{$src, $dst|$dst, $src}"),
                       [(set VR128:$dst, (OpNode (loadv2f64 addr:$src)))],
                       itins.rm>, VEX, Sched<[itins.Sched.Folded]>;
  def V#NAME#PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                        !strconcat("v", OpcodeStr,
                                   "pd\t{$src, $dst|$dst, $src}"),
                        [(set VR256:$dst, (v4f64 (OpNode VR256:$src)))],
                        itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>;
  def V#NAME#PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                        !strconcat("v", OpcodeStr,
                                   "pd\t{$src, $dst|$dst, $src}"),
                        [(set VR256:$dst, (OpNode (loadv4f64 addr:$src)))],
                        itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>;
}

  def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
                [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))], itins.rr>,
            Sched<[itins.Sched]>;
  def PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
                [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))], itins.rm>,
            Sched<[itins.Sched.Folded]>;
}

// Square root.
defm SQRT  : sse1_fp_unop_s<0x51, "sqrt",  fsqrt, SSE_SQRTSS>,
             sse1_fp_unop_p<0x51, "sqrt",  fsqrt, SSE_SQRTPS>,
             sse2_fp_unop_s<0x51, "sqrt",  fsqrt, int_x86_sse2_sqrt_sd,
                            SSE_SQRTSD>,
             sse2_fp_unop_p<0x51, "sqrt",  fsqrt, SSE_SQRTPD>;

// Reciprocal approximations. Note that these typically require refinement
// in order to obtain suitable precision.
defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SSE_RSQRTSS>,
             sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_RSQRTPS>,
             sse1_fp_unop_p_int<0x52, "rsqrt", int_x86_sse_rsqrt_ps,
                                int_x86_avx_rsqrt_ps_256, SSE_RSQRTPS>;
defm RCP   : sse1_fp_unop_s<0x53, "rcp", X86frcp, SSE_RCPS>,
             sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP>,
             sse1_fp_unop_p_int<0x53, "rcp", int_x86_sse_rcp_ps,
                                int_x86_avx_rcp_ps_256, SSE_RCPP>;

// Select the pattern-less VEX scalar unops defined above. IMPLICIT_DEF feeds
// the pass-through operand since the upper bits are don't-care for FR32/FR64.
let Predicates = [UseAVX] in {
  def : Pat<(f32 (fsqrt FR32:$src)),
            (VSQRTSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>;
  def : Pat<(f32 (fsqrt (load addr:$src))),
            (VSQRTSSm (f32 (IMPLICIT_DEF)), addr:$src)>,
            Requires<[HasAVX, OptForSize]>;
  def : Pat<(f64 (fsqrt FR64:$src)),
            (VSQRTSDr (f64 (IMPLICIT_DEF)), FR64:$src)>, Requires<[HasAVX]>;
  def : Pat<(f64 (fsqrt (load addr:$src))),
            (VSQRTSDm (f64 (IMPLICIT_DEF)), addr:$src)>,
            Requires<[HasAVX, OptForSize]>;

  def : Pat<(f32 (X86frsqrt FR32:$src)),
            (VRSQRTSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>;
  def : Pat<(f32 (X86frsqrt (load addr:$src))),
            (VRSQRTSSm (f32 (IMPLICIT_DEF)), addr:$src)>,
            Requires<[HasAVX, OptForSize]>;

  def : Pat<(f32 (X86frcp FR32:$src)),
            (VRCPSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>;
  def : Pat<(f32 (X86frcp (load addr:$src))),
            (VRCPSSm (f32 (IMPLICIT_DEF)), addr:$src)>,
            Requires<[HasAVX, OptForSize]>;
}
// Intrinsic selections for the VEX encodings: bounce through the scalar
// register class so the two-operand VEX instruction can be used.
let Predicates = [UseAVX] in {
  def : Pat<(int_x86_sse_sqrt_ss VR128:$src),
            (COPY_TO_REGCLASS (VSQRTSSr (f32 (IMPLICIT_DEF)),
                                        (COPY_TO_REGCLASS VR128:$src, FR32)),
                              VR128)>;
  def : Pat<(int_x86_sse_sqrt_ss sse_load_f32:$src),
            (VSQRTSSm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>;

  def : Pat<(int_x86_sse2_sqrt_sd VR128:$src),
            (COPY_TO_REGCLASS (VSQRTSDr (f64 (IMPLICIT_DEF)),
                                        (COPY_TO_REGCLASS VR128:$src, FR64)),
                              VR128)>;
  def : Pat<(int_x86_sse2_sqrt_sd sse_load_f64:$src),
            (VSQRTSDm_Int (v2f64 (IMPLICIT_DEF)), sse_load_f64:$src)>;
}

let Predicates = [HasAVX] in {
  def : Pat<(int_x86_sse_rsqrt_ss VR128:$src),
            (COPY_TO_REGCLASS (VRSQRTSSr (f32 (IMPLICIT_DEF)),
                                         (COPY_TO_REGCLASS VR128:$src, FR32)),
                              VR128)>;
  def : Pat<(int_x86_sse_rsqrt_ss sse_load_f32:$src),
            (VRSQRTSSm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>;

  def : Pat<(int_x86_sse_rcp_ss VR128:$src),
            (COPY_TO_REGCLASS (VRCPSSr (f32 (IMPLICIT_DEF)),
                                       (COPY_TO_REGCLASS VR128:$src, FR32)),
                              VR128)>;
  def : Pat<(int_x86_sse_rcp_ss sse_load_f32:$src),
            (VRCPSSm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>;
}

// These are unary operations, but they are modeled as having 2 source operands
// because the high elements of the destination are unchanged in SSE.
let Predicates = [UseSSE1] in {
  // SSE _Int forms: repeat the source as both operands (see comment above).
  def : Pat<(int_x86_sse_rsqrt_ss VR128:$src),
            (RSQRTSSr_Int VR128:$src, VR128:$src)>;
  def : Pat<(int_x86_sse_rcp_ss VR128:$src),
            (RCPSSr_Int VR128:$src, VR128:$src)>;
  def : Pat<(int_x86_sse_sqrt_ss VR128:$src),
            (SQRTSSr_Int VR128:$src, VR128:$src)>;
}

// There is no f64 version of the reciprocal approximation instructions.

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Non-temporal stores
//===----------------------------------------------------------------------===//

let AddedComplexity = 400 in { // Prefer non-temporal versions
let SchedRW = [WriteStore] in {
let Predicates = [HasAVX, NoVLX] in {
def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs),
                      (ins f128mem:$dst, VR128:$src),
                      "movntps\t{$src, $dst|$dst, $src}",
                      [(alignednontemporalstore (v4f32 VR128:$src),
                                                addr:$dst)],
                      IIC_SSE_MOVNT>, VEX;
def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs),
                      (ins f128mem:$dst, VR128:$src),
                      "movntpd\t{$src, $dst|$dst, $src}",
                      [(alignednontemporalstore (v2f64 VR128:$src),
                                                addr:$dst)],
                      IIC_SSE_MOVNT>, VEX;

let ExeDomain = SSEPackedInt in
def VMOVNTDQmr : VPDI<0xE7, MRMDestMem, (outs),
                      (ins f128mem:$dst, VR128:$src),
                      "movntdq\t{$src, $dst|$dst, $src}",
                      [(alignednontemporalstore (v2i64 VR128:$src),
                                                addr:$dst)],
                      IIC_SSE_MOVNT>, VEX;

def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs),
                       (ins f256mem:$dst, VR256:$src),
                       "movntps\t{$src, $dst|$dst, $src}",
                       [(alignednontemporalstore (v8f32 VR256:$src),
                                                 addr:$dst)],
                       IIC_SSE_MOVNT>, VEX, VEX_L;
def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs),
                       (ins f256mem:$dst, VR256:$src),
                       "movntpd\t{$src, $dst|$dst, $src}",
                       [(alignednontemporalstore (v4f64 VR256:$src),
                                                 addr:$dst)],
                       IIC_SSE_MOVNT>, VEX, VEX_L;
let ExeDomain = SSEPackedInt in
def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs),
                       (ins f256mem:$dst, VR256:$src),
                       "movntdq\t{$src, $dst|$dst, $src}",
                       [(alignednontemporalstore (v4i64 VR256:$src),
                                                 addr:$dst)],
                       IIC_SSE_MOVNT>, VEX, VEX_L;
}

def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                    "movntps\t{$src, $dst|$dst, $src}",
                    [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)],
                    IIC_SSE_MOVNT>;
def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                    "movntpd\t{$src, $dst|$dst, $src}",
                    [(alignednontemporalstore(v2f64 VR128:$src), addr:$dst)],
                    IIC_SSE_MOVNT>;

let ExeDomain = SSEPackedInt in
def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                    "movntdq\t{$src, $dst|$dst, $src}",
                    [(alignednontemporalstore (v2i64 VR128:$src), addr:$dst)],
                    IIC_SSE_MOVNT>;

// There is no AVX form for instructions below this point
def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
                 "movnti{l}\t{$src, $dst|$dst, $src}",
                 [(nontemporalstore (i32 GR32:$src), addr:$dst)],
                 IIC_SSE_MOVNT>,
               PS, Requires<[HasSSE2]>;
def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
                     "movnti{q}\t{$src, $dst|$dst, $src}",
                     [(nontemporalstore (i64 GR64:$src), addr:$dst)],
                     IIC_SSE_MOVNT>,
                  PS, Requires<[HasSSE2]>;
} // SchedRW = [WriteStore]

// v4i32 non-temporal stores reuse the PS encoding (no separate integer form
// for this type here).
let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
            (VMOVNTPSmr addr:$dst, VR128:$src)>;
}

def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
          (MOVNTPSmr addr:$dst, VR128:$src)>;

} // AddedComplexity

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Prefetch and memory fence
//===----------------------------------------------------------------------===//

// Prefetch intrinsic. The third prefetch operand encodes the locality hint
// (3 = T0 ... 0 = NTA); the fourth marks these as read prefetches.
let Predicates = [HasSSE1], SchedRW = [WriteLoad] in {
def PREFETCHT0   : I<0x18, MRM1m, (outs), (ins i8mem:$src),
    "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3), (i32 1))],
    IIC_SSE_PREFETCH>, TB;
def PREFETCHT1   : I<0x18, MRM2m, (outs), (ins i8mem:$src),
    "prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2), (i32 1))],
    IIC_SSE_PREFETCH>, TB;
def PREFETCHT2   : I<0x18, MRM3m, (outs), (ins i8mem:$src),
    "prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1), (i32 1))],
    IIC_SSE_PREFETCH>, TB;
def PREFETCHNTA  : I<0x18, MRM0m, (outs), (ins i8mem:$src),
    "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0), (i32 1))],
    IIC_SSE_PREFETCH>, TB;
}

// FIXME: How should flush instruction be modeled?
let SchedRW = [WriteLoad] in {
// Flush cache
def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
               "clflush\t$src", [(int_x86_sse2_clflush addr:$src)],
               IIC_SSE_PREFETCH>, TB, Requires<[HasSSE2]>;
}

let SchedRW = [WriteNop] in {
// Pause. This "instruction" is encoded as "rep; nop", so even though it
// was introduced with SSE2, it's backward compatible.
def PAUSE : I<0x90, RawFrm, (outs), (ins),
              "pause", [(int_x86_sse2_pause)], IIC_SSE_PAUSE>,
              OBXS, Requires<[HasSSE2]>;
}

let SchedRW = [WriteFence] in {
// Load, store, and memory fence
def SFENCE : I<0xAE, MRM_F8, (outs), (ins),
               "sfence", [(int_x86_sse_sfence)], IIC_SSE_SFENCE>,
               TB, Requires<[HasSSE1]>;
def LFENCE : I<0xAE, MRM_E8, (outs), (ins),
               "lfence", [(int_x86_sse2_lfence)], IIC_SSE_LFENCE>,
               TB, Requires<[HasSSE2]>;
def MFENCE : I<0xAE, MRM_F0, (outs), (ins),
               "mfence", [(int_x86_sse2_mfence)], IIC_SSE_MFENCE>,
               TB, Requires<[HasSSE2]>;
} // SchedRW

def : Pat<(X86SFence), (SFENCE)>;
def : Pat<(X86LFence), (LFENCE)>;
def : Pat<(X86MFence), (MFENCE)>;

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Load/Store XCSR register
//===----------------------------------------------------------------------===//

def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src),
                    "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)],
                    IIC_SSE_LDMXCSR>, VEX, Sched<[WriteLoad]>;
def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
                    "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)],
                    IIC_SSE_STMXCSR>, VEX, Sched<[WriteStore]>;

let Predicates = [UseSSE1] in {
def LDMXCSR : I<0xAE, MRM2m, (outs), (ins i32mem:$src),
                "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)],
                IIC_SSE_LDMXCSR>, TB, Sched<[WriteLoad]>;
def STMXCSR : I<0xAE, MRM3m, (outs), (ins i32mem:$dst),
                "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)],
                IIC_SSE_STMXCSR>, TB, Sched<[WriteStore]>;
}

//===---------------------------------------------------------------------===//
// SSE2 - Move Aligned/Unaligned Packed Integer Instructions
//===---------------------------------------------------------------------===//

let ExeDomain = SSEPackedInt in { // SSE integer instructions

// Register-register moves have no patterns; copies are inserted by the
// register allocator, so these exist for encoding/disassembly.
let hasSideEffects = 0, SchedRW = [WriteMove] in {
def VMOVDQArr  : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                      "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>,
                      VEX;
def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                      "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>,
                      VEX, VEX_L;
def VMOVDQUrr  : VSSI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                      "movdqu\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVU_P_RR>,
                      VEX;
def VMOVDQUYrr : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                      "movdqu\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVU_P_RR>,
                      VEX, VEX_L;
}

// For Disassembler (the 0x7F store-direction encodings of the same moves).
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
    SchedRW = [WriteMove] in {
def VMOVDQArr_REV  : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                          "movdqa\t{$src, $dst|$dst, $src}", [],
                          IIC_SSE_MOVA_P_RR>,
                          VEX;
def VMOVDQAYrr_REV : VPDI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
                          "movdqa\t{$src, $dst|$dst, $src}", [],
                          IIC_SSE_MOVA_P_RR>, VEX, VEX_L;
def VMOVDQUrr_REV  : VSSI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                          "movdqu\t{$src, $dst|$dst, $src}", [],
                          IIC_SSE_MOVU_P_RR>,
                          VEX;
def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
                          "movdqu\t{$src, $dst|$dst, $src}", [],
                          IIC_SSE_MOVU_P_RR>, VEX, VEX_L;
}

let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
    hasSideEffects = 0, SchedRW = [WriteLoad] in {
def VMOVDQArm  : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                      "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RM>,
                      VEX;
def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
                      "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RM>,
                      VEX, VEX_L;
let Predicates = [HasAVX] in {
  def VMOVDQUrm  : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                     "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_RM>,
                     XS, VEX;
  def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
                     "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_RM>,
                     XS, VEX, VEX_L;
}
}

let mayStore = 1, hasSideEffects = 0, SchedRW = [WriteStore] in {
def VMOVDQAmr  : VPDI<0x7F, MRMDestMem, (outs),
                      (ins i128mem:$dst, VR128:$src),
                      "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_MR>,
                      VEX;
def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs),
                      (ins i256mem:$dst, VR256:$src),
                      "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_MR>,
                      VEX, VEX_L;
let Predicates = [HasAVX] in {
def VMOVDQUmr  : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
                   "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_MR>,
                   XS, VEX;
def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src),
                   "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_MR>,
                   XS, VEX, VEX_L;
}
}

let SchedRW = [WriteMove] in {
let hasSideEffects = 0 in
def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                   "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>;

def MOVDQUrr :   I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                   "movdqu\t{$src, $dst|$dst, $src}",
                   [], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>;

// For Disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                       "movdqa\t{$src, $dst|$dst, $src}", [],
                       IIC_SSE_MOVA_P_RR>;

def MOVDQUrr_REV :   I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                       "movdqu\t{$src, $dst|$dst, $src}",
                       [], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>;
}
} // SchedRW

let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
    hasSideEffects = 0, SchedRW = [WriteLoad] in {
def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                   "movdqa\t{$src, $dst|$dst, $src}",
                   [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/],
                   IIC_SSE_MOVA_P_RM>;
def MOVDQUrm :   I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                   "movdqu\t{$src, $dst|$dst, $src}",
                   [/*(set VR128:$dst, (loadv2i64 addr:$src))*/],
                   IIC_SSE_MOVU_P_RM>,
                 XS, Requires<[UseSSE2]>;
}

let mayStore = 1, hasSideEffects = 0, SchedRW = [WriteStore] in {
def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
                   "movdqa\t{$src, $dst|$dst, $src}",
                   [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/],
                   IIC_SSE_MOVA_P_MR>;
def MOVDQUmr :   I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
                   "movdqu\t{$src, $dst|$dst, $src}",
                   [/*(store (v2i64 VR128:$src), addr:$dst)*/],
                   IIC_SSE_MOVU_P_MR>,
                 XS, Requires<[UseSSE2]>;
}

} // ExeDomain = SSEPackedInt

let Predicates = [HasAVX] in {
  def : Pat<(int_x86_sse2_storeu_dq addr:$dst, VR128:$src),
            (VMOVDQUmr addr:$dst, VR128:$src)>;
  def : Pat<(int_x86_avx_storeu_dq_256 addr:$dst, VR256:$src),
            (VMOVDQUYmr addr:$dst, VR256:$src)>;
}
let Predicates = [UseSSE2] in
def : Pat<(int_x86_sse2_storeu_dq addr:$dst, VR128:$src),
          (MOVDQUmr addr:$dst, VR128:$src)>;

//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Arithmetic Instructions
//===---------------------------------------------------------------------===//

let Sched = WriteVecIMul in
def SSE_PMADD : OpndItins<
  IIC_SSE_PMADD, IIC_SSE_PMADD
>;

let ExeDomain = SSEPackedInt in { // SSE integer instructions

/// PDI_binop_rm_int - Intrinsic-selected packed-integer binop, rr and
/// (bitconvert-wrapped) rm forms. Is2Addr selects the 2-operand SSE asm
/// string versus the 3-operand VEX one.
multiclass PDI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId,
                            RegisterClass RC, PatFrag memop_frag,
                            X86MemOperand x86memop,
                            OpndItins itins,
                            bit IsCommutable = 0,
                            bit Is2Addr = 1> {
  let isCommutable = IsCommutable in
  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (IntId RC:$src1, RC:$src2))], itins.rr>,
      Sched<[itins.Sched]>;
  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (IntId RC:$src1, (bitconvert (memop_frag addr:$src2))))],
       itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
}

/// PDI_binop_all_int - Instantiates VEX 128-bit, legacy SSE 128-bit, and
/// VEX 256-bit (AVX2) variants of an intrinsic binop in one go.
multiclass PDI_binop_all_int<bits<8> opc, string OpcodeStr, Intrinsic IntId128,
                             Intrinsic IntId256, OpndItins itins,
                             bit IsCommutable = 0> {
let Predicates = [HasAVX] in
  defm V#NAME : PDI_binop_rm_int<opc, !strconcat("v", OpcodeStr), IntId128,
                                 VR128, loadv2i64, i128mem, itins,
                                 IsCommutable, 0>, VEX_4V;

let Constraints = "$src1 = $dst" in
  defm NAME : PDI_binop_rm_int<opc, OpcodeStr, IntId128, VR128, memopv2i64,
                               i128mem, itins, IsCommutable, 1>;

let Predicates = [HasAVX2] in
  defm V#NAME#Y : PDI_binop_rm_int<opc, !strconcat("v", OpcodeStr), IntId256,
                                   VR256, loadv4i64, i256mem, itins,
                                   IsCommutable, 0>, VEX_4V, VEX_L;
}

/// PDI_binop_rmi - Packed shifts: rr/rm take the shift count in an XMM
/// register or memory (selected via OpNode), ri takes an immediate count
/// (selected via OpNode2, second opcode/Format).
multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm,
                         string OpcodeStr, SDNode OpNode,
                         SDNode OpNode2, RegisterClass RC,
                         ValueType DstVT, ValueType SrcVT, PatFrag bc_frag,
                         ShiftOpndItins itins,
                         bit Is2Addr = 1> {
  // src2 is always 128-bit
  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, VR128:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (DstVT (OpNode RC:$src1, (SrcVT VR128:$src2))))],
        itins.rr>, Sched<[WriteVecShift]>;
  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, i128mem:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (DstVT (OpNode RC:$src1,
                       (bc_frag (memopv2i64 addr:$src2)))))], itins.rm>,
      Sched<[WriteVecShiftLd, ReadAfterLd]>;
  def ri : PDIi8<opc2, ImmForm, (outs RC:$dst),
       (ins RC:$src1, i8imm:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i8 imm:$src2))))], itins.ri>,
       Sched<[WriteVecShift]>;
}

/// PDI_binop_rm2 - Simple SSE2 binary operator with different src and dst types
multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
                         ValueType DstVT, ValueType SrcVT, RegisterClass RC,
                         PatFrag memop_frag, X86MemOperand x86memop,
                         OpndItins itins,
                         bit IsCommutable = 0, bit Is2Addr = 1> {
  let isCommutable = IsCommutable in
  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>,
       Sched<[itins.Sched]>;
  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1),
                       (bitconvert (memop_frag addr:$src2)))))]>,
       Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
} // ExeDomain = SSEPackedInt

defm PADDB   : PDI_binop_all<0xFC, "paddb", add, v16i8, v32i8,
                             SSE_INTALU_ITINS_P, 1>;
defm PADDW   : PDI_binop_all<0xFD, "paddw", add, v8i16, v16i16,
                             SSE_INTALU_ITINS_P, 1>;
defm PADDD   : PDI_binop_all<0xFE, "paddd", add, v4i32, v8i32,
                             SSE_INTALU_ITINS_P, 1>;
defm PADDQ   : PDI_binop_all<0xD4, "paddq", add, v2i64, v4i64,
                             SSE_INTALUQ_ITINS_P, 1>;
defm PMULLW  : PDI_binop_all<0xD5, "pmullw", mul, v8i16, v16i16,
                             SSE_INTMUL_ITINS_P, 1>;
defm PMULHUW : PDI_binop_all<0xE4, "pmulhuw", mulhu, v8i16, v16i16,
                             SSE_INTMUL_ITINS_P, 1>;
defm PMULHW  : PDI_binop_all<0xE5, "pmulhw", mulhs, v8i16, v16i16,
                             SSE_INTMUL_ITINS_P, 1>;
defm PSUBB   : PDI_binop_all<0xF8, "psubb", sub, v16i8, v32i8,
                             SSE_INTALU_ITINS_P, 0>;
defm PSUBW   : PDI_binop_all<0xF9, "psubw", sub, v8i16, v16i16,
                             SSE_INTALU_ITINS_P, 0>;
defm PSUBD   : PDI_binop_all<0xFA, "psubd", sub, v4i32, v8i32,
                             SSE_INTALU_ITINS_P, 0>;
defm PSUBQ   : PDI_binop_all<0xFB, "psubq", sub, v2i64, v4i64,
                             SSE_INTALUQ_ITINS_P, 0>;
defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", X86subus, v16i8, v32i8,
                             SSE_INTALU_ITINS_P, 0>;
defm PSUBUSW : PDI_binop_all<0xD9, "psubusw", X86subus, v8i16, v16i16,
                             SSE_INTALU_ITINS_P, 0>;
defm PMINUB  : PDI_binop_all<0xDA, "pminub", X86umin, v16i8, v32i8,
                             SSE_INTALU_ITINS_P, 1>;
defm PMINSW  : PDI_binop_all<0xEA, "pminsw", X86smin, v8i16, v16i16,
                             SSE_INTALU_ITINS_P, 1>;
defm PMAXUB  : PDI_binop_all<0xDE, "pmaxub", X86umax, v16i8, v32i8,
                             SSE_INTALU_ITINS_P, 1>;
defm PMAXSW  : PDI_binop_all<0xEE, "pmaxsw", X86smax, v8i16, v16i16,
                             SSE_INTALU_ITINS_P, 1>;

// Intrinsic forms
defm PSUBSB  : PDI_binop_all_int<0xE8, "psubsb", int_x86_sse2_psubs_b,
                                 int_x86_avx2_psubs_b, SSE_INTALU_ITINS_P, 0>;
defm PSUBSW  : PDI_binop_all_int<0xE9, "psubsw" , int_x86_sse2_psubs_w,
                                 int_x86_avx2_psubs_w, SSE_INTALU_ITINS_P, 0>;
defm PADDSB  : PDI_binop_all_int<0xEC, "paddsb" , int_x86_sse2_padds_b,
                                 int_x86_avx2_padds_b, SSE_INTALU_ITINS_P, 1>;
defm PADDSW  : PDI_binop_all_int<0xED, "paddsw" , int_x86_sse2_padds_w,
                                 int_x86_avx2_padds_w, SSE_INTALU_ITINS_P, 1>;
defm PADDUSB : PDI_binop_all_int<0xDC, "paddusb", int_x86_sse2_paddus_b,
                                 int_x86_avx2_paddus_b, SSE_INTALU_ITINS_P, 1>;
defm PADDUSW : PDI_binop_all_int<0xDD, "paddusw", int_x86_sse2_paddus_w,
                                 int_x86_avx2_paddus_w, SSE_INTALU_ITINS_P, 1>;
defm PMADDWD : PDI_binop_all_int<0xF5, "pmaddwd", int_x86_sse2_pmadd_wd,
                                 int_x86_avx2_pmadd_wd, SSE_PMADD, 1>;
defm PAVGB   : PDI_binop_all_int<0xE0, "pavgb", int_x86_sse2_pavg_b,
                                 int_x86_avx2_pavg_b, SSE_INTALU_ITINS_P, 1>;
defm PAVGW   : PDI_binop_all_int<0xE3, "pavgw", int_x86_sse2_pavg_w,
                                 int_x86_avx2_pavg_w, SSE_INTALU_ITINS_P, 1>;
// NOTE(review): PSADBW reuses the PMADD itinerary bundle — confirm intended.
defm PSADBW  : PDI_binop_all_int<0xF6, "psadbw", int_x86_sse2_psad_bw,
                                 int_x86_avx2_psad_bw, SSE_PMADD, 1>;

let Predicates = [HasAVX] in
defm VPMULUDQ : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v2i64, v4i32, VR128,
                              loadv2i64, i128mem, SSE_INTMUL_ITINS_P, 1, 0>,
                VEX_4V;
let Predicates = [HasAVX2] in
defm VPMULUDQY : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v4i64, v8i32,
                               VR256, loadv4i64, i256mem,
                               SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L;
let Constraints = "$src1 = $dst" in
defm PMULUDQ : PDI_binop_rm2<0xF4, "pmuludq", X86pmuludq, v2i64, v4i32, VR128,
                             memopv2i64, i128mem, SSE_INTMUL_ITINS_P, 1>;

//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Logical Instructions
//===---------------------------------------------------------------------===//

let Predicates = [HasAVX] in {
defm VPSLLW : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli,
                            VR128, v8i16, v8i16, bc_v8i16,
                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
defm VPSLLD : PDI_binop_rmi<0xF2, 0x72, MRM6r, "vpslld", X86vshl, X86vshli,
                            VR128, v4i32, v4i32, bc_v4i32,
                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
defm VPSLLQ : PDI_binop_rmi<0xF3, 0x73, MRM6r, "vpsllq", X86vshl, X86vshli,
                            VR128, v2i64, v2i64, bc_v2i64,
                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;

defm VPSRLW : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli,
                            VR128, v8i16, v8i16, bc_v8i16,
                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
defm VPSRLD : PDI_binop_rmi<0xD2, 0x72, MRM2r, "vpsrld", X86vsrl, X86vsrli,
                            VR128, v4i32, v4i32, bc_v4i32,
                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
defm VPSRLQ : PDI_binop_rmi<0xD3, 0x73, MRM2r, "vpsrlq", X86vsrl, X86vsrli,
                            VR128, v2i64, v2i64, bc_v2i64,
                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;

defm VPSRAW : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai,
                            VR128, v8i16, v8i16, bc_v8i16,
                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
defm VPSRAD : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai,
                            VR128, v4i32, v4i32, bc_v4i32,
                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;

let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] in {
  // 128-bit logical shifts.
  def VPSLLDQri : PDIi8<0x73, MRM7r,
                    (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
                    "vpslldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set VR128:$dst,
                      (int_x86_sse2_psll_dq_bs VR128:$src1, imm:$src2))]>,
                    VEX_4V;
  def VPSRLDQri : PDIi8<0x73, MRM3r,
                    (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
                    "vpsrldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set VR128:$dst,
                      (int_x86_sse2_psrl_dq_bs VR128:$src1, imm:$src2))]>,
                    VEX_4V;
  // PSRADQri doesn't exist in SSE[1-3].
}
} // Predicates = [HasAVX]

let Predicates = [HasAVX2] in {
defm VPSLLWY : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli,
                             VR256, v16i16, v8i16, bc_v8i16,
                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
defm VPSLLDY : PDI_binop_rmi<0xF2, 0x72, MRM6r, "vpslld", X86vshl, X86vshli,
                             VR256, v8i32, v4i32, bc_v4i32,
                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
defm VPSLLQY : PDI_binop_rmi<0xF3, 0x73, MRM6r, "vpsllq", X86vshl, X86vshli,
                             VR256, v4i64, v2i64, bc_v2i64,
                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;

defm VPSRLWY : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli,
                             VR256, v16i16, v8i16, bc_v8i16,
                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
defm VPSRLDY : PDI_binop_rmi<0xD2, 0x72, MRM2r, "vpsrld", X86vsrl, X86vsrli,
                             VR256, v8i32, v4i32, bc_v4i32,
                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
defm VPSRLQY : PDI_binop_rmi<0xD3, 0x73, MRM2r, "vpsrlq", X86vsrl, X86vsrli,
                             VR256, v4i64, v2i64, bc_v2i64,
                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;

defm VPSRAWY : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai,
                             VR256, v16i16, v8i16, bc_v8i16,
                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
defm VPSRADY : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai,
                             VR256, v8i32, v4i32, bc_v4i32,
                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;

let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] in {
  // 256-bit logical shifts.
  def VPSLLDQYri : PDIi8<0x73, MRM7r,
                     (outs VR256:$dst), (ins VR256:$src1, i32i8imm:$src2),
                     "vpslldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     [(set VR256:$dst,
                       (int_x86_avx2_psll_dq_bs VR256:$src1, imm:$src2))]>,
                     VEX_4V, VEX_L;
  def VPSRLDQYri : PDIi8<0x73, MRM3r,
                     (outs VR256:$dst), (ins VR256:$src1, i32i8imm:$src2),
                     "vpsrldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     [(set VR256:$dst,
                       (int_x86_avx2_psrl_dq_bs VR256:$src1, imm:$src2))]>,
                     VEX_4V, VEX_L;
  // PSRADQYri doesn't exist in SSE[1-3].
}
} // Predicates = [HasAVX2]

let Constraints = "$src1 = $dst" in {
defm PSLLW : PDI_binop_rmi<0xF1, 0x71, MRM6r, "psllw", X86vshl, X86vshli,
                           VR128, v8i16, v8i16, bc_v8i16,
                           SSE_INTSHIFT_ITINS_P>;
defm PSLLD : PDI_binop_rmi<0xF2, 0x72, MRM6r, "pslld", X86vshl, X86vshli,
                           VR128, v4i32, v4i32, bc_v4i32,
                           SSE_INTSHIFT_ITINS_P>;
defm PSLLQ : PDI_binop_rmi<0xF3, 0x73, MRM6r, "psllq", X86vshl, X86vshli,
                           VR128, v2i64, v2i64, bc_v2i64,
                           SSE_INTSHIFT_ITINS_P>;

defm PSRLW : PDI_binop_rmi<0xD1, 0x71, MRM2r, "psrlw", X86vsrl, X86vsrli,
                           VR128, v8i16, v8i16, bc_v8i16,
                           SSE_INTSHIFT_ITINS_P>;
defm PSRLD : PDI_binop_rmi<0xD2, 0x72, MRM2r, "psrld", X86vsrl, X86vsrli,
                           VR128, v4i32, v4i32, bc_v4i32,
                           SSE_INTSHIFT_ITINS_P>;
defm PSRLQ : PDI_binop_rmi<0xD3, 0x73, MRM2r, "psrlq", X86vsrl, X86vsrli,
                           VR128, v2i64, v2i64, bc_v2i64,
                           SSE_INTSHIFT_ITINS_P>;

defm PSRAW : PDI_binop_rmi<0xE1, 0x71, MRM4r, "psraw", X86vsra, X86vsrai,
                           VR128, v8i16, v8i16, bc_v8i16,
                           SSE_INTSHIFT_ITINS_P>;
defm PSRAD : PDI_binop_rmi<0xE2, 0x72, MRM4r, "psrad", X86vsra, X86vsrai,
                           VR128, v4i32, v4i32, bc_v4i32,
                           SSE_INTSHIFT_ITINS_P>;

let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] in {
  // 128-bit logical shifts.
  def PSLLDQri : PDIi8<0x73, MRM7r,
                       (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
                       "pslldq\t{$src2, $dst|$dst, $src2}",
                       [(set VR128:$dst,
                         (int_x86_sse2_psll_dq_bs VR128:$src1, imm:$src2))],
                       IIC_SSE_INTSHDQ_P_RI>;
  def PSRLDQri : PDIi8<0x73, MRM3r,
                       (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
                       "psrldq\t{$src2, $dst|$dst, $src2}",
                       [(set VR128:$dst,
                         (int_x86_sse2_psrl_dq_bs VR128:$src1, imm:$src2))],
                       IIC_SSE_INTSHDQ_P_RI>;
  // PSRADQri doesn't exist in SSE[1-3].
}
} // Constraints = "$src1 = $dst"

// Map the bit-count pslldq/psrldq intrinsics onto the byte-count _bs
// instructions; BYTE_imm converts the bit count to bytes.
let Predicates = [HasAVX] in {
  def : Pat<(int_x86_sse2_psll_dq VR128:$src1, imm:$src2),
            (VPSLLDQri VR128:$src1, (BYTE_imm imm:$src2))>;
  def : Pat<(int_x86_sse2_psrl_dq VR128:$src1, imm:$src2),
            (VPSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>;
  def : Pat<(v2f64 (X86fsrl VR128:$src1, i32immSExt8:$src2)),
            (VPSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>;

  // Shift up / down and insert zero's.
  def : Pat<(v2i64 (X86vshldq VR128:$src, (i8 imm:$amt))),
            (VPSLLDQri VR128:$src, (BYTE_imm imm:$amt))>;
  def : Pat<(v2i64 (X86vshrdq VR128:$src, (i8 imm:$amt))),
            (VPSRLDQri VR128:$src, (BYTE_imm imm:$amt))>;
}

let Predicates = [HasAVX2] in {
  def : Pat<(int_x86_avx2_psll_dq VR256:$src1, imm:$src2),
            (VPSLLDQYri VR256:$src1, (BYTE_imm imm:$src2))>;
  def : Pat<(int_x86_avx2_psrl_dq VR256:$src1, imm:$src2),
            (VPSRLDQYri VR256:$src1, (BYTE_imm imm:$src2))>;
}

let Predicates = [UseSSE2] in {
  def : Pat<(int_x86_sse2_psll_dq VR128:$src1, imm:$src2),
            (PSLLDQri VR128:$src1, (BYTE_imm imm:$src2))>;
  def : Pat<(int_x86_sse2_psrl_dq VR128:$src1, imm:$src2),
            (PSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>;
  def : Pat<(v2f64 (X86fsrl VR128:$src1, i32immSExt8:$src2)),
            (PSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>;

  // Shift up / down and insert zero's.
  def : Pat<(v2i64 (X86vshldq VR128:$src, (i8 imm:$amt))),
            (PSLLDQri VR128:$src, (BYTE_imm imm:$amt))>;
  def : Pat<(v2i64 (X86vshrdq VR128:$src, (i8 imm:$amt))),
            (PSRLDQri VR128:$src, (BYTE_imm imm:$amt))>;
}

//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Comparison Instructions
//===---------------------------------------------------------------------===//

defm PCMPEQB : PDI_binop_all<0x74, "pcmpeqb", X86pcmpeq, v16i8, v32i8,
                             SSE_INTALU_ITINS_P, 1>;
defm PCMPEQW : PDI_binop_all<0x75, "pcmpeqw", X86pcmpeq, v8i16, v16i16,
                             SSE_INTALU_ITINS_P, 1>;
defm PCMPEQD : PDI_binop_all<0x76, "pcmpeqd", X86pcmpeq, v4i32, v8i32,
                             SSE_INTALU_ITINS_P, 1>;
defm PCMPGTB : PDI_binop_all<0x64, "pcmpgtb", X86pcmpgt, v16i8, v32i8,
                             SSE_INTALU_ITINS_P, 0>;
defm PCMPGTW : PDI_binop_all<0x65, "pcmpgtw", X86pcmpgt, v8i16, v16i16,
                             SSE_INTALU_ITINS_P, 0>;
defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32,
                             SSE_INTALU_ITINS_P, 0>;

//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Shuffle Instructions
//===---------------------------------------------------------------------===//

let ExeDomain = SSEPackedInt in {
/// sse2_pshuffle - pshufd/pshufhw/pshuflw-style full shuffles, selected via
/// OpNode with the control immediate; VEX 128/256-bit and SSE 128-bit forms.
multiclass sse2_pshuffle<string OpcodeStr, ValueType vt128, ValueType vt256,
                         SDNode OpNode> {
let Predicates = [HasAVX] in {
  def V#NAME#ri : Ii8<0x70, MRMSrcReg, (outs VR128:$dst),
                      (ins VR128:$src1, i8imm:$src2),
                      !strconcat("v", OpcodeStr,
                                 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                      [(set VR128:$dst,
                        (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))],
                      IIC_SSE_PSHUF_RI>, VEX, Sched<[WriteShuffle]>;
  def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst),
                      (ins i128mem:$src1, i8imm:$src2),
                      !strconcat("v", OpcodeStr,
                                 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                      [(set VR128:$dst,
                        (vt128 (OpNode (bitconvert (loadv2i64 addr:$src1)),
                                       (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>, VEX,
                  Sched<[WriteShuffleLd]>;
}

let Predicates = [HasAVX2] in {
  def V#NAME#Yri : Ii8<0x70, MRMSrcReg, (outs VR256:$dst),
                       (ins VR256:$src1, i8imm:$src2),
                       !strconcat("v", OpcodeStr,
                                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                       [(set VR256:$dst,
                         (vt256 (OpNode VR256:$src1, (i8 imm:$src2))))],
                       IIC_SSE_PSHUF_RI>, VEX, VEX_L, Sched<[WriteShuffle]>;
  def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst),
                       (ins i256mem:$src1, i8imm:$src2),
                       !strconcat("v", OpcodeStr,
                                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                       [(set VR256:$dst,
                         (vt256 (OpNode (bitconvert (loadv4i64 addr:$src1)),
                                        (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>, VEX, VEX_L,
                   Sched<[WriteShuffleLd]>;
}

let Predicates = [UseSSE2] in {
  def ri : Ii8<0x70, MRMSrcReg,
               (outs VR128:$dst), (ins VR128:$src1, i8imm:$src2),
               !strconcat(OpcodeStr,
                          "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
               [(set VR128:$dst,
                 (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))],
               IIC_SSE_PSHUF_RI>, Sched<[WriteShuffle]>;
  // NOTE(review): this mi form has no register source, so the ReadAfterLd
  // below looks vestigial — confirm against the scheduler model.
  def mi : Ii8<0x70, MRMSrcMem,
               (outs VR128:$dst), (ins i128mem:$src1, i8imm:$src2),
               !strconcat(OpcodeStr,
                          "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
               [(set VR128:$dst,
                 (vt128 (OpNode (bitconvert (memopv2i64 addr:$src1)),
                                (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>,
           Sched<[WriteShuffleLd, ReadAfterLd]>;
}
}
} // ExeDomain = SSEPackedInt

defm PSHUFD  : sse2_pshuffle<"pshufd", v4i32, v8i32, X86PShufd>, PD;
defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw>, XS;
defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw>, XD;

let Predicates = [HasAVX] in {
  def : Pat<(v4f32 (X86PShufd (loadv4f32 addr:$src1), (i8 imm:$imm))),
            (VPSHUFDmi addr:$src1, imm:$imm)>;
  def : Pat<(v4f32
                    (X86PShufd VR128:$src1, (i8 imm:$imm))),
            (VPSHUFDri VR128:$src1, imm:$imm)>;
}

let Predicates = [UseSSE2] in {
  def : Pat<(v4f32 (X86PShufd (memopv4f32 addr:$src1), (i8 imm:$imm))),
            (PSHUFDmi addr:$src1, imm:$imm)>;
  def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))),
            (PSHUFDri VR128:$src1, imm:$imm)>;
}

//===---------------------------------------------------------------------===//
// Packed Integer Pack Instructions (SSE & AVX)
//===---------------------------------------------------------------------===//

let ExeDomain = SSEPackedInt in {
// 128-bit pack-with-saturation: rr and rm forms; Is2Addr selects the
// two-address SSE asm string vs. the three-operand AVX one.
multiclass sse2_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
                     ValueType ArgVT, SDNode OpNode, PatFrag bc_frag,
                     bit Is2Addr = 1> {
  def rr : PDI<opc, MRMSrcReg,
               (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
               !if(Is2Addr,
                   !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
               [(set VR128:$dst,
                     (OutVT (OpNode (ArgVT VR128:$src1), VR128:$src2)))]>,
               Sched<[WriteShuffle]>;
  def rm : PDI<opc, MRMSrcMem,
               (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
               !if(Is2Addr,
                   !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
               [(set VR128:$dst,
                     (OutVT (OpNode VR128:$src1,
                                    (bc_frag (memopv2i64 addr:$src2)))))]>,
               Sched<[WriteShuffleLd, ReadAfterLd]>;
}

// 256-bit (AVX2) variant of the above; always three-operand.
multiclass sse2_pack_y<bits<8> opc, string OpcodeStr, ValueType OutVT,
                       ValueType ArgVT, SDNode OpNode, PatFrag bc_frag> {
  def Yrr : PDI<opc, MRMSrcReg,
                (outs VR256:$dst), (ins VR256:$src1, VR256:$src2),
                !strconcat(OpcodeStr,
                           "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                [(set VR256:$dst,
                      (OutVT (OpNode (ArgVT VR256:$src1), VR256:$src2)))]>,
                Sched<[WriteShuffle]>;
  def Yrm : PDI<opc, MRMSrcMem,
                (outs VR256:$dst),
                (ins VR256:$src1, i256mem:$src2),
                !strconcat(OpcodeStr,
                           "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                [(set VR256:$dst,
                      (OutVT (OpNode VR256:$src1,
                                     (bc_frag (memopv4i64 addr:$src2)))))]>,
                Sched<[WriteShuffleLd, ReadAfterLd]>;
}

// SSE4.1 packusdw uses the SS48I (0F 38) encoding but the same shape.
multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
                     ValueType ArgVT, SDNode OpNode, PatFrag bc_frag,
                     bit Is2Addr = 1> {
  def rr : SS48I<opc, MRMSrcReg,
                 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                 !if(Is2Addr,
                     !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                     !strconcat(OpcodeStr,
                                "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                 [(set VR128:$dst,
                       (OutVT (OpNode (ArgVT VR128:$src1), VR128:$src2)))]>,
                 Sched<[WriteShuffle]>;
  def rm : SS48I<opc, MRMSrcMem,
                 (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
                 !if(Is2Addr,
                     !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                     !strconcat(OpcodeStr,
                                "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                 [(set VR128:$dst,
                       (OutVT (OpNode VR128:$src1,
                                      (bc_frag (memopv2i64 addr:$src2)))))]>,
                 Sched<[WriteShuffleLd, ReadAfterLd]>;
}

multiclass sse4_pack_y<bits<8> opc, string OpcodeStr, ValueType OutVT,
                       ValueType ArgVT, SDNode OpNode, PatFrag bc_frag> {
  def Yrr : SS48I<opc, MRMSrcReg,
                  (outs VR256:$dst), (ins VR256:$src1, VR256:$src2),
                  !strconcat(OpcodeStr,
                             "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  [(set VR256:$dst,
                        (OutVT (OpNode (ArgVT VR256:$src1), VR256:$src2)))]>,
                  Sched<[WriteShuffle]>;
  def Yrm : SS48I<opc, MRMSrcMem,
                  (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2),
                  !strconcat(OpcodeStr,
                             "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  [(set VR256:$dst,
                        (OutVT (OpNode VR256:$src1,
                                       (bc_frag (memopv4i64 addr:$src2)))))]>,
                  Sched<[WriteShuffleLd, ReadAfterLd]>;
}

let Predicates = [HasAVX] in {
  defm VPACKSSWB : sse2_pack<0x63,
"vpacksswb", v16i8, v8i16, X86Packss, 4678 bc_v8i16, 0>, VEX_4V; 4679 defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss, 4680 bc_v4i32, 0>, VEX_4V; 4681 4682 defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus, 4683 bc_v8i16, 0>, VEX_4V; 4684 defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus, 4685 bc_v4i32, 0>, VEX_4V; 4686} 4687 4688let Predicates = [HasAVX2] in { 4689 defm VPACKSSWB : sse2_pack_y<0x63, "vpacksswb", v32i8, v16i16, X86Packss, 4690 bc_v16i16>, VEX_4V, VEX_L; 4691 defm VPACKSSDW : sse2_pack_y<0x6B, "vpackssdw", v16i16, v8i32, X86Packss, 4692 bc_v8i32>, VEX_4V, VEX_L; 4693 4694 defm VPACKUSWB : sse2_pack_y<0x67, "vpackuswb", v32i8, v16i16, X86Packus, 4695 bc_v16i16>, VEX_4V, VEX_L; 4696 defm VPACKUSDW : sse4_pack_y<0x2B, "vpackusdw", v16i16, v8i32, X86Packus, 4697 bc_v8i32>, VEX_4V, VEX_L; 4698} 4699 4700let Constraints = "$src1 = $dst" in { 4701 defm PACKSSWB : sse2_pack<0x63, "packsswb", v16i8, v8i16, X86Packss, 4702 bc_v8i16>; 4703 defm PACKSSDW : sse2_pack<0x6B, "packssdw", v8i16, v4i32, X86Packss, 4704 bc_v4i32>; 4705 4706 defm PACKUSWB : sse2_pack<0x67, "packuswb", v16i8, v8i16, X86Packus, 4707 bc_v8i16>; 4708 4709 let Predicates = [HasSSE41] in 4710 defm PACKUSDW : sse4_pack<0x2B, "packusdw", v8i16, v4i32, X86Packus, 4711 bc_v4i32>; 4712} 4713} // ExeDomain = SSEPackedInt 4714 4715//===---------------------------------------------------------------------===// 4716// SSE2 - Packed Integer Unpack Instructions 4717//===---------------------------------------------------------------------===// 4718 4719let ExeDomain = SSEPackedInt in { 4720multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt, 4721 SDNode OpNode, PatFrag bc_frag, bit Is2Addr = 1> { 4722 def rr : PDI<opc, MRMSrcReg, 4723 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), 4724 !if(Is2Addr, 4725 !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"), 4726 !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, 
$src2}")), 4727 [(set VR128:$dst, (vt (OpNode VR128:$src1, VR128:$src2)))], 4728 IIC_SSE_UNPCK>, Sched<[WriteShuffle]>; 4729 def rm : PDI<opc, MRMSrcMem, 4730 (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), 4731 !if(Is2Addr, 4732 !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"), 4733 !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 4734 [(set VR128:$dst, (OpNode VR128:$src1, 4735 (bc_frag (memopv2i64 4736 addr:$src2))))], 4737 IIC_SSE_UNPCK>, 4738 Sched<[WriteShuffleLd, ReadAfterLd]>; 4739} 4740 4741multiclass sse2_unpack_y<bits<8> opc, string OpcodeStr, ValueType vt, 4742 SDNode OpNode, PatFrag bc_frag> { 4743 def Yrr : PDI<opc, MRMSrcReg, 4744 (outs VR256:$dst), (ins VR256:$src1, VR256:$src2), 4745 !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 4746 [(set VR256:$dst, (vt (OpNode VR256:$src1, VR256:$src2)))]>, 4747 Sched<[WriteShuffle]>; 4748 def Yrm : PDI<opc, MRMSrcMem, 4749 (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2), 4750 !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 4751 [(set VR256:$dst, (OpNode VR256:$src1, 4752 (bc_frag (memopv4i64 addr:$src2))))]>, 4753 Sched<[WriteShuffleLd, ReadAfterLd]>; 4754} 4755 4756let Predicates = [HasAVX] in { 4757 defm VPUNPCKLBW : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl, 4758 bc_v16i8, 0>, VEX_4V; 4759 defm VPUNPCKLWD : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl, 4760 bc_v8i16, 0>, VEX_4V; 4761 defm VPUNPCKLDQ : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl, 4762 bc_v4i32, 0>, VEX_4V; 4763 defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl, 4764 bc_v2i64, 0>, VEX_4V; 4765 4766 defm VPUNPCKHBW : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh, 4767 bc_v16i8, 0>, VEX_4V; 4768 defm VPUNPCKHWD : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh, 4769 bc_v8i16, 0>, VEX_4V; 4770 defm VPUNPCKHDQ : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh, 4771 bc_v4i32, 0>, VEX_4V; 4772 defm VPUNPCKHQDQ : sse2_unpack<0x6D, 
"vpunpckhqdq", v2i64, X86Unpckh, 4773 bc_v2i64, 0>, VEX_4V; 4774} 4775 4776let Predicates = [HasAVX2] in { 4777 defm VPUNPCKLBW : sse2_unpack_y<0x60, "vpunpcklbw", v32i8, X86Unpckl, 4778 bc_v32i8>, VEX_4V, VEX_L; 4779 defm VPUNPCKLWD : sse2_unpack_y<0x61, "vpunpcklwd", v16i16, X86Unpckl, 4780 bc_v16i16>, VEX_4V, VEX_L; 4781 defm VPUNPCKLDQ : sse2_unpack_y<0x62, "vpunpckldq", v8i32, X86Unpckl, 4782 bc_v8i32>, VEX_4V, VEX_L; 4783 defm VPUNPCKLQDQ : sse2_unpack_y<0x6C, "vpunpcklqdq", v4i64, X86Unpckl, 4784 bc_v4i64>, VEX_4V, VEX_L; 4785 4786 defm VPUNPCKHBW : sse2_unpack_y<0x68, "vpunpckhbw", v32i8, X86Unpckh, 4787 bc_v32i8>, VEX_4V, VEX_L; 4788 defm VPUNPCKHWD : sse2_unpack_y<0x69, "vpunpckhwd", v16i16, X86Unpckh, 4789 bc_v16i16>, VEX_4V, VEX_L; 4790 defm VPUNPCKHDQ : sse2_unpack_y<0x6A, "vpunpckhdq", v8i32, X86Unpckh, 4791 bc_v8i32>, VEX_4V, VEX_L; 4792 defm VPUNPCKHQDQ : sse2_unpack_y<0x6D, "vpunpckhqdq", v4i64, X86Unpckh, 4793 bc_v4i64>, VEX_4V, VEX_L; 4794} 4795 4796let Constraints = "$src1 = $dst" in { 4797 defm PUNPCKLBW : sse2_unpack<0x60, "punpcklbw", v16i8, X86Unpckl, 4798 bc_v16i8>; 4799 defm PUNPCKLWD : sse2_unpack<0x61, "punpcklwd", v8i16, X86Unpckl, 4800 bc_v8i16>; 4801 defm PUNPCKLDQ : sse2_unpack<0x62, "punpckldq", v4i32, X86Unpckl, 4802 bc_v4i32>; 4803 defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Unpckl, 4804 bc_v2i64>; 4805 4806 defm PUNPCKHBW : sse2_unpack<0x68, "punpckhbw", v16i8, X86Unpckh, 4807 bc_v16i8>; 4808 defm PUNPCKHWD : sse2_unpack<0x69, "punpckhwd", v8i16, X86Unpckh, 4809 bc_v8i16>; 4810 defm PUNPCKHDQ : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Unpckh, 4811 bc_v4i32>; 4812 defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Unpckh, 4813 bc_v2i64>; 4814} 4815} // ExeDomain = SSEPackedInt 4816 4817//===---------------------------------------------------------------------===// 4818// SSE2 - Packed Integer Extract and Insert 4819//===---------------------------------------------------------------------===// 4820 4821let 
ExeDomain = SSEPackedInt in { 4822multiclass sse2_pinsrw<bit Is2Addr = 1> { 4823 def rri : Ii8<0xC4, MRMSrcReg, 4824 (outs VR128:$dst), (ins VR128:$src1, 4825 GR32orGR64:$src2, i32i8imm:$src3), 4826 !if(Is2Addr, 4827 "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}", 4828 "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), 4829 [(set VR128:$dst, 4830 (X86pinsrw VR128:$src1, GR32orGR64:$src2, imm:$src3))], 4831 IIC_SSE_PINSRW>, Sched<[WriteShuffle]>; 4832 def rmi : Ii8<0xC4, MRMSrcMem, 4833 (outs VR128:$dst), (ins VR128:$src1, 4834 i16mem:$src2, i32i8imm:$src3), 4835 !if(Is2Addr, 4836 "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}", 4837 "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), 4838 [(set VR128:$dst, 4839 (X86pinsrw VR128:$src1, (extloadi16 addr:$src2), 4840 imm:$src3))], IIC_SSE_PINSRW>, 4841 Sched<[WriteShuffleLd, ReadAfterLd]>; 4842} 4843 4844// Extract 4845let Predicates = [HasAVX] in 4846def VPEXTRWri : Ii8<0xC5, MRMSrcReg, 4847 (outs GR32orGR64:$dst), (ins VR128:$src1, i32i8imm:$src2), 4848 "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", 4849 [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1), 4850 imm:$src2))]>, PD, VEX, 4851 Sched<[WriteShuffle]>; 4852def PEXTRWri : PDIi8<0xC5, MRMSrcReg, 4853 (outs GR32orGR64:$dst), (ins VR128:$src1, i32i8imm:$src2), 4854 "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", 4855 [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1), 4856 imm:$src2))], IIC_SSE_PEXTRW>, 4857 Sched<[WriteShuffleLd, ReadAfterLd]>; 4858 4859// Insert 4860let Predicates = [HasAVX] in 4861defm VPINSRW : sse2_pinsrw<0>, PD, VEX_4V; 4862 4863let Predicates = [UseSSE2], Constraints = "$src1 = $dst" in 4864defm PINSRW : sse2_pinsrw, PD; 4865 4866} // ExeDomain = SSEPackedInt 4867 4868//===---------------------------------------------------------------------===// 4869// SSE2 - Packed Mask Creation 4870//===---------------------------------------------------------------------===// 4871 4872let ExeDomain = 
    SSEPackedInt, SchedRW = [WriteVecLogic] in {

// pmovmskb: collect the sign bits of each byte lane into a GPR.
def VPMOVMSKBrr  : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
                        (ins VR128:$src),
           "pmovmskb\t{$src, $dst|$dst, $src}",
           [(set GR32orGR64:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))],
           IIC_SSE_MOVMSK>, VEX;

let Predicates = [HasAVX2] in {
def VPMOVMSKBYrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
                        (ins VR256:$src),
           "pmovmskb\t{$src, $dst|$dst, $src}",
           [(set GR32orGR64:$dst, (int_x86_avx2_pmovmskb VR256:$src))]>,
           VEX, VEX_L;
}

def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src),
           "pmovmskb\t{$src, $dst|$dst, $src}",
           [(set GR32orGR64:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))],
           IIC_SSE_MOVMSK>;

} // ExeDomain = SSEPackedInt

//===---------------------------------------------------------------------===//
// SSE2 - Conditional Store
//===---------------------------------------------------------------------===//

// maskmovdqu stores to the address held implicitly in EDI/RDI, hence the
// separate 32-bit and 64-bit mode definitions with explicit Uses.
let ExeDomain = SSEPackedInt, SchedRW = [WriteStore] in {

let Uses = [EDI], Predicates = [HasAVX,Not64BitMode] in
def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs),
           (ins VR128:$src, VR128:$mask),
           "maskmovdqu\t{$mask, $src|$src, $mask}",
           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)],
           IIC_SSE_MASKMOV>, VEX;
let Uses = [RDI], Predicates = [HasAVX,In64BitMode] in
def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs),
           (ins VR128:$src, VR128:$mask),
           "maskmovdqu\t{$mask, $src|$src, $mask}",
           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)],
           IIC_SSE_MASKMOV>, VEX;

let Uses = [EDI], Predicates = [UseSSE2,Not64BitMode] in
def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
           "maskmovdqu\t{$mask, $src|$src, $mask}",
           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)],
           IIC_SSE_MASKMOV>;
let Uses = [RDI], Predicates = [UseSSE2,In64BitMode] in
def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
           "maskmovdqu\t{$mask, $src|$src, $mask}",
           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)],
           IIC_SSE_MASKMOV>;

} // ExeDomain = SSEPackedInt

//===---------------------------------------------------------------------===//
// SSE2 - Move Doubleword
//===---------------------------------------------------------------------===//

//===---------------------------------------------------------------------===//
// Move Int Doubleword to Packed Double Int
//
def VMOVDI2PDIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>,
                      VEX, Sched<[WriteMove]>;
def VMOVDI2PDIrm : VS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (scalar_to_vector (loadi32 addr:$src))))],
                      IIC_SSE_MOVDQ>,
                      VEX, Sched<[WriteLoad]>;
def VMOV64toPQIrr : VRS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
                        "movq\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (v2i64 (scalar_to_vector GR64:$src)))],
                        IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
let isCodeGenOnly = 1 in
def VMOV64toSDrr : VRS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
                       "movq\t{$src, $dst|$dst, $src}",
                       [(set FR64:$dst, (bitconvert GR64:$src))],
                       IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;

def MOVDI2PDIrr : S2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>,
                      Sched<[WriteMove]>;
def MOVDI2PDIrm : S2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (scalar_to_vector (loadi32 addr:$src))))],
                      IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
def MOV64toPQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
                        "mov{d|q}\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (v2i64 (scalar_to_vector GR64:$src)))],
                        IIC_SSE_MOVDQ>, Sched<[WriteMove]>;
let isCodeGenOnly = 1 in
def MOV64toSDrr : RS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
                       "mov{d|q}\t{$src, $dst|$dst, $src}",
                       [(set FR64:$dst, (bitconvert GR64:$src))],
                       IIC_SSE_MOVDQ>, Sched<[WriteMove]>;

//===---------------------------------------------------------------------===//
// Move Int Doubleword to Single Scalar
//
let isCodeGenOnly = 1 in {
  def VMOVDI2SSrr  : VS2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set FR32:$dst, (bitconvert GR32:$src))],
                        IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;

  def VMOVDI2SSrm  : VS2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))],
                        IIC_SSE_MOVDQ>,
                        VEX, Sched<[WriteLoad]>;
  def MOVDI2SSrr  : S2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set FR32:$dst, (bitconvert GR32:$src))],
                        IIC_SSE_MOVDQ>, Sched<[WriteMove]>;

  def MOVDI2SSrm  : S2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))],
                        IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
}

//===---------------------------------------------------------------------===//
// Move Packed Doubleword Int to Packed Double Int
//
def VMOVPDI2DIrr  : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(set GR32:$dst, (vector_extract (v4i32 VR128:$src),
                                        (iPTR 0)))], IIC_SSE_MOVD_ToGP>, VEX,
                       Sched<[WriteMove]>;
def VMOVPDI2DImr  : VS2I<0x7E, MRMDestMem,
                         (outs),
                         (ins i32mem:$dst, VR128:$src),
                         "movd\t{$src, $dst|$dst, $src}",
                         [(store (i32 (vector_extract (v4i32 VR128:$src),
                                       (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>,
                         VEX, Sched<[WriteStore]>;
def MOVPDI2DIrr  : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(set GR32:$dst, (vector_extract (v4i32 VR128:$src),
                                        (iPTR 0)))], IIC_SSE_MOVD_ToGP>,
                       Sched<[WriteMove]>;
def MOVPDI2DImr  : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(store (i32 (vector_extract (v4i32 VR128:$src),
                                     (iPTR 0))), addr:$dst)],
                       IIC_SSE_MOVDQ>, Sched<[WriteStore]>;

// Insert a GPR into element 0 of a wide (256-bit) vector via the 128-bit
// movd/movq; SUBREG_TO_REG widens the result.
def : Pat<(v8i32 (X86Vinsert (v8i32 immAllZerosV), GR32:$src2, (iPTR 0))),
        (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src2), sub_xmm)>;

def : Pat<(v4i64 (X86Vinsert (bc_v4i64 (v8i32 immAllZerosV)), GR64:$src2, (iPTR 0))),
        (SUBREG_TO_REG (i32 0), (VMOV64toPQIrr GR64:$src2), sub_xmm)>;

def : Pat<(v8i32 (X86Vinsert undef, GR32:$src2, (iPTR 0))),
        (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src2), sub_xmm)>;

def : Pat<(v4i64 (X86Vinsert undef, GR64:$src2, (iPTR 0))),
        (SUBREG_TO_REG (i32 0), (VMOV64toPQIrr GR64:$src2), sub_xmm)>;

//===---------------------------------------------------------------------===//
// Move Packed Doubleword Int first element to Doubleword Int
//
let SchedRW = [WriteMove] in {
def VMOVPQIto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
                          "movq\t{$src, $dst|$dst, $src}",
                          [(set GR64:$dst, (vector_extract (v2i64 VR128:$src),
                                                           (iPTR 0)))],
                          IIC_SSE_MOVD_ToGP>,
                          VEX;

def MOVPQIto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
                        "mov{d|q}\t{$src, $dst|$dst, $src}",
                        [(set GR64:$dst, (vector_extract (v2i64 VR128:$src),
                                                         (iPTR 0)))],
                        IIC_SSE_MOVD_ToGP>;
} //SchedRW
//===---------------------------------------------------------------------===//
// Bitcast FR64 <-> GR64
//
let isCodeGenOnly = 1 in {
  let Predicates = [UseAVX] in
  def VMOV64toSDrm : VS2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
                          "movq\t{$src, $dst|$dst, $src}",
                          [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>,
                          VEX, Sched<[WriteLoad]>;
  def VMOVSDto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
                           "movq\t{$src, $dst|$dst, $src}",
                           [(set GR64:$dst, (bitconvert FR64:$src))],
                           IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
  def VMOVSDto64mr : VRS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
                           "movq\t{$src, $dst|$dst, $src}",
                           [(store (i64 (bitconvert FR64:$src)), addr:$dst)],
                           IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>;

  def MOV64toSDrm : S2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
                         "movq\t{$src, $dst|$dst, $src}",
                         [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))],
                         IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
  def MOVSDto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
                         "mov{d|q}\t{$src, $dst|$dst, $src}",
                         [(set GR64:$dst, (bitconvert FR64:$src))],
                         IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>;
  def MOVSDto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
                         "movq\t{$src, $dst|$dst, $src}",
                         [(store (i64 (bitconvert FR64:$src)), addr:$dst)],
                         IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
}

//===---------------------------------------------------------------------===//
// Move Scalar Single to Double Int
//
let isCodeGenOnly = 1 in {
  def VMOVSS2DIrr  : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set GR32:$dst, (bitconvert FR32:$src))],
                        IIC_SSE_MOVD_ToGP>, VEX, Sched<[WriteMove]>;
  def VMOVSS2DImr  : VS2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(store (i32 (bitconvert FR32:$src)), addr:$dst)],
                        IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>;
  def MOVSS2DIrr  : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(set GR32:$dst, (bitconvert FR32:$src))],
                       IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>;
  def MOVSS2DImr  : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(store (i32 (bitconvert FR32:$src)), addr:$dst)],
                       IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
}

//===---------------------------------------------------------------------===//
// Patterns and instructions to describe movd/movq to XMM register zero-extends
//
let isCodeGenOnly = 1, SchedRW = [WriteMove] in {
let AddedComplexity = 15 in {
def VMOVZQI2PQIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
                       "movq\t{$src, $dst|$dst, $src}", // X86-64 only
                       [(set VR128:$dst, (v2i64 (X86vzmovl
                                      (v2i64 (scalar_to_vector GR64:$src)))))],
                       IIC_SSE_MOVDQ>,
                       VEX, VEX_W;
def MOVZQI2PQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
                       "mov{d|q}\t{$src, $dst|$dst, $src}", // X86-64 only
                       [(set VR128:$dst, (v2i64 (X86vzmovl
                                      (v2i64 (scalar_to_vector GR64:$src)))))],
                       IIC_SSE_MOVDQ>;
}
} // isCodeGenOnly, SchedRW

let Predicates = [UseAVX] in {
  let AddedComplexity = 15 in
    def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
              (VMOVDI2PDIrr GR32:$src)>;

  // AVX 128-bit movd/movq instruction write zeros in the high 128-bit part.
  let AddedComplexity = 20 in {
    def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
              (VMOVDI2PDIrm addr:$src)>;
    def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
              (VMOVDI2PDIrm addr:$src)>;
    def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
              (VMOVDI2PDIrm addr:$src)>;
  }
  // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext.
  def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
                               (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
            (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src), sub_xmm)>;
  def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
                               (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
            (SUBREG_TO_REG (i64 0), (VMOVZQI2PQIrr GR64:$src), sub_xmm)>;
}

// SSE2-only equivalents of the zero-extending movd patterns above.
let Predicates = [UseSSE2] in {
  let AddedComplexity = 15 in
    def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
              (MOVDI2PDIrr GR32:$src)>;

  let AddedComplexity = 20 in {
    def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
              (MOVDI2PDIrm addr:$src)>;
    def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
              (MOVDI2PDIrm addr:$src)>;
    def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
              (MOVDI2PDIrm addr:$src)>;
  }
}

// These are the correct encodings of the instructions so that we know how to
// read correct assembly, even though we continue to emit the wrong ones for
// compatibility with Darwin's buggy assembler.
def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
                (MOV64toPQIrr VR128:$dst, GR64:$src), 0>;
def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
                (MOVPQIto64rr GR64:$dst, VR128:$src), 0>;
// Allow "vmovd" but print "vmovq" since we don't need compatibility for AVX.
def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
                (VMOV64toPQIrr VR128:$dst, GR64:$src), 0>;
def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
                (VMOVPQIto64rr GR64:$dst, VR128:$src), 0>;

//===---------------------------------------------------------------------===//
// SSE2 - Move Quadword
//===---------------------------------------------------------------------===//

//===---------------------------------------------------------------------===//
// Move Quadword Int to Packed Quadword Int
//

let SchedRW = [WriteLoad] in {
def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                    "vmovq\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst,
                      (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS,
                    VEX, Requires<[UseAVX]>;
def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                    "movq\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst,
                      (v2i64 (scalar_to_vector (loadi64 addr:$src))))],
                    IIC_SSE_MOVDQ>, XS,
                    Requires<[UseSSE2]>; // SSE2 instruction with XS Prefix
} // SchedRW

//===---------------------------------------------------------------------===//
// Move Packed Quadword Int to Quadword Int
//
let SchedRW = [WriteStore] in {
def VMOVPQI2QImr : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
                      "movq\t{$src, $dst|$dst, $src}",
                      [(store (i64 (vector_extract (v2i64 VR128:$src),
                                    (iPTR 0))), addr:$dst)],
                      IIC_SSE_MOVDQ>, VEX;
def MOVPQI2QImr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
                      "movq\t{$src, $dst|$dst, $src}",
                      [(store (i64 (vector_extract (v2i64 VR128:$src),
                                    (iPTR 0))), addr:$dst)],
                      IIC_SSE_MOVDQ>;
} // SchedRW

// For disassembler only
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
    SchedRW = [WriteVecLogic] in {
def VMOVPQI2QIrr : VS2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
5223 "movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVQ_RR>, VEX; 5224def MOVPQI2QIrr : S2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), 5225 "movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVQ_RR>; 5226} 5227 5228//===---------------------------------------------------------------------===// 5229// Store / copy lower 64-bits of a XMM register. 5230// 5231let Predicates = [UseAVX] in 5232def : Pat<(int_x86_sse2_storel_dq addr:$dst, VR128:$src), 5233 (VMOVPQI2QImr addr:$dst, VR128:$src)>; 5234let Predicates = [UseSSE2] in 5235def : Pat<(int_x86_sse2_storel_dq addr:$dst, VR128:$src), 5236 (MOVPQI2QImr addr:$dst, VR128:$src)>; 5237 5238let isCodeGenOnly = 1, AddedComplexity = 20 in { 5239def VMOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), 5240 "vmovq\t{$src, $dst|$dst, $src}", 5241 [(set VR128:$dst, 5242 (v2i64 (X86vzmovl (v2i64 (scalar_to_vector 5243 (loadi64 addr:$src))))))], 5244 IIC_SSE_MOVDQ>, 5245 XS, VEX, Requires<[UseAVX]>, Sched<[WriteLoad]>; 5246 5247def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), 5248 "movq\t{$src, $dst|$dst, $src}", 5249 [(set VR128:$dst, 5250 (v2i64 (X86vzmovl (v2i64 (scalar_to_vector 5251 (loadi64 addr:$src))))))], 5252 IIC_SSE_MOVDQ>, 5253 XS, Requires<[UseSSE2]>, Sched<[WriteLoad]>; 5254} 5255 5256let Predicates = [UseAVX], AddedComplexity = 20 in { 5257 def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))), 5258 (VMOVZQI2PQIrm addr:$src)>; 5259 def : Pat<(v2i64 (X86vzload addr:$src)), 5260 (VMOVZQI2PQIrm addr:$src)>; 5261} 5262 5263let Predicates = [UseSSE2], AddedComplexity = 20 in { 5264 def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))), 5265 (MOVZQI2PQIrm addr:$src)>; 5266 def : Pat<(v2i64 (X86vzload addr:$src)), (MOVZQI2PQIrm addr:$src)>; 5267} 5268 5269let Predicates = [HasAVX] in { 5270def : Pat<(v4i64 (alignedX86vzload addr:$src)), 5271 (SUBREG_TO_REG (i32 0), (VMOVAPSrm addr:$src), sub_xmm)>; 5272def : Pat<(v4i64 (X86vzload addr:$src)), 5273 
(SUBREG_TO_REG (i32 0), (VMOVUPSrm addr:$src), sub_xmm)>; 5274} 5275 5276//===---------------------------------------------------------------------===// 5277// Moving from XMM to XMM and clear upper 64 bits. Note, there is a bug in 5278// IA32 document. movq xmm1, xmm2 does clear the high bits. 5279// 5280let SchedRW = [WriteVecLogic] in { 5281let AddedComplexity = 15 in 5282def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 5283 "vmovq\t{$src, $dst|$dst, $src}", 5284 [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))], 5285 IIC_SSE_MOVQ_RR>, 5286 XS, VEX, Requires<[UseAVX]>; 5287let AddedComplexity = 15 in 5288def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 5289 "movq\t{$src, $dst|$dst, $src}", 5290 [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))], 5291 IIC_SSE_MOVQ_RR>, 5292 XS, Requires<[UseSSE2]>; 5293} // SchedRW 5294 5295let isCodeGenOnly = 1, SchedRW = [WriteVecLogicLd] in { 5296let AddedComplexity = 20 in 5297def VMOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), 5298 "vmovq\t{$src, $dst|$dst, $src}", 5299 [(set VR128:$dst, (v2i64 (X86vzmovl 5300 (loadv2i64 addr:$src))))], 5301 IIC_SSE_MOVDQ>, 5302 XS, VEX, Requires<[UseAVX]>; 5303let AddedComplexity = 20 in { 5304def MOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), 5305 "movq\t{$src, $dst|$dst, $src}", 5306 [(set VR128:$dst, (v2i64 (X86vzmovl 5307 (loadv2i64 addr:$src))))], 5308 IIC_SSE_MOVDQ>, 5309 XS, Requires<[UseSSE2]>; 5310} 5311} // isCodeGenOnly, SchedRW 5312 5313let AddedComplexity = 20 in { 5314 let Predicates = [UseAVX] in { 5315 def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))), 5316 (VMOVZPQILo2PQIrr VR128:$src)>; 5317 } 5318 let Predicates = [UseSSE2] in { 5319 def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))), 5320 (MOVZPQILo2PQIrr VR128:$src)>; 5321 } 5322} 5323 5324//===---------------------------------------------------------------------===// 5325// SSE3 - Replicate 
Single FP - MOVSHDUP and MOVSLDUP 5326//===---------------------------------------------------------------------===// 5327multiclass sse3_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr, 5328 ValueType vt, RegisterClass RC, PatFrag mem_frag, 5329 X86MemOperand x86memop> { 5330def rr : S3SI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src), 5331 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 5332 [(set RC:$dst, (vt (OpNode RC:$src)))], 5333 IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>; 5334def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), 5335 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 5336 [(set RC:$dst, (OpNode (mem_frag addr:$src)))], 5337 IIC_SSE_MOV_LH>, Sched<[WriteLoad]>; 5338} 5339 5340let Predicates = [HasAVX] in { 5341 defm VMOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup", 5342 v4f32, VR128, loadv4f32, f128mem>, VEX; 5343 defm VMOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup", 5344 v4f32, VR128, loadv4f32, f128mem>, VEX; 5345 defm VMOVSHDUPY : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup", 5346 v8f32, VR256, loadv8f32, f256mem>, VEX, VEX_L; 5347 defm VMOVSLDUPY : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup", 5348 v8f32, VR256, loadv8f32, f256mem>, VEX, VEX_L; 5349} 5350defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128, 5351 memopv4f32, f128mem>; 5352defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128, 5353 memopv4f32, f128mem>; 5354 5355let Predicates = [HasAVX] in { 5356 def : Pat<(v4i32 (X86Movshdup VR128:$src)), 5357 (VMOVSHDUPrr VR128:$src)>; 5358 def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (loadv2i64 addr:$src)))), 5359 (VMOVSHDUPrm addr:$src)>; 5360 def : Pat<(v4i32 (X86Movsldup VR128:$src)), 5361 (VMOVSLDUPrr VR128:$src)>; 5362 def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (loadv2i64 addr:$src)))), 5363 (VMOVSLDUPrm addr:$src)>; 5364 def : Pat<(v8i32 (X86Movshdup VR256:$src)), 5365 (VMOVSHDUPYrr VR256:$src)>; 5366 def : Pat<(v8i32 
(X86Movshdup (bc_v8i32 (loadv4i64 addr:$src)))), 5367 (VMOVSHDUPYrm addr:$src)>; 5368 def : Pat<(v8i32 (X86Movsldup VR256:$src)), 5369 (VMOVSLDUPYrr VR256:$src)>; 5370 def : Pat<(v8i32 (X86Movsldup (bc_v8i32 (loadv4i64 addr:$src)))), 5371 (VMOVSLDUPYrm addr:$src)>; 5372} 5373 5374let Predicates = [UseSSE3] in { 5375 def : Pat<(v4i32 (X86Movshdup VR128:$src)), 5376 (MOVSHDUPrr VR128:$src)>; 5377 def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (memopv2i64 addr:$src)))), 5378 (MOVSHDUPrm addr:$src)>; 5379 def : Pat<(v4i32 (X86Movsldup VR128:$src)), 5380 (MOVSLDUPrr VR128:$src)>; 5381 def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (memopv2i64 addr:$src)))), 5382 (MOVSLDUPrm addr:$src)>; 5383} 5384 5385//===---------------------------------------------------------------------===// 5386// SSE3 - Replicate Double FP - MOVDDUP 5387//===---------------------------------------------------------------------===// 5388 5389multiclass sse3_replicate_dfp<string OpcodeStr> { 5390let hasSideEffects = 0 in 5391def rr : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 5392 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 5393 [], IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>; 5394def rm : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), 5395 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 5396 [(set VR128:$dst, 5397 (v2f64 (X86Movddup 5398 (scalar_to_vector (loadf64 addr:$src)))))], 5399 IIC_SSE_MOV_LH>, Sched<[WriteLoad]>; 5400} 5401 5402// FIXME: Merge with above classe when there're patterns for the ymm version 5403multiclass sse3_replicate_dfp_y<string OpcodeStr> { 5404def rr : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), 5405 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 5406 [(set VR256:$dst, (v4f64 (X86Movddup VR256:$src)))]>, 5407 Sched<[WriteFShuffle]>; 5408def rm : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), 5409 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 5410 [(set VR256:$dst, 5411 (v4f64 (X86Movddup 5412 
(scalar_to_vector (loadf64 addr:$src)))))]>, 5413 Sched<[WriteLoad]>; 5414} 5415 5416let Predicates = [HasAVX] in { 5417 defm VMOVDDUP : sse3_replicate_dfp<"vmovddup">, VEX; 5418 defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup">, VEX, VEX_L; 5419} 5420 5421defm MOVDDUP : sse3_replicate_dfp<"movddup">; 5422 5423let Predicates = [HasAVX] in { 5424 def : Pat<(X86Movddup (loadv2f64 addr:$src)), 5425 (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; 5426 def : Pat<(X86Movddup (bc_v2f64 (loadv4f32 addr:$src))), 5427 (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; 5428 def : Pat<(X86Movddup (bc_v2f64 (loadv2i64 addr:$src))), 5429 (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; 5430 def : Pat<(X86Movddup (bc_v2f64 5431 (v2i64 (scalar_to_vector (loadi64 addr:$src))))), 5432 (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; 5433 5434 // 256-bit version 5435 def : Pat<(X86Movddup (loadv4f64 addr:$src)), 5436 (VMOVDDUPYrm addr:$src)>; 5437 def : Pat<(X86Movddup (loadv4i64 addr:$src)), 5438 (VMOVDDUPYrm addr:$src)>; 5439 def : Pat<(X86Movddup (v4i64 (scalar_to_vector (loadi64 addr:$src)))), 5440 (VMOVDDUPYrm addr:$src)>; 5441 def : Pat<(X86Movddup (v4i64 VR256:$src)), 5442 (VMOVDDUPYrr VR256:$src)>; 5443} 5444 5445let Predicates = [UseAVX, OptForSize] in { 5446 def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))), 5447 (VMOVDDUPrm addr:$src)>; 5448 def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))), 5449 (VMOVDDUPrm addr:$src)>; 5450} 5451 5452let Predicates = [UseSSE3] in { 5453 def : Pat<(X86Movddup (memopv2f64 addr:$src)), 5454 (MOVDDUPrm addr:$src)>; 5455 def : Pat<(X86Movddup (bc_v2f64 (memopv4f32 addr:$src))), 5456 (MOVDDUPrm addr:$src)>; 5457 def : Pat<(X86Movddup (bc_v2f64 (memopv2i64 addr:$src))), 5458 (MOVDDUPrm addr:$src)>; 5459 def : Pat<(X86Movddup (bc_v2f64 5460 (v2i64 (scalar_to_vector (loadi64 addr:$src))))), 5461 (MOVDDUPrm addr:$src)>; 5462} 5463 5464//===---------------------------------------------------------------------===// 5465// SSE3 - Move Unaligned Integer 
//===---------------------------------------------------------------------===//

let SchedRW = [WriteLoad] in {
let Predicates = [HasAVX] in {
  def VLDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                      "vlddqu\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>,
                      VEX;
  def VLDDQUYrm : S3DI<0xF0, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
                       "vlddqu\t{$src, $dst|$dst, $src}",
                       [(set VR256:$dst, (int_x86_avx_ldu_dq_256 addr:$src))]>,
                       VEX, VEX_L;
}
def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                   "lddqu\t{$src, $dst|$dst, $src}",
                   [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))],
                   IIC_SSE_LDDQU>;
}

//===---------------------------------------------------------------------===//
// SSE3 - Arithmetic
//===---------------------------------------------------------------------===//

multiclass sse3_addsub<Intrinsic Int, string OpcodeStr, RegisterClass RC,
                       X86MemOperand x86memop, OpndItins itins,
                       bit Is2Addr = 1> {
  def rr : I<0xD0, MRMSrcReg,
       (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (Int RC:$src1, RC:$src2))], itins.rr>,
       Sched<[itins.Sched]>;
  // NOTE(review): the folded form previously reported itins.rr; it must use
  // the memory-form itinerary itins.rm.
  def rm : I<0xD0, MRMSrcMem,
       (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (Int RC:$src1, (memop addr:$src2)))], itins.rm>,
       Sched<[itins.Sched.Folded, ReadAfterLd]>;
}

let Predicates = [HasAVX] in {
  let ExeDomain = SSEPackedSingle in {
    defm VADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "vaddsubps", VR128,
                                 f128mem, SSE_ALU_F32P, 0>, XD, VEX_4V;
    defm VADDSUBPSY : sse3_addsub<int_x86_avx_addsub_ps_256, "vaddsubps",
                                  VR256, f256mem, SSE_ALU_F32P, 0>, XD,
                                  VEX_4V, VEX_L;
  }
  let ExeDomain = SSEPackedDouble in {
    defm VADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "vaddsubpd", VR128,
                                 f128mem, SSE_ALU_F64P, 0>, PD, VEX_4V;
    defm VADDSUBPDY : sse3_addsub<int_x86_avx_addsub_pd_256, "vaddsubpd",
                                  VR256, f256mem, SSE_ALU_F64P, 0>, PD,
                                  VEX_4V, VEX_L;
  }
}
let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in {
  let ExeDomain = SSEPackedSingle in
  defm ADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "addsubps", VR128,
                              f128mem, SSE_ALU_F32P>, XD;
  let ExeDomain = SSEPackedDouble in
  defm ADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "addsubpd", VR128,
                              f128mem, SSE_ALU_F64P>, PD;
}

// Patterns used to select 'addsub' instructions.
let Predicates = [HasAVX] in {
  def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (v4f32 VR128:$rhs))),
            (VADDSUBPSrr VR128:$lhs, VR128:$rhs)>;
  def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (v4f32 (memop addr:$rhs)))),
            (VADDSUBPSrm VR128:$lhs, f128mem:$rhs)>;
  def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (v2f64 VR128:$rhs))),
            (VADDSUBPDrr VR128:$lhs, VR128:$rhs)>;
  def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (v2f64 (memop addr:$rhs)))),
            (VADDSUBPDrm VR128:$lhs, f128mem:$rhs)>;

  def : Pat<(v8f32 (X86Addsub (v8f32 VR256:$lhs), (v8f32 VR256:$rhs))),
            (VADDSUBPSYrr VR256:$lhs, VR256:$rhs)>;
  def : Pat<(v8f32 (X86Addsub (v8f32 VR256:$lhs), (v8f32 (memop addr:$rhs)))),
            (VADDSUBPSYrm VR256:$lhs, f256mem:$rhs)>;
  def : Pat<(v4f64 (X86Addsub (v4f64 VR256:$lhs), (v4f64 VR256:$rhs))),
            (VADDSUBPDYrr VR256:$lhs, VR256:$rhs)>;
  def : Pat<(v4f64 (X86Addsub (v4f64 VR256:$lhs), (v4f64 (memop addr:$rhs)))),
            (VADDSUBPDYrm VR256:$lhs, f256mem:$rhs)>;
}

let Predicates = [UseSSE3] in {
  def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (v4f32 VR128:$rhs))),
            (ADDSUBPSrr VR128:$lhs, VR128:$rhs)>;
  def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (v4f32 (memop addr:$rhs)))),
            (ADDSUBPSrm VR128:$lhs, f128mem:$rhs)>;
  def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (v2f64 VR128:$rhs))),
            (ADDSUBPDrr VR128:$lhs, VR128:$rhs)>;
  def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (v2f64 (memop addr:$rhs)))),
            (ADDSUBPDrm VR128:$lhs, f128mem:$rhs)>;
}

//===---------------------------------------------------------------------===//
// SSE3 Instructions
//===---------------------------------------------------------------------===//

// Horizontal ops
multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
                   X86MemOperand x86memop, SDNode OpNode, bit Is2Addr = 1> {
  def rr : S3DI<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], IIC_SSE_HADDSUB_RR>,
      Sched<[WriteFAdd]>;

  def rm : S3DI<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set RC:$dst, (vt (OpNode RC:$src1, (memop addr:$src2))))],
      IIC_SSE_HADDSUB_RM>, Sched<[WriteFAddLd, ReadAfterLd]>;
}
multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
                  X86MemOperand x86memop, SDNode OpNode, bit Is2Addr = 1> {
  def rr : S3I<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], IIC_SSE_HADDSUB_RR>,
      Sched<[WriteFAdd]>;

  def rm : S3I<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set RC:$dst, (vt (OpNode RC:$src1, (memop addr:$src2))))],
      IIC_SSE_HADDSUB_RM>, Sched<[WriteFAddLd, ReadAfterLd]>;
}

let Predicates = [HasAVX] in {
  let ExeDomain = SSEPackedSingle in {
    defm VHADDPS  : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem,
                            X86fhadd, 0>, VEX_4V;
    defm VHSUBPS  : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem,
                            X86fhsub, 0>, VEX_4V;
    defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem,
                            X86fhadd, 0>, VEX_4V, VEX_L;
    defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem,
                            X86fhsub, 0>, VEX_4V, VEX_L;
  }
  let ExeDomain = SSEPackedDouble in {
    defm VHADDPD  : S3_Int <0x7C, "vhaddpd", v2f64, VR128, f128mem,
                            X86fhadd, 0>, VEX_4V;
    defm VHSUBPD  : S3_Int <0x7D, "vhsubpd", v2f64, VR128, f128mem,
                            X86fhsub, 0>, VEX_4V;
    defm VHADDPDY : S3_Int <0x7C, "vhaddpd", v4f64, VR256, f256mem,
                            X86fhadd, 0>, VEX_4V, VEX_L;
    defm VHSUBPDY : S3_Int <0x7D, "vhsubpd", v4f64, VR256, f256mem,
                            X86fhsub, 0>, VEX_4V, VEX_L;
  }
}

let Constraints = "$src1 = $dst" in {
  let ExeDomain = SSEPackedSingle in {
    defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, X86fhadd>;
    defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, X86fhsub>;
  }
  let ExeDomain = SSEPackedDouble in {
    defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem, X86fhadd>;
    defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, X86fhsub>;
  }
}

//===---------------------------------------------------------------------===//
// SSSE3 - Packed Absolute Instructions
//===---------------------------------------------------------------------===//


/// SS3I_unop_rm_int - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
multiclass SS3I_unop_rm_int<bits<8> opc, string OpcodeStr,
                            Intrinsic IntId128> {
  def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
                    (ins VR128:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (IntId128 VR128:$src))],
                    IIC_SSE_PABS_RR>, Sched<[WriteVecALU]>;

  def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
                    (ins i128mem:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst,
                      (IntId128
                       (bitconvert (memopv2i64 addr:$src))))],
                    IIC_SSE_PABS_RM>, Sched<[WriteVecALULd]>;
}

/// SS3I_unop_rm_int_y - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
multiclass SS3I_unop_rm_int_y<bits<8> opc, string OpcodeStr,
                              Intrinsic IntId256> {
  def rr256 : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
                    (ins VR256:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR256:$dst, (IntId256 VR256:$src))]>,
                    Sched<[WriteVecALU]>;

  def rm256 : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
                    (ins i256mem:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR256:$dst,
                      (IntId256
                       (bitconvert (memopv4i64 addr:$src))))]>,
                    Sched<[WriteVecALULd]>;
}

// Helper fragments to match sext vXi1 to vXiY.
def v16i1sextv16i8  : PatLeaf<(v16i8 (X86pcmpgt (bc_v16i8 (v4i32 immAllZerosV)),
                                                VR128:$src))>;
def v8i1sextv8i16   : PatLeaf<(v8i16 (X86vsrai VR128:$src, (i8 15)))>;
def v4i1sextv4i32   : PatLeaf<(v4i32 (X86vsrai VR128:$src, (i8 31)))>;
def v32i1sextv32i8  : PatLeaf<(v32i8 (X86pcmpgt (bc_v32i8 (v8i32 immAllZerosV)),
                                                VR256:$src))>;
def v16i1sextv16i16 : PatLeaf<(v16i16 (X86vsrai VR256:$src, (i8 15)))>;
def v8i1sextv8i32   : PatLeaf<(v8i32 (X86vsrai VR256:$src, (i8 31)))>;

let Predicates = [HasAVX] in {
  defm VPABSB : SS3I_unop_rm_int<0x1C, "vpabsb",
                                 int_x86_ssse3_pabs_b_128>, VEX;
  defm VPABSW : SS3I_unop_rm_int<0x1D, "vpabsw",
                                 int_x86_ssse3_pabs_w_128>, VEX;
  defm VPABSD : SS3I_unop_rm_int<0x1E, "vpabsd",
                                 int_x86_ssse3_pabs_d_128>, VEX;

  // Match the abs(x) = xor(sext(x<0), add(x, sext(x<0))) expansion back to
  // the PABS instruction.
  def : Pat<(xor
            (bc_v2i64 (v16i1sextv16i8)),
            (bc_v2i64 (add (v16i8 VR128:$src), (v16i1sextv16i8)))),
            (VPABSBrr128 VR128:$src)>;
  def : Pat<(xor
            (bc_v2i64 (v8i1sextv8i16)),
            (bc_v2i64 (add (v8i16 VR128:$src), (v8i1sextv8i16)))),
            (VPABSWrr128 VR128:$src)>;
  def : Pat<(xor
            (bc_v2i64 (v4i1sextv4i32)),
            (bc_v2i64 (add (v4i32 VR128:$src), (v4i1sextv4i32)))),
            (VPABSDrr128 VR128:$src)>;
}

let Predicates = [HasAVX2] in {
  defm VPABSB : SS3I_unop_rm_int_y<0x1C, "vpabsb",
                                   int_x86_avx2_pabs_b>, VEX, VEX_L;
  defm VPABSW : SS3I_unop_rm_int_y<0x1D, "vpabsw",
                                   int_x86_avx2_pabs_w>, VEX, VEX_L;
  defm VPABSD : SS3I_unop_rm_int_y<0x1E, "vpabsd",
                                   int_x86_avx2_pabs_d>, VEX, VEX_L;

  def : Pat<(xor
            (bc_v4i64 (v32i1sextv32i8)),
            (bc_v4i64 (add (v32i8 VR256:$src), (v32i1sextv32i8)))),
            (VPABSBrr256 VR256:$src)>;
  def : Pat<(xor
            (bc_v4i64 (v16i1sextv16i16)),
            (bc_v4i64 (add (v16i16 VR256:$src), (v16i1sextv16i16)))),
            (VPABSWrr256 VR256:$src)>;
  def : Pat<(xor
            (bc_v4i64 (v8i1sextv8i32)),
            (bc_v4i64 (add (v8i32 VR256:$src), (v8i1sextv8i32)))),
            (VPABSDrr256 VR256:$src)>;
}

defm PABSB : SS3I_unop_rm_int<0x1C, "pabsb",
                              int_x86_ssse3_pabs_b_128>;
defm PABSW : SS3I_unop_rm_int<0x1D, "pabsw",
                              int_x86_ssse3_pabs_w_128>;
defm PABSD : SS3I_unop_rm_int<0x1E, "pabsd",
                              int_x86_ssse3_pabs_d_128>;

let Predicates = [HasSSSE3] in {
  def : Pat<(xor
            (bc_v2i64 (v16i1sextv16i8)),
            (bc_v2i64 (add (v16i8 VR128:$src), (v16i1sextv16i8)))),
            (PABSBrr128 VR128:$src)>;
  def : Pat<(xor
            (bc_v2i64 (v8i1sextv8i16)),
            (bc_v2i64 (add (v8i16 VR128:$src), (v8i1sextv8i16)))),
            (PABSWrr128 VR128:$src)>;
  def : Pat<(xor
            (bc_v2i64 (v4i1sextv4i32)),
            (bc_v2i64 (add (v4i32 VR128:$src), (v4i1sextv4i32)))),
            (PABSDrr128 VR128:$src)>;
}

//===---------------------------------------------------------------------===//
// SSSE3 - Packed Binary Operator Instructions
//===---------------------------------------------------------------------===//

let Sched = WriteVecALU in {
def SSE_PHADDSUBD : OpndItins<
  IIC_SSE_PHADDSUBD_RR, IIC_SSE_PHADDSUBD_RM
>;
def SSE_PHADDSUBSW : OpndItins<
  IIC_SSE_PHADDSUBSW_RR, IIC_SSE_PHADDSUBSW_RM
>;
def SSE_PHADDSUBW : OpndItins<
  IIC_SSE_PHADDSUBW_RR, IIC_SSE_PHADDSUBW_RM
>;
}
let Sched = WriteShuffle in
def SSE_PSHUFB : OpndItins<
  IIC_SSE_PSHUFB_RR, IIC_SSE_PSHUFB_RM
>;
let Sched = WriteVecALU in
def SSE_PSIGN : OpndItins<
  IIC_SSE_PSIGN_RR, IIC_SSE_PSIGN_RM
>;
let Sched = WriteVecIMul in
def SSE_PMULHRSW : OpndItins<
  IIC_SSE_PMULHRSW, IIC_SSE_PMULHRSW
>;

/// SS3I_binop_rm - Simple SSSE3 bin op
multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                         ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                         X86MemOperand x86memop, OpndItins itins,
                         bit Is2Addr = 1> {
  let isCommutable = 1 in
  def rr : SS38I<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))], itins.rr>,
       Sched<[itins.Sched]>;
  def rm : SS38I<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst,
         (OpVT (OpNode RC:$src1,
          (bitconvert (memop_frag addr:$src2)))))], itins.rm>,
       Sched<[itins.Sched.Folded, ReadAfterLd]>;
}

/// SS3I_binop_rm_int - Simple SSSE3 bin op whose type can be v*{i8,i16,i32}.
multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr,
                             Intrinsic IntId128, OpndItins itins,
                             bit Is2Addr = 1> {
  // NOTE(review): the itins parameter was previously accepted but never
  // applied to the instructions; pass itins.rr/itins.rm as SS3I_binop_rm does.
  let isCommutable = 1 in
  def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
       (ins VR128:$src1, VR128:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))], itins.rr>,
       Sched<[itins.Sched]>;
  def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
       (ins VR128:$src1, i128mem:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set VR128:$dst,
         (IntId128 VR128:$src1,
          (bitconvert (memopv2i64 addr:$src2))))], itins.rm>,
       Sched<[itins.Sched.Folded, ReadAfterLd]>;
}

multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
                               Intrinsic IntId256,
                               X86FoldableSchedWrite Sched> {
  let isCommutable = 1 in
  def rr256 : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
       (ins VR256:$src1, VR256:$src2),
       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
       [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>,
       Sched<[Sched]>;
  def rm256 : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
       (ins VR256:$src1, i256mem:$src2),
       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
       [(set VR256:$dst,
         (IntId256 VR256:$src1, (bitconvert (loadv4i64 addr:$src2))))]>,
       Sched<[Sched.Folded, ReadAfterLd]>;
}

let ImmT = NoImm, Predicates = [HasAVX] in {
let isCommutable = 0 in {
  defm VPHADDW    : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, VR128,
                                  loadv2i64, i128mem,
                                  SSE_PHADDSUBW, 0>, VEX_4V;
  defm VPHADDD    : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, VR128,
                                  loadv2i64, i128mem,
                                  SSE_PHADDSUBD, 0>, VEX_4V;
  defm VPHSUBW    : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, VR128,
                                  loadv2i64, i128mem,
                                  SSE_PHADDSUBW, 0>, VEX_4V;
  defm VPHSUBD    : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, VR128,
                                  loadv2i64, i128mem,
                                  SSE_PHADDSUBD, 0>, VEX_4V;
  defm VPSIGNB    : SS3I_binop_rm<0x08, "vpsignb", X86psign, v16i8, VR128,
                                  loadv2i64, i128mem,
                                  SSE_PSIGN, 0>, VEX_4V;
  defm VPSIGNW    : SS3I_binop_rm<0x09, "vpsignw", X86psign, v8i16, VR128,
                                  loadv2i64, i128mem,
                                  SSE_PSIGN, 0>, VEX_4V;
  defm VPSIGND    : SS3I_binop_rm<0x0A, "vpsignd", X86psign, v4i32, VR128,
                                  loadv2i64, i128mem,
                                  SSE_PSIGN, 0>, VEX_4V;
  defm VPSHUFB    : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, VR128,
                                  loadv2i64, i128mem,
                                  SSE_PSHUFB, 0>, VEX_4V;
  defm VPHADDSW   : SS3I_binop_rm_int<0x03, "vphaddsw",
                                      int_x86_ssse3_phadd_sw_128,
                                      SSE_PHADDSUBSW, 0>, VEX_4V;
  defm VPHSUBSW   : SS3I_binop_rm_int<0x07, "vphsubsw",
                                      int_x86_ssse3_phsub_sw_128,
                                      SSE_PHADDSUBSW, 0>, VEX_4V;
  defm VPMADDUBSW : SS3I_binop_rm_int<0x04, "vpmaddubsw",
                                      int_x86_ssse3_pmadd_ub_sw_128,
                                      SSE_PMADD, 0>, VEX_4V;
}
defm VPMULHRSW    : SS3I_binop_rm_int<0x0B, "vpmulhrsw",
                                      int_x86_ssse3_pmul_hr_sw_128,
                                      SSE_PMULHRSW, 0>, VEX_4V;
}

let ImmT = NoImm, Predicates = [HasAVX2] in {
let isCommutable = 0 in {
  defm VPHADDWY   : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, VR256,
                                  loadv4i64, i256mem,
                                  SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
  // NOTE(review): the 32-bit horizontal ops below previously reused
  // SSE_PHADDSUBW; use SSE_PHADDSUBD to match the 128-bit definitions.
  defm VPHADDDY   : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, VR256,
                                  loadv4i64, i256mem,
                                  SSE_PHADDSUBD, 0>, VEX_4V, VEX_L;
  defm VPHSUBWY   : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, VR256,
                                  loadv4i64, i256mem,
                                  SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
  defm VPHSUBDY   : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, VR256,
                                  loadv4i64, i256mem,
                                  SSE_PHADDSUBD, 0>, VEX_4V, VEX_L;
  // NOTE(review): the sign ops below previously reused SSE_PHADDSUBW; use
  // SSE_PSIGN to match the 128-bit definitions.
  defm VPSIGNBY   : SS3I_binop_rm<0x08, "vpsignb", X86psign, v32i8, VR256,
                                  loadv4i64, i256mem,
                                  SSE_PSIGN, 0>, VEX_4V, VEX_L;
  defm VPSIGNWY   : SS3I_binop_rm<0x09, "vpsignw", X86psign, v16i16, VR256,
                                  loadv4i64, i256mem,
                                  SSE_PSIGN, 0>, VEX_4V, VEX_L;
  defm VPSIGNDY   : SS3I_binop_rm<0x0A, "vpsignd", X86psign, v8i32, VR256,
                                  loadv4i64, i256mem,
                                  SSE_PSIGN, 0>, VEX_4V, VEX_L;
  defm VPSHUFBY   : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, VR256,
                                  loadv4i64, i256mem,
                                  SSE_PSHUFB, 0>, VEX_4V, VEX_L;
  defm VPHADDSW   : SS3I_binop_rm_int_y<0x03, "vphaddsw",
                                        int_x86_avx2_phadd_sw,
                                        WriteVecALU>, VEX_4V, VEX_L;
  defm VPHSUBSW   : SS3I_binop_rm_int_y<0x07, "vphsubsw",
                                        int_x86_avx2_phsub_sw,
                                        WriteVecALU>, VEX_4V, VEX_L;
  defm VPMADDUBSW : SS3I_binop_rm_int_y<0x04, "vpmaddubsw",
                                        int_x86_avx2_pmadd_ub_sw,
                                        WriteVecIMul>, VEX_4V, VEX_L;
}
defm VPMULHRSW    : SS3I_binop_rm_int_y<0x0B, "vpmulhrsw",
                                        int_x86_avx2_pmul_hr_sw,
                                        WriteVecIMul>, VEX_4V, VEX_L;
}

// None of these have i8 immediate fields.
5926let ImmT = NoImm, Constraints = "$src1 = $dst" in { 5927let isCommutable = 0 in { 5928 defm PHADDW : SS3I_binop_rm<0x01, "phaddw", X86hadd, v8i16, VR128, 5929 memopv2i64, i128mem, SSE_PHADDSUBW>; 5930 defm PHADDD : SS3I_binop_rm<0x02, "phaddd", X86hadd, v4i32, VR128, 5931 memopv2i64, i128mem, SSE_PHADDSUBD>; 5932 defm PHSUBW : SS3I_binop_rm<0x05, "phsubw", X86hsub, v8i16, VR128, 5933 memopv2i64, i128mem, SSE_PHADDSUBW>; 5934 defm PHSUBD : SS3I_binop_rm<0x06, "phsubd", X86hsub, v4i32, VR128, 5935 memopv2i64, i128mem, SSE_PHADDSUBD>; 5936 defm PSIGNB : SS3I_binop_rm<0x08, "psignb", X86psign, v16i8, VR128, 5937 memopv2i64, i128mem, SSE_PSIGN>; 5938 defm PSIGNW : SS3I_binop_rm<0x09, "psignw", X86psign, v8i16, VR128, 5939 memopv2i64, i128mem, SSE_PSIGN>; 5940 defm PSIGND : SS3I_binop_rm<0x0A, "psignd", X86psign, v4i32, VR128, 5941 memopv2i64, i128mem, SSE_PSIGN>; 5942 defm PSHUFB : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, VR128, 5943 memopv2i64, i128mem, SSE_PSHUFB>; 5944 defm PHADDSW : SS3I_binop_rm_int<0x03, "phaddsw", 5945 int_x86_ssse3_phadd_sw_128, 5946 SSE_PHADDSUBSW>; 5947 defm PHSUBSW : SS3I_binop_rm_int<0x07, "phsubsw", 5948 int_x86_ssse3_phsub_sw_128, 5949 SSE_PHADDSUBSW>; 5950 defm PMADDUBSW : SS3I_binop_rm_int<0x04, "pmaddubsw", 5951 int_x86_ssse3_pmadd_ub_sw_128, SSE_PMADD>; 5952} 5953defm PMULHRSW : SS3I_binop_rm_int<0x0B, "pmulhrsw", 5954 int_x86_ssse3_pmul_hr_sw_128, 5955 SSE_PMULHRSW>; 5956} 5957 5958//===---------------------------------------------------------------------===// 5959// SSSE3 - Packed Align Instruction Patterns 5960//===---------------------------------------------------------------------===// 5961 5962multiclass ssse3_palignr<string asm, bit Is2Addr = 1> { 5963 let hasSideEffects = 0 in { 5964 def R128rr : SS3AI<0x0F, MRMSrcReg, (outs VR128:$dst), 5965 (ins VR128:$src1, VR128:$src2, i8imm:$src3), 5966 !if(Is2Addr, 5967 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 5968 !strconcat(asm, 5969 "\t{$src3, $src2, 
$src1, $dst|$dst, $src1, $src2, $src3}")), 5970 [], IIC_SSE_PALIGNRR>, Sched<[WriteShuffle]>; 5971 let mayLoad = 1 in 5972 def R128rm : SS3AI<0x0F, MRMSrcMem, (outs VR128:$dst), 5973 (ins VR128:$src1, i128mem:$src2, i8imm:$src3), 5974 !if(Is2Addr, 5975 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 5976 !strconcat(asm, 5977 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 5978 [], IIC_SSE_PALIGNRM>, Sched<[WriteShuffleLd, ReadAfterLd]>; 5979 } 5980} 5981 5982multiclass ssse3_palignr_y<string asm, bit Is2Addr = 1> { 5983 let hasSideEffects = 0 in { 5984 def R256rr : SS3AI<0x0F, MRMSrcReg, (outs VR256:$dst), 5985 (ins VR256:$src1, VR256:$src2, i8imm:$src3), 5986 !strconcat(asm, 5987 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), 5988 []>, Sched<[WriteShuffle]>; 5989 let mayLoad = 1 in 5990 def R256rm : SS3AI<0x0F, MRMSrcMem, (outs VR256:$dst), 5991 (ins VR256:$src1, i256mem:$src2, i8imm:$src3), 5992 !strconcat(asm, 5993 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), 5994 []>, Sched<[WriteShuffleLd, ReadAfterLd]>; 5995 } 5996} 5997 5998let Predicates = [HasAVX] in 5999 defm VPALIGN : ssse3_palignr<"vpalignr", 0>, VEX_4V; 6000let Predicates = [HasAVX2] in 6001 defm VPALIGN : ssse3_palignr_y<"vpalignr", 0>, VEX_4V, VEX_L; 6002let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in 6003 defm PALIGN : ssse3_palignr<"palignr">; 6004 6005let Predicates = [HasAVX2] in { 6006def : Pat<(v8i32 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))), 6007 (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>; 6008def : Pat<(v8f32 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))), 6009 (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>; 6010def : Pat<(v16i16 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))), 6011 (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>; 6012def : Pat<(v32i8 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))), 6013 (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>; 6014} 6015 6016let 
Predicates = [HasAVX] in { 6017def : Pat<(v4i32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), 6018 (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; 6019def : Pat<(v4f32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), 6020 (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; 6021def : Pat<(v8i16 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), 6022 (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; 6023def : Pat<(v16i8 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), 6024 (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; 6025} 6026 6027let Predicates = [UseSSSE3] in { 6028def : Pat<(v4i32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), 6029 (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; 6030def : Pat<(v4f32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), 6031 (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; 6032def : Pat<(v8i16 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), 6033 (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; 6034def : Pat<(v16i8 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), 6035 (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; 6036} 6037 6038//===---------------------------------------------------------------------===// 6039// SSSE3 - Thread synchronization 6040//===---------------------------------------------------------------------===// 6041 6042let SchedRW = [WriteSystem] in { 6043let usesCustomInserter = 1 in { 6044def MONITOR : PseudoI<(outs), (ins i32mem:$src1, GR32:$src2, GR32:$src3), 6045 [(int_x86_sse3_monitor addr:$src1, GR32:$src2, GR32:$src3)]>, 6046 Requires<[HasSSE3]>; 6047} 6048 6049let Uses = [EAX, ECX, EDX] in 6050def MONITORrrr : I<0x01, MRM_C8, (outs), (ins), "monitor", [], IIC_SSE_MONITOR>, 6051 TB, Requires<[HasSSE3]>; 6052let Uses = [ECX, EAX] in 6053def MWAITrr : I<0x01, MRM_C9, (outs), (ins), "mwait", 6054 [(int_x86_sse3_mwait ECX, EAX)], IIC_SSE_MWAIT>, 6055 TB, Requires<[HasSSE3]>; 6056} // SchedRW 6057 6058def : InstAlias<"mwait\t{%eax, 
%ecx|ecx, eax}", (MWAITrr)>, Requires<[Not64BitMode]>; 6059def : InstAlias<"mwait\t{%rax, %rcx|rcx, rax}", (MWAITrr)>, Requires<[In64BitMode]>; 6060 6061def : InstAlias<"monitor\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITORrrr)>, 6062 Requires<[Not64BitMode]>; 6063def : InstAlias<"monitor\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITORrrr)>, 6064 Requires<[In64BitMode]>; 6065 6066//===----------------------------------------------------------------------===// 6067// SSE4.1 - Packed Move with Sign/Zero Extend 6068//===----------------------------------------------------------------------===// 6069 6070multiclass SS41I_pmovx_rrrm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp, 6071 RegisterClass OutRC, RegisterClass InRC, 6072 OpndItins itins> { 6073 def rr : SS48I<opc, MRMSrcReg, (outs OutRC:$dst), (ins InRC:$src), 6074 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 6075 [], itins.rr>, 6076 Sched<[itins.Sched]>; 6077 6078 def rm : SS48I<opc, MRMSrcMem, (outs OutRC:$dst), (ins MemOp:$src), 6079 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 6080 [], 6081 itins.rm>, Sched<[itins.Sched.Folded]>; 6082} 6083 6084multiclass SS41I_pmovx_rm_all<bits<8> opc, string OpcodeStr, 6085 X86MemOperand MemOp, X86MemOperand MemYOp, 6086 OpndItins SSEItins, OpndItins AVXItins, 6087 OpndItins AVX2Itins> { 6088 defm NAME : SS41I_pmovx_rrrm<opc, OpcodeStr, MemOp, VR128, VR128, SSEItins>; 6089 let Predicates = [HasAVX] in 6090 defm V#NAME : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemOp, 6091 VR128, VR128, AVXItins>, VEX; 6092 let Predicates = [HasAVX2] in 6093 defm V#NAME#Y : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemYOp, 6094 VR256, VR128, AVX2Itins>, VEX, VEX_L; 6095} 6096 6097multiclass SS41I_pmovx_rm<bits<8> opc, string OpcodeStr, 6098 X86MemOperand MemOp, X86MemOperand MemYOp> { 6099 defm PMOVSX#NAME : SS41I_pmovx_rm_all<opc, !strconcat("pmovsx", OpcodeStr), 6100 MemOp, MemYOp, 6101 SSE_INTALU_ITINS_SHUFF_P, 6102 DEFAULT_ITINS_SHUFFLESCHED, 6103 
DEFAULT_ITINS_SHUFFLESCHED>; 6104 defm PMOVZX#NAME : SS41I_pmovx_rm_all<!add(opc, 0x10), 6105 !strconcat("pmovzx", OpcodeStr), 6106 MemOp, MemYOp, 6107 SSE_INTALU_ITINS_SHUFF_P, 6108 DEFAULT_ITINS_SHUFFLESCHED, 6109 DEFAULT_ITINS_SHUFFLESCHED>; 6110} 6111 6112defm BW : SS41I_pmovx_rm<0x20, "bw", i64mem, i128mem>; 6113defm WD : SS41I_pmovx_rm<0x23, "wd", i64mem, i128mem>; 6114defm DQ : SS41I_pmovx_rm<0x25, "dq", i64mem, i128mem>; 6115 6116defm BD : SS41I_pmovx_rm<0x21, "bd", i32mem, i64mem>; 6117defm WQ : SS41I_pmovx_rm<0x24, "wq", i32mem, i64mem>; 6118 6119defm BQ : SS41I_pmovx_rm<0x22, "bq", i16mem, i32mem>; 6120 6121// AVX2 Patterns 6122multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, SDNode ExtOp> { 6123 // Register-Register patterns 6124 def : Pat<(v16i16 (ExtOp (v16i8 VR128:$src))), 6125 (!cast<I>(OpcPrefix#BWYrr) VR128:$src)>; 6126 def : Pat<(v8i32 (ExtOp (v16i8 VR128:$src))), 6127 (!cast<I>(OpcPrefix#BDYrr) VR128:$src)>; 6128 def : Pat<(v4i64 (ExtOp (v16i8 VR128:$src))), 6129 (!cast<I>(OpcPrefix#BQYrr) VR128:$src)>; 6130 6131 def : Pat<(v8i32 (ExtOp (v8i16 VR128:$src))), 6132 (!cast<I>(OpcPrefix#WDYrr) VR128:$src)>; 6133 def : Pat<(v4i64 (ExtOp (v8i16 VR128:$src))), 6134 (!cast<I>(OpcPrefix#WQYrr) VR128:$src)>; 6135 6136 def : Pat<(v4i64 (ExtOp (v4i32 VR128:$src))), 6137 (!cast<I>(OpcPrefix#DQYrr) VR128:$src)>; 6138 6139 // On AVX2, we also support 256bit inputs. 6140 // FIXME: remove these patterns when the old shuffle lowering goes away. 
6141 def : Pat<(v16i16 (ExtOp (v32i8 VR256:$src))), 6142 (!cast<I>(OpcPrefix#BWYrr) (EXTRACT_SUBREG VR256:$src, sub_xmm))>; 6143 def : Pat<(v8i32 (ExtOp (v32i8 VR256:$src))), 6144 (!cast<I>(OpcPrefix#BDYrr) (EXTRACT_SUBREG VR256:$src, sub_xmm))>; 6145 def : Pat<(v4i64 (ExtOp (v32i8 VR256:$src))), 6146 (!cast<I>(OpcPrefix#BQYrr) (EXTRACT_SUBREG VR256:$src, sub_xmm))>; 6147 6148 def : Pat<(v8i32 (ExtOp (v16i16 VR256:$src))), 6149 (!cast<I>(OpcPrefix#WDYrr) (EXTRACT_SUBREG VR256:$src, sub_xmm))>; 6150 def : Pat<(v4i64 (ExtOp (v16i16 VR256:$src))), 6151 (!cast<I>(OpcPrefix#WQYrr) (EXTRACT_SUBREG VR256:$src, sub_xmm))>; 6152 6153 def : Pat<(v4i64 (ExtOp (v8i32 VR256:$src))), 6154 (!cast<I>(OpcPrefix#DQYrr) (EXTRACT_SUBREG VR256:$src, sub_xmm))>; 6155 6156 // AVX2 Register-Memory patterns 6157 def : Pat<(v16i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), 6158 (!cast<I>(OpcPrefix#BWYrm) addr:$src)>; 6159 def : Pat<(v16i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))), 6160 (!cast<I>(OpcPrefix#BWYrm) addr:$src)>; 6161 def : Pat<(v16i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), 6162 (!cast<I>(OpcPrefix#BWYrm) addr:$src)>; 6163 def : Pat<(v16i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), 6164 (!cast<I>(OpcPrefix#BWYrm) addr:$src)>; 6165 6166 def : Pat<(v8i32 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), 6167 (!cast<I>(OpcPrefix#BDYrm) addr:$src)>; 6168 def : Pat<(v8i32 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))), 6169 (!cast<I>(OpcPrefix#BDYrm) addr:$src)>; 6170 def : Pat<(v8i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), 6171 (!cast<I>(OpcPrefix#BDYrm) addr:$src)>; 6172 def : Pat<(v8i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), 6173 (!cast<I>(OpcPrefix#BDYrm) addr:$src)>; 6174 6175 def : Pat<(v4i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), 6176 (!cast<I>(OpcPrefix#BQYrm) addr:$src)>; 6177 def : Pat<(v4i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))), 6178 (!cast<I>(OpcPrefix#BQYrm) addr:$src)>; 6179 def : Pat<(v4i64 (ExtOp (v16i8 
(vzload_v2i64 addr:$src)))), 6180 (!cast<I>(OpcPrefix#BQYrm) addr:$src)>; 6181 def : Pat<(v4i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), 6182 (!cast<I>(OpcPrefix#BQYrm) addr:$src)>; 6183 6184 def : Pat<(v8i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))), 6185 (!cast<I>(OpcPrefix#WDYrm) addr:$src)>; 6186 def : Pat<(v8i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))), 6187 (!cast<I>(OpcPrefix#WDYrm) addr:$src)>; 6188 def : Pat<(v8i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), 6189 (!cast<I>(OpcPrefix#WDYrm) addr:$src)>; 6190 def : Pat<(v8i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))), 6191 (!cast<I>(OpcPrefix#WDYrm) addr:$src)>; 6192 6193 def : Pat<(v4i64 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), 6194 (!cast<I>(OpcPrefix#WQYrm) addr:$src)>; 6195 def : Pat<(v4i64 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))), 6196 (!cast<I>(OpcPrefix#WQYrm) addr:$src)>; 6197 def : Pat<(v4i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), 6198 (!cast<I>(OpcPrefix#WQYrm) addr:$src)>; 6199 def : Pat<(v4i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))), 6200 (!cast<I>(OpcPrefix#WQYrm) addr:$src)>; 6201 6202 def : Pat<(v4i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))), 6203 (!cast<I>(OpcPrefix#DQYrm) addr:$src)>; 6204 def : Pat<(v4i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))), 6205 (!cast<I>(OpcPrefix#DQYrm) addr:$src)>; 6206 def : Pat<(v4i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))), 6207 (!cast<I>(OpcPrefix#DQYrm) addr:$src)>; 6208 def : Pat<(v4i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))), 6209 (!cast<I>(OpcPrefix#DQYrm) addr:$src)>; 6210} 6211 6212let Predicates = [HasAVX2] in { 6213 defm : SS41I_pmovx_avx2_patterns<"VPMOVSX", X86vsext>; 6214 defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", X86vzext>; 6215} 6216 6217// SSE4.1/AVX patterns. 
6218multiclass SS41I_pmovx_patterns<string OpcPrefix, SDNode ExtOp, 6219 PatFrag ExtLoad16> { 6220 def : Pat<(v8i16 (ExtOp (v16i8 VR128:$src))), 6221 (!cast<I>(OpcPrefix#BWrr) VR128:$src)>; 6222 def : Pat<(v4i32 (ExtOp (v16i8 VR128:$src))), 6223 (!cast<I>(OpcPrefix#BDrr) VR128:$src)>; 6224 def : Pat<(v2i64 (ExtOp (v16i8 VR128:$src))), 6225 (!cast<I>(OpcPrefix#BQrr) VR128:$src)>; 6226 6227 def : Pat<(v4i32 (ExtOp (v8i16 VR128:$src))), 6228 (!cast<I>(OpcPrefix#WDrr) VR128:$src)>; 6229 def : Pat<(v2i64 (ExtOp (v8i16 VR128:$src))), 6230 (!cast<I>(OpcPrefix#WQrr) VR128:$src)>; 6231 6232 def : Pat<(v2i64 (ExtOp (v4i32 VR128:$src))), 6233 (!cast<I>(OpcPrefix#DQrr) VR128:$src)>; 6234 6235 def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), 6236 (!cast<I>(OpcPrefix#BWrm) addr:$src)>; 6237 def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))), 6238 (!cast<I>(OpcPrefix#BWrm) addr:$src)>; 6239 def : Pat<(v8i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))), 6240 (!cast<I>(OpcPrefix#BWrm) addr:$src)>; 6241 def : Pat<(v8i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), 6242 (!cast<I>(OpcPrefix#BWrm) addr:$src)>; 6243 def : Pat<(v8i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), 6244 (!cast<I>(OpcPrefix#BWrm) addr:$src)>; 6245 6246 def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), 6247 (!cast<I>(OpcPrefix#BDrm) addr:$src)>; 6248 def : Pat<(v4i32 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))), 6249 (!cast<I>(OpcPrefix#BDrm) addr:$src)>; 6250 def : Pat<(v4i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), 6251 (!cast<I>(OpcPrefix#BDrm) addr:$src)>; 6252 def : Pat<(v4i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), 6253 (!cast<I>(OpcPrefix#BDrm) addr:$src)>; 6254 6255 def : Pat<(v2i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (ExtLoad16 addr:$src)))))), 6256 (!cast<I>(OpcPrefix#BQrm) addr:$src)>; 6257 def : Pat<(v2i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))), 6258 (!cast<I>(OpcPrefix#BQrm) addr:$src)>; 
6259 def : Pat<(v2i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), 6260 (!cast<I>(OpcPrefix#BQrm) addr:$src)>; 6261 def : Pat<(v2i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), 6262 (!cast<I>(OpcPrefix#BQrm) addr:$src)>; 6263 6264 def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), 6265 (!cast<I>(OpcPrefix#WDrm) addr:$src)>; 6266 def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))), 6267 (!cast<I>(OpcPrefix#WDrm) addr:$src)>; 6268 def : Pat<(v4i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))), 6269 (!cast<I>(OpcPrefix#WDrm) addr:$src)>; 6270 def : Pat<(v4i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), 6271 (!cast<I>(OpcPrefix#WDrm) addr:$src)>; 6272 def : Pat<(v4i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))), 6273 (!cast<I>(OpcPrefix#WDrm) addr:$src)>; 6274 6275 def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), 6276 (!cast<I>(OpcPrefix#WQrm) addr:$src)>; 6277 def : Pat<(v2i64 (ExtOp (v8i16 (vzmovl_v4i32 addr:$src)))), 6278 (!cast<I>(OpcPrefix#WQrm) addr:$src)>; 6279 def : Pat<(v2i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), 6280 (!cast<I>(OpcPrefix#WQrm) addr:$src)>; 6281 def : Pat<(v2i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))), 6282 (!cast<I>(OpcPrefix#WQrm) addr:$src)>; 6283 6284 def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), 6285 (!cast<I>(OpcPrefix#DQrm) addr:$src)>; 6286 def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))), 6287 (!cast<I>(OpcPrefix#DQrm) addr:$src)>; 6288 def : Pat<(v2i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))), 6289 (!cast<I>(OpcPrefix#DQrm) addr:$src)>; 6290 def : Pat<(v2i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))), 6291 (!cast<I>(OpcPrefix#DQrm) addr:$src)>; 6292 def : Pat<(v2i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))), 6293 (!cast<I>(OpcPrefix#DQrm) addr:$src)>; 6294} 6295 6296let Predicates = [HasAVX] in { 6297 defm : SS41I_pmovx_patterns<"VPMOVSX", X86vsext, extloadi32i16>; 
6298 defm : SS41I_pmovx_patterns<"VPMOVZX", X86vzext, loadi16_anyext>; 6299} 6300 6301let Predicates = [UseSSE41] in { 6302 defm : SS41I_pmovx_patterns<"PMOVSX", X86vsext, extloadi32i16>; 6303 defm : SS41I_pmovx_patterns<"PMOVZX", X86vzext, loadi16_anyext>; 6304} 6305 6306//===----------------------------------------------------------------------===// 6307// SSE4.1 - Extract Instructions 6308//===----------------------------------------------------------------------===// 6309 6310/// SS41I_binop_ext8 - SSE 4.1 extract 8 bits to 32 bit reg or 8 bit mem 6311multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> { 6312 def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst), 6313 (ins VR128:$src1, i32i8imm:$src2), 6314 !strconcat(OpcodeStr, 6315 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 6316 [(set GR32orGR64:$dst, (X86pextrb (v16i8 VR128:$src1), 6317 imm:$src2))]>, 6318 Sched<[WriteShuffle]>; 6319 let hasSideEffects = 0, mayStore = 1, 6320 SchedRW = [WriteShuffleLd, WriteRMW] in 6321 def mr : SS4AIi8<opc, MRMDestMem, (outs), 6322 (ins i8mem:$dst, VR128:$src1, i32i8imm:$src2), 6323 !strconcat(OpcodeStr, 6324 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 6325 [(store (i8 (trunc (assertzext (X86pextrb (v16i8 VR128:$src1), 6326 imm:$src2)))), addr:$dst)]>; 6327} 6328 6329let Predicates = [HasAVX] in 6330 defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX; 6331 6332defm PEXTRB : SS41I_extract8<0x14, "pextrb">; 6333 6334 6335/// SS41I_extract16 - SSE 4.1 extract 16 bits to memory destination 6336multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> { 6337 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in 6338 def rr_REV : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst), 6339 (ins VR128:$src1, i32i8imm:$src2), 6340 !strconcat(OpcodeStr, 6341 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 6342 []>, Sched<[WriteShuffle]>; 6343 6344 let hasSideEffects = 0, mayStore = 1, 6345 SchedRW = [WriteShuffleLd, WriteRMW] in 6346 def mr : SS4AIi8<opc, 
MRMDestMem, (outs),
                 (ins i16mem:$dst, VR128:$src1, i32i8imm:$src2),
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(store (i16 (trunc (assertzext (X86pextrw (v8i16 VR128:$src1),
                                                  imm:$src2)))), addr:$dst)]>;
}

let Predicates = [HasAVX] in
  defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX;

defm PEXTRW : SS41I_extract16<0x15, "pextrw">;


/// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination
multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> {
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
                   (ins VR128:$src1, i32i8imm:$src2),
                   !strconcat(OpcodeStr,
                    "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set GR32:$dst,
                     (extractelt (v4i32 VR128:$src1), imm:$src2))]>,
                   Sched<[WriteShuffle]>;
  let SchedRW = [WriteShuffleLd, WriteRMW] in
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                   (ins i32mem:$dst, VR128:$src1, i32i8imm:$src2),
                   !strconcat(OpcodeStr,
                    "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(store (extractelt (v4i32 VR128:$src1), imm:$src2),
                           addr:$dst)]>;
}

let Predicates = [HasAVX] in
  defm VPEXTRD : SS41I_extract32<0x16, "vpextrd">, VEX;

defm PEXTRD : SS41I_extract32<0x16, "pextrd">;

/// SS41I_extract64 - SSE 4.1 extract 64 bits to int reg or memory destination
multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> {
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR64:$dst),
                   (ins VR128:$src1, i32i8imm:$src2),
                   !strconcat(OpcodeStr,
                    "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set GR64:$dst,
                     (extractelt (v2i64 VR128:$src1), imm:$src2))]>,
                   Sched<[WriteShuffle]>, REX_W;
  let SchedRW = [WriteShuffleLd, WriteRMW] in
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                 (ins i64mem:$dst, VR128:$src1, i32i8imm:$src2),
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(store (extractelt
(v2i64 VR128:$src1), imm:$src2), 6398 addr:$dst)]>, REX_W; 6399} 6400 6401let Predicates = [HasAVX] in 6402 defm VPEXTRQ : SS41I_extract64<0x16, "vpextrq">, VEX, VEX_W; 6403 6404defm PEXTRQ : SS41I_extract64<0x16, "pextrq">; 6405 6406/// SS41I_extractf32 - SSE 4.1 extract 32 bits fp value to int reg or memory 6407/// destination 6408multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr, 6409 OpndItins itins = DEFAULT_ITINS> { 6410 def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst), 6411 (ins VR128:$src1, i32i8imm:$src2), 6412 !strconcat(OpcodeStr, 6413 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 6414 [(set GR32orGR64:$dst, 6415 (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))], 6416 itins.rr>, Sched<[WriteFBlend]>; 6417 let SchedRW = [WriteFBlendLd, WriteRMW] in 6418 def mr : SS4AIi8<opc, MRMDestMem, (outs), 6419 (ins f32mem:$dst, VR128:$src1, i32i8imm:$src2), 6420 !strconcat(OpcodeStr, 6421 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 6422 [(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2), 6423 addr:$dst)], itins.rm>; 6424} 6425 6426let ExeDomain = SSEPackedSingle in { 6427 let Predicates = [UseAVX] in 6428 defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX; 6429 defm EXTRACTPS : SS41I_extractf32<0x17, "extractps", SSE_EXTRACT_ITINS>; 6430} 6431 6432// Also match an EXTRACTPS store when the store is done as f32 instead of i32. 
6433def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)), 6434 imm:$src2))), 6435 addr:$dst), 6436 (VEXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>, 6437 Requires<[HasAVX]>; 6438def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)), 6439 imm:$src2))), 6440 addr:$dst), 6441 (EXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>, 6442 Requires<[UseSSE41]>; 6443 6444//===----------------------------------------------------------------------===// 6445// SSE4.1 - Insert Instructions 6446//===----------------------------------------------------------------------===// 6447 6448multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> { 6449 def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst), 6450 (ins VR128:$src1, GR32orGR64:$src2, i32i8imm:$src3), 6451 !if(Is2Addr, 6452 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 6453 !strconcat(asm, 6454 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 6455 [(set VR128:$dst, 6456 (X86pinsrb VR128:$src1, GR32orGR64:$src2, imm:$src3))]>, 6457 Sched<[WriteShuffle]>; 6458 def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), 6459 (ins VR128:$src1, i8mem:$src2, i32i8imm:$src3), 6460 !if(Is2Addr, 6461 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 6462 !strconcat(asm, 6463 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 6464 [(set VR128:$dst, 6465 (X86pinsrb VR128:$src1, (extloadi8 addr:$src2), 6466 imm:$src3))]>, Sched<[WriteShuffleLd, ReadAfterLd]>; 6467} 6468 6469let Predicates = [HasAVX] in 6470 defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V; 6471let Constraints = "$src1 = $dst" in 6472 defm PINSRB : SS41I_insert8<0x20, "pinsrb">; 6473 6474multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> { 6475 def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst), 6476 (ins VR128:$src1, GR32:$src2, i32i8imm:$src3), 6477 !if(Is2Addr, 6478 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 6479 !strconcat(asm, 6480 
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 6481 [(set VR128:$dst, 6482 (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>, 6483 Sched<[WriteShuffle]>; 6484 def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), 6485 (ins VR128:$src1, i32mem:$src2, i32i8imm:$src3), 6486 !if(Is2Addr, 6487 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 6488 !strconcat(asm, 6489 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 6490 [(set VR128:$dst, 6491 (v4i32 (insertelt VR128:$src1, (loadi32 addr:$src2), 6492 imm:$src3)))]>, Sched<[WriteShuffleLd, ReadAfterLd]>; 6493} 6494 6495let Predicates = [HasAVX] in 6496 defm VPINSRD : SS41I_insert32<0x22, "vpinsrd", 0>, VEX_4V; 6497let Constraints = "$src1 = $dst" in 6498 defm PINSRD : SS41I_insert32<0x22, "pinsrd">; 6499 6500multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> { 6501 def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst), 6502 (ins VR128:$src1, GR64:$src2, i32i8imm:$src3), 6503 !if(Is2Addr, 6504 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 6505 !strconcat(asm, 6506 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 6507 [(set VR128:$dst, 6508 (v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>, 6509 Sched<[WriteShuffle]>; 6510 def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), 6511 (ins VR128:$src1, i64mem:$src2, i32i8imm:$src3), 6512 !if(Is2Addr, 6513 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 6514 !strconcat(asm, 6515 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 6516 [(set VR128:$dst, 6517 (v2i64 (insertelt VR128:$src1, (loadi64 addr:$src2), 6518 imm:$src3)))]>, Sched<[WriteShuffleLd, ReadAfterLd]>; 6519} 6520 6521let Predicates = [HasAVX] in 6522 defm VPINSRQ : SS41I_insert64<0x22, "vpinsrq", 0>, VEX_4V, VEX_W; 6523let Constraints = "$src1 = $dst" in 6524 defm PINSRQ : SS41I_insert64<0x22, "pinsrq">, REX_W; 6525 6526// insertps has a few different modes, there's the first two here below which 
6527// are optimized inserts that won't zero arbitrary elements in the destination 6528// vector. The next one matches the intrinsic and could zero arbitrary elements 6529// in the target vector. 6530multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1, 6531 OpndItins itins = DEFAULT_ITINS> { 6532 def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst), 6533 (ins VR128:$src1, VR128:$src2, i8imm:$src3), 6534 !if(Is2Addr, 6535 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 6536 !strconcat(asm, 6537 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 6538 [(set VR128:$dst, 6539 (X86insertps VR128:$src1, VR128:$src2, imm:$src3))], itins.rr>, 6540 Sched<[WriteFShuffle]>; 6541 def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), 6542 (ins VR128:$src1, f32mem:$src2, i8imm:$src3), 6543 !if(Is2Addr, 6544 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 6545 !strconcat(asm, 6546 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 6547 [(set VR128:$dst, 6548 (X86insertps VR128:$src1, 6549 (v4f32 (scalar_to_vector (loadf32 addr:$src2))), 6550 imm:$src3))], itins.rm>, 6551 Sched<[WriteFShuffleLd, ReadAfterLd]>; 6552} 6553 6554let ExeDomain = SSEPackedSingle in { 6555 let Predicates = [UseAVX] in 6556 defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>, VEX_4V; 6557 let Constraints = "$src1 = $dst" in 6558 defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1, SSE_INSERT_ITINS>; 6559} 6560 6561let Predicates = [UseSSE41] in { 6562 // If we're inserting an element from a load or a null pshuf of a load, 6563 // fold the load into the insertps instruction. 
6564 def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), (X86PShufd (v4f32 6565 (scalar_to_vector (loadf32 addr:$src2))), (i8 0)), 6566 imm:$src3)), 6567 (INSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>; 6568 def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), (X86PShufd 6569 (loadv4f32 addr:$src2), (i8 0)), imm:$src3)), 6570 (INSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>; 6571} 6572 6573let Predicates = [UseAVX] in { 6574 // If we're inserting an element from a vbroadcast of a load, fold the 6575 // load into the X86insertps instruction. 6576 def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), 6577 (X86VBroadcast (loadf32 addr:$src2)), imm:$src3)), 6578 (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>; 6579 def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), 6580 (X86VBroadcast (loadv4f32 addr:$src2)), imm:$src3)), 6581 (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>; 6582} 6583 6584//===----------------------------------------------------------------------===// 6585// SSE4.1 - Round Instructions 6586//===----------------------------------------------------------------------===// 6587 6588multiclass sse41_fp_unop_rm<bits<8> opcps, bits<8> opcpd, string OpcodeStr, 6589 X86MemOperand x86memop, RegisterClass RC, 6590 PatFrag mem_frag32, PatFrag mem_frag64, 6591 Intrinsic V4F32Int, Intrinsic V2F64Int> { 6592let ExeDomain = SSEPackedSingle in { 6593 // Intrinsic operation, reg. 
6594 // Vector intrinsic operation, reg 6595 def PSr : SS4AIi8<opcps, MRMSrcReg, 6596 (outs RC:$dst), (ins RC:$src1, i32i8imm:$src2), 6597 !strconcat(OpcodeStr, 6598 "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 6599 [(set RC:$dst, (V4F32Int RC:$src1, imm:$src2))], 6600 IIC_SSE_ROUNDPS_REG>, Sched<[WriteFAdd]>; 6601 6602 // Vector intrinsic operation, mem 6603 def PSm : SS4AIi8<opcps, MRMSrcMem, 6604 (outs RC:$dst), (ins x86memop:$src1, i32i8imm:$src2), 6605 !strconcat(OpcodeStr, 6606 "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 6607 [(set RC:$dst, 6608 (V4F32Int (mem_frag32 addr:$src1),imm:$src2))], 6609 IIC_SSE_ROUNDPS_MEM>, Sched<[WriteFAddLd]>; 6610} // ExeDomain = SSEPackedSingle 6611 6612let ExeDomain = SSEPackedDouble in { 6613 // Vector intrinsic operation, reg 6614 def PDr : SS4AIi8<opcpd, MRMSrcReg, 6615 (outs RC:$dst), (ins RC:$src1, i32i8imm:$src2), 6616 !strconcat(OpcodeStr, 6617 "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 6618 [(set RC:$dst, (V2F64Int RC:$src1, imm:$src2))], 6619 IIC_SSE_ROUNDPS_REG>, Sched<[WriteFAdd]>; 6620 6621 // Vector intrinsic operation, mem 6622 def PDm : SS4AIi8<opcpd, MRMSrcMem, 6623 (outs RC:$dst), (ins x86memop:$src1, i32i8imm:$src2), 6624 !strconcat(OpcodeStr, 6625 "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 6626 [(set RC:$dst, 6627 (V2F64Int (mem_frag64 addr:$src1),imm:$src2))], 6628 IIC_SSE_ROUNDPS_REG>, Sched<[WriteFAddLd]>; 6629} // ExeDomain = SSEPackedDouble 6630} 6631 6632multiclass sse41_fp_binop_rm<bits<8> opcss, bits<8> opcsd, 6633 string OpcodeStr, 6634 Intrinsic F32Int, 6635 Intrinsic F64Int, bit Is2Addr = 1> { 6636let ExeDomain = GenericDomain in { 6637 // Operation, reg. 
6638 let hasSideEffects = 0 in 6639 def SSr : SS4AIi8<opcss, MRMSrcReg, 6640 (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32i8imm:$src3), 6641 !if(Is2Addr, 6642 !strconcat(OpcodeStr, 6643 "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 6644 !strconcat(OpcodeStr, 6645 "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 6646 []>, Sched<[WriteFAdd]>; 6647 6648 // Intrinsic operation, reg. 6649 let isCodeGenOnly = 1 in 6650 def SSr_Int : SS4AIi8<opcss, MRMSrcReg, 6651 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32i8imm:$src3), 6652 !if(Is2Addr, 6653 !strconcat(OpcodeStr, 6654 "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 6655 !strconcat(OpcodeStr, 6656 "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 6657 [(set VR128:$dst, (F32Int VR128:$src1, VR128:$src2, imm:$src3))]>, 6658 Sched<[WriteFAdd]>; 6659 6660 // Intrinsic operation, mem. 6661 def SSm : SS4AIi8<opcss, MRMSrcMem, 6662 (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32i8imm:$src3), 6663 !if(Is2Addr, 6664 !strconcat(OpcodeStr, 6665 "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 6666 !strconcat(OpcodeStr, 6667 "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 6668 [(set VR128:$dst, 6669 (F32Int VR128:$src1, sse_load_f32:$src2, imm:$src3))]>, 6670 Sched<[WriteFAddLd, ReadAfterLd]>; 6671 6672 // Operation, reg. 6673 let hasSideEffects = 0 in 6674 def SDr : SS4AIi8<opcsd, MRMSrcReg, 6675 (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32i8imm:$src3), 6676 !if(Is2Addr, 6677 !strconcat(OpcodeStr, 6678 "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 6679 !strconcat(OpcodeStr, 6680 "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 6681 []>, Sched<[WriteFAdd]>; 6682 6683 // Intrinsic operation, reg. 
6684 let isCodeGenOnly = 1 in 6685 def SDr_Int : SS4AIi8<opcsd, MRMSrcReg, 6686 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32i8imm:$src3), 6687 !if(Is2Addr, 6688 !strconcat(OpcodeStr, 6689 "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 6690 !strconcat(OpcodeStr, 6691 "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 6692 [(set VR128:$dst, (F64Int VR128:$src1, VR128:$src2, imm:$src3))]>, 6693 Sched<[WriteFAdd]>; 6694 6695 // Intrinsic operation, mem. 6696 def SDm : SS4AIi8<opcsd, MRMSrcMem, 6697 (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32i8imm:$src3), 6698 !if(Is2Addr, 6699 !strconcat(OpcodeStr, 6700 "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 6701 !strconcat(OpcodeStr, 6702 "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 6703 [(set VR128:$dst, 6704 (F64Int VR128:$src1, sse_load_f64:$src2, imm:$src3))]>, 6705 Sched<[WriteFAddLd, ReadAfterLd]>; 6706} // ExeDomain = GenericDomain 6707} 6708 6709// FP round - roundss, roundps, roundsd, roundpd 6710let Predicates = [HasAVX] in { 6711 // Intrinsic form 6712 defm VROUND : sse41_fp_unop_rm<0x08, 0x09, "vround", f128mem, VR128, 6713 loadv4f32, loadv2f64, 6714 int_x86_sse41_round_ps, 6715 int_x86_sse41_round_pd>, VEX; 6716 defm VROUNDY : sse41_fp_unop_rm<0x08, 0x09, "vround", f256mem, VR256, 6717 loadv8f32, loadv4f64, 6718 int_x86_avx_round_ps_256, 6719 int_x86_avx_round_pd_256>, VEX, VEX_L; 6720 defm VROUND : sse41_fp_binop_rm<0x0A, 0x0B, "vround", 6721 int_x86_sse41_round_ss, 6722 int_x86_sse41_round_sd, 0>, VEX_4V, VEX_LIG; 6723 6724 def : Pat<(ffloor FR32:$src), 6725 (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x1))>; 6726 def : Pat<(f64 (ffloor FR64:$src)), 6727 (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x1))>; 6728 def : Pat<(f32 (fnearbyint FR32:$src)), 6729 (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>; 6730 def : Pat<(f64 (fnearbyint FR64:$src)), 6731 (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>; 6732 def : Pat<(f32 (fceil 
FR32:$src)), 6733 (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x2))>; 6734 def : Pat<(f64 (fceil FR64:$src)), 6735 (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x2))>; 6736 def : Pat<(f32 (frint FR32:$src)), 6737 (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>; 6738 def : Pat<(f64 (frint FR64:$src)), 6739 (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>; 6740 def : Pat<(f32 (ftrunc FR32:$src)), 6741 (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x3))>; 6742 def : Pat<(f64 (ftrunc FR64:$src)), 6743 (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x3))>; 6744 6745 def : Pat<(v4f32 (ffloor VR128:$src)), 6746 (VROUNDPSr VR128:$src, (i32 0x1))>; 6747 def : Pat<(v4f32 (fnearbyint VR128:$src)), 6748 (VROUNDPSr VR128:$src, (i32 0xC))>; 6749 def : Pat<(v4f32 (fceil VR128:$src)), 6750 (VROUNDPSr VR128:$src, (i32 0x2))>; 6751 def : Pat<(v4f32 (frint VR128:$src)), 6752 (VROUNDPSr VR128:$src, (i32 0x4))>; 6753 def : Pat<(v4f32 (ftrunc VR128:$src)), 6754 (VROUNDPSr VR128:$src, (i32 0x3))>; 6755 6756 def : Pat<(v2f64 (ffloor VR128:$src)), 6757 (VROUNDPDr VR128:$src, (i32 0x1))>; 6758 def : Pat<(v2f64 (fnearbyint VR128:$src)), 6759 (VROUNDPDr VR128:$src, (i32 0xC))>; 6760 def : Pat<(v2f64 (fceil VR128:$src)), 6761 (VROUNDPDr VR128:$src, (i32 0x2))>; 6762 def : Pat<(v2f64 (frint VR128:$src)), 6763 (VROUNDPDr VR128:$src, (i32 0x4))>; 6764 def : Pat<(v2f64 (ftrunc VR128:$src)), 6765 (VROUNDPDr VR128:$src, (i32 0x3))>; 6766 6767 def : Pat<(v8f32 (ffloor VR256:$src)), 6768 (VROUNDYPSr VR256:$src, (i32 0x1))>; 6769 def : Pat<(v8f32 (fnearbyint VR256:$src)), 6770 (VROUNDYPSr VR256:$src, (i32 0xC))>; 6771 def : Pat<(v8f32 (fceil VR256:$src)), 6772 (VROUNDYPSr VR256:$src, (i32 0x2))>; 6773 def : Pat<(v8f32 (frint VR256:$src)), 6774 (VROUNDYPSr VR256:$src, (i32 0x4))>; 6775 def : Pat<(v8f32 (ftrunc VR256:$src)), 6776 (VROUNDYPSr VR256:$src, (i32 0x3))>; 6777 6778 def : Pat<(v4f64 (ffloor VR256:$src)), 6779 (VROUNDYPDr VR256:$src, (i32 0x1))>; 6780 def : 
Pat<(v4f64 (fnearbyint VR256:$src)), 6781 (VROUNDYPDr VR256:$src, (i32 0xC))>; 6782 def : Pat<(v4f64 (fceil VR256:$src)), 6783 (VROUNDYPDr VR256:$src, (i32 0x2))>; 6784 def : Pat<(v4f64 (frint VR256:$src)), 6785 (VROUNDYPDr VR256:$src, (i32 0x4))>; 6786 def : Pat<(v4f64 (ftrunc VR256:$src)), 6787 (VROUNDYPDr VR256:$src, (i32 0x3))>; 6788} 6789 6790defm ROUND : sse41_fp_unop_rm<0x08, 0x09, "round", f128mem, VR128, 6791 memopv4f32, memopv2f64, 6792 int_x86_sse41_round_ps, int_x86_sse41_round_pd>; 6793let Constraints = "$src1 = $dst" in 6794defm ROUND : sse41_fp_binop_rm<0x0A, 0x0B, "round", 6795 int_x86_sse41_round_ss, int_x86_sse41_round_sd>; 6796 6797let Predicates = [UseSSE41] in { 6798 def : Pat<(ffloor FR32:$src), 6799 (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x1))>; 6800 def : Pat<(f64 (ffloor FR64:$src)), 6801 (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x1))>; 6802 def : Pat<(f32 (fnearbyint FR32:$src)), 6803 (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>; 6804 def : Pat<(f64 (fnearbyint FR64:$src)), 6805 (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>; 6806 def : Pat<(f32 (fceil FR32:$src)), 6807 (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x2))>; 6808 def : Pat<(f64 (fceil FR64:$src)), 6809 (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x2))>; 6810 def : Pat<(f32 (frint FR32:$src)), 6811 (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>; 6812 def : Pat<(f64 (frint FR64:$src)), 6813 (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>; 6814 def : Pat<(f32 (ftrunc FR32:$src)), 6815 (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x3))>; 6816 def : Pat<(f64 (ftrunc FR64:$src)), 6817 (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x3))>; 6818 6819 def : Pat<(v4f32 (ffloor VR128:$src)), 6820 (ROUNDPSr VR128:$src, (i32 0x1))>; 6821 def : Pat<(v4f32 (fnearbyint VR128:$src)), 6822 (ROUNDPSr VR128:$src, (i32 0xC))>; 6823 def : Pat<(v4f32 (fceil VR128:$src)), 6824 (ROUNDPSr VR128:$src, (i32 0x2))>; 6825 def : Pat<(v4f32 (frint 
VR128:$src)), 6826 (ROUNDPSr VR128:$src, (i32 0x4))>; 6827 def : Pat<(v4f32 (ftrunc VR128:$src)), 6828 (ROUNDPSr VR128:$src, (i32 0x3))>; 6829 6830 def : Pat<(v2f64 (ffloor VR128:$src)), 6831 (ROUNDPDr VR128:$src, (i32 0x1))>; 6832 def : Pat<(v2f64 (fnearbyint VR128:$src)), 6833 (ROUNDPDr VR128:$src, (i32 0xC))>; 6834 def : Pat<(v2f64 (fceil VR128:$src)), 6835 (ROUNDPDr VR128:$src, (i32 0x2))>; 6836 def : Pat<(v2f64 (frint VR128:$src)), 6837 (ROUNDPDr VR128:$src, (i32 0x4))>; 6838 def : Pat<(v2f64 (ftrunc VR128:$src)), 6839 (ROUNDPDr VR128:$src, (i32 0x3))>; 6840} 6841 6842//===----------------------------------------------------------------------===// 6843// SSE4.1 - Packed Bit Test 6844//===----------------------------------------------------------------------===// 6845 6846// ptest instruction we'll lower to this in X86ISelLowering primarily from 6847// the intel intrinsic that corresponds to this. 6848let Defs = [EFLAGS], Predicates = [HasAVX] in { 6849def VPTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2), 6850 "vptest\t{$src2, $src1|$src1, $src2}", 6851 [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>, 6852 Sched<[WriteVecLogic]>, VEX; 6853def VPTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2), 6854 "vptest\t{$src2, $src1|$src1, $src2}", 6855 [(set EFLAGS,(X86ptest VR128:$src1, (loadv2i64 addr:$src2)))]>, 6856 Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX; 6857 6858def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2), 6859 "vptest\t{$src2, $src1|$src1, $src2}", 6860 [(set EFLAGS, (X86ptest VR256:$src1, (v4i64 VR256:$src2)))]>, 6861 Sched<[WriteVecLogic]>, VEX, VEX_L; 6862def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2), 6863 "vptest\t{$src2, $src1|$src1, $src2}", 6864 [(set EFLAGS,(X86ptest VR256:$src1, (loadv4i64 addr:$src2)))]>, 6865 Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX, VEX_L; 6866} 6867 6868let Defs = [EFLAGS] in { 6869def PTESTrr : 
SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2), 6870 "ptest\t{$src2, $src1|$src1, $src2}", 6871 [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>, 6872 Sched<[WriteVecLogic]>; 6873def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2), 6874 "ptest\t{$src2, $src1|$src1, $src2}", 6875 [(set EFLAGS, (X86ptest VR128:$src1, (memopv2i64 addr:$src2)))]>, 6876 Sched<[WriteVecLogicLd, ReadAfterLd]>; 6877} 6878 6879// The bit test instructions below are AVX only 6880multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC, 6881 X86MemOperand x86memop, PatFrag mem_frag, ValueType vt> { 6882 def rr : SS48I<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2), 6883 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), 6884 [(set EFLAGS, (X86testp RC:$src1, (vt RC:$src2)))]>, 6885 Sched<[WriteVecLogic]>, VEX; 6886 def rm : SS48I<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2), 6887 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), 6888 [(set EFLAGS, (X86testp RC:$src1, (mem_frag addr:$src2)))]>, 6889 Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX; 6890} 6891 6892let Defs = [EFLAGS], Predicates = [HasAVX] in { 6893let ExeDomain = SSEPackedSingle in { 6894defm VTESTPS : avx_bittest<0x0E, "vtestps", VR128, f128mem, loadv4f32, v4f32>; 6895defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, loadv8f32, v8f32>, 6896 VEX_L; 6897} 6898let ExeDomain = SSEPackedDouble in { 6899defm VTESTPD : avx_bittest<0x0F, "vtestpd", VR128, f128mem, loadv2f64, v2f64>; 6900defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, loadv4f64, v4f64>, 6901 VEX_L; 6902} 6903} 6904 6905//===----------------------------------------------------------------------===// 6906// SSE4.1 - Misc Instructions 6907//===----------------------------------------------------------------------===// 6908 6909let Defs = [EFLAGS], Predicates = [HasPOPCNT] in { 6910 def POPCNT16rr : I<0xB8, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), 6911 
"popcnt{w}\t{$src, $dst|$dst, $src}", 6912 [(set GR16:$dst, (ctpop GR16:$src)), (implicit EFLAGS)], 6913 IIC_SSE_POPCNT_RR>, Sched<[WriteFAdd]>, 6914 OpSize16, XS; 6915 def POPCNT16rm : I<0xB8, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), 6916 "popcnt{w}\t{$src, $dst|$dst, $src}", 6917 [(set GR16:$dst, (ctpop (loadi16 addr:$src))), 6918 (implicit EFLAGS)], IIC_SSE_POPCNT_RM>, 6919 Sched<[WriteFAddLd]>, OpSize16, XS; 6920 6921 def POPCNT32rr : I<0xB8, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), 6922 "popcnt{l}\t{$src, $dst|$dst, $src}", 6923 [(set GR32:$dst, (ctpop GR32:$src)), (implicit EFLAGS)], 6924 IIC_SSE_POPCNT_RR>, Sched<[WriteFAdd]>, 6925 OpSize32, XS; 6926 6927 def POPCNT32rm : I<0xB8, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), 6928 "popcnt{l}\t{$src, $dst|$dst, $src}", 6929 [(set GR32:$dst, (ctpop (loadi32 addr:$src))), 6930 (implicit EFLAGS)], IIC_SSE_POPCNT_RM>, 6931 Sched<[WriteFAddLd]>, OpSize32, XS; 6932 6933 def POPCNT64rr : RI<0xB8, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), 6934 "popcnt{q}\t{$src, $dst|$dst, $src}", 6935 [(set GR64:$dst, (ctpop GR64:$src)), (implicit EFLAGS)], 6936 IIC_SSE_POPCNT_RR>, Sched<[WriteFAdd]>, XS; 6937 def POPCNT64rm : RI<0xB8, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), 6938 "popcnt{q}\t{$src, $dst|$dst, $src}", 6939 [(set GR64:$dst, (ctpop (loadi64 addr:$src))), 6940 (implicit EFLAGS)], IIC_SSE_POPCNT_RM>, 6941 Sched<[WriteFAddLd]>, XS; 6942} 6943 6944 6945 6946// SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16. 
multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
                                 Intrinsic IntId128,
                                 X86FoldableSchedWrite Sched> {
  def rr128 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
                    (ins VR128:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (IntId128 VR128:$src))]>,
                    Sched<[Sched]>;
  def rm128 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
                    (ins i128mem:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst,
                      (IntId128 (bitconvert (memopv2i64 addr:$src))))]>,
                    Sched<[Sched.Folded]>;
}

// PHMIN has the same profile as PSAD, thus we use the same scheduling
// model, although the naming is misleading.
let Predicates = [HasAVX] in
defm VPHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "vphminposuw",
                                         int_x86_sse41_phminposuw,
                                         WriteVecIMul>, VEX;
defm PHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "phminposuw",
                                        int_x86_sse41_phminposuw,
                                        WriteVecIMul>;

/// SS41I_binop_rm_int - Simple SSE 4.1 binary operator
multiclass SS41I_binop_rm_int<bits<8> opc, string OpcodeStr,
                              Intrinsic IntId128, bit Is2Addr = 1,
                              OpndItins itins = DEFAULT_ITINS> {
  let isCommutable = 1 in
  def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
       (ins VR128:$src1, VR128:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))],
       itins.rr>, Sched<[itins.Sched]>;
  def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
       (ins VR128:$src1, i128mem:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set VR128:$dst,
         (IntId128 VR128:$src1, (bitconvert (memopv2i64 addr:$src2))))],
       itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
}

/// SS41I_binop_rm_int_y - Simple SSE 4.1 binary operator, 256-bit AVX form.
multiclass SS41I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
                                Intrinsic IntId256,
                                X86FoldableSchedWrite Sched> {
  let isCommutable = 1 in
  def Yrr : SS48I<opc, MRMSrcReg, (outs VR256:$dst),
       (ins VR256:$src1, VR256:$src2),
       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
       [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>,
       Sched<[Sched]>;
  def Yrm : SS48I<opc, MRMSrcMem, (outs VR256:$dst),
       (ins VR256:$src1, i256mem:$src2),
       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
       [(set VR256:$dst,
         (IntId256 VR256:$src1, (bitconvert (loadv4i64 addr:$src2))))]>,
       Sched<[Sched.Folded, ReadAfterLd]>;
}

/// SS48I_binop_rm - Simple SSE41 binary operator.
multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                          X86MemOperand x86memop, bit Is2Addr = 1,
                          OpndItins itins = SSE_INTALU_ITINS_P> {
  let isCommutable = 1 in
  def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
       Sched<[itins.Sched]>;
  def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst,
         (OpVT (OpNode RC:$src1, (bitconvert (memop_frag addr:$src2)))))]>,
       Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
/// SS48I_binop_rm2 - Simple SSE41 binary operator with different src and dst
/// types.
multiclass SS48I_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           ValueType DstVT, ValueType SrcVT, RegisterClass RC,
                           PatFrag memop_frag, X86MemOperand x86memop,
                           OpndItins itins,
                           bit IsCommutable = 0, bit Is2Addr = 1> {
  let isCommutable = IsCommutable in
  def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>,
       Sched<[itins.Sched]>;
  def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1),
                                     (bitconvert (memop_frag addr:$src2)))))]>,
       Sched<[itins.Sched.Folded, ReadAfterLd]>;
}

// 128-bit VEX-encoded forms. These use the loadv* (unaligned-load) PatFrags,
// since VEX-encoded instructions do not require aligned memory operands.
let Predicates = [HasAVX, NoVLX] in {
  let isCommutable = 0 in
  defm VPMINSB : SS48I_binop_rm<0x38, "vpminsb", X86smin, v16i8, VR128,
                                loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                VEX_4V;
  defm VPMINSD : SS48I_binop_rm<0x39, "vpminsd", X86smin, v4i32, VR128,
                                loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                VEX_4V;
  defm VPMINUD : SS48I_binop_rm<0x3B, "vpminud", X86umin, v4i32, VR128,
                                loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                VEX_4V;
  defm VPMINUW : SS48I_binop_rm<0x3A, "vpminuw", X86umin, v8i16, VR128,
                                loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                VEX_4V;
  defm VPMAXSB : SS48I_binop_rm<0x3C, "vpmaxsb", X86smax, v16i8, VR128,
                                loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                VEX_4V;
  defm VPMAXSD : SS48I_binop_rm<0x3D, "vpmaxsd", X86smax, v4i32, VR128,
                                loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                VEX_4V;
  defm VPMAXUD : SS48I_binop_rm<0x3F, "vpmaxud", X86umax, v4i32, VR128,
                                loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                VEX_4V;
  defm VPMAXUW : SS48I_binop_rm<0x3E, "vpmaxuw", X86umax, v8i16, VR128,
                                loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                VEX_4V;
  defm VPMULDQ : SS48I_binop_rm2<0x28, "vpmuldq", X86pmuldq, v2i64, v4i32,
                                 VR128, loadv2i64, i128mem,
                                 SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V;
}

// 256-bit AVX2 forms.
let Predicates = [HasAVX2, NoVLX] in {
  let isCommutable = 0 in
  defm VPMINSBY : SS48I_binop_rm<0x38, "vpminsb", X86smin, v32i8, VR256,
                                 loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                 VEX_4V, VEX_L;
  defm VPMINSDY : SS48I_binop_rm<0x39, "vpminsd", X86smin, v8i32, VR256,
                                 loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                 VEX_4V, VEX_L;
  defm VPMINUDY : SS48I_binop_rm<0x3B, "vpminud", X86umin, v8i32, VR256,
                                 loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                 VEX_4V, VEX_L;
  defm VPMINUWY : SS48I_binop_rm<0x3A, "vpminuw", X86umin, v16i16, VR256,
                                 loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                 VEX_4V, VEX_L;
  defm VPMAXSBY : SS48I_binop_rm<0x3C, "vpmaxsb", X86smax, v32i8, VR256,
                                 loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                 VEX_4V, VEX_L;
  defm VPMAXSDY : SS48I_binop_rm<0x3D, "vpmaxsd", X86smax, v8i32, VR256,
                                 loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                 VEX_4V, VEX_L;
  defm VPMAXUDY : SS48I_binop_rm<0x3F, "vpmaxud", X86umax, v8i32, VR256,
                                 loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                 VEX_4V, VEX_L;
  defm VPMAXUWY : SS48I_binop_rm<0x3E, "vpmaxuw", X86umax, v16i16, VR256,
                                 loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                 VEX_4V, VEX_L;
  defm VPMULDQY : SS48I_binop_rm2<0x28, "vpmuldq", X86pmuldq, v4i64, v8i32,
                                  VR256, loadv4i64, i256mem,
                                  SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L;
}

// Legacy SSE4.1 forms (two-address). These keep the alignment-requiring
// memopv2i64 PatFrag, as non-VEX SSE memory operands must be aligned.
let Constraints = "$src1 = $dst" in {
  let isCommutable = 0 in
  defm PMINSB : SS48I_binop_rm<0x38, "pminsb", X86smin, v16i8, VR128,
                               memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
  defm PMINSD : SS48I_binop_rm<0x39, "pminsd", X86smin, v4i32, VR128,
                               memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
  defm PMINUD : SS48I_binop_rm<0x3B, "pminud", X86umin, v4i32, VR128,
                               memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
  defm PMINUW : SS48I_binop_rm<0x3A, "pminuw", X86umin, v8i16, VR128,
                               memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
  defm PMAXSB : SS48I_binop_rm<0x3C, "pmaxsb", X86smax, v16i8, VR128,
                               memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
  defm PMAXSD : SS48I_binop_rm<0x3D, "pmaxsd", X86smax, v4i32, VR128,
                               memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
  defm PMAXUD : SS48I_binop_rm<0x3F, "pmaxud", X86umax, v4i32, VR128,
                               memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
  defm PMAXUW : SS48I_binop_rm<0x3E, "pmaxuw", X86umax, v8i16, VR128,
                               memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
  defm PMULDQ : SS48I_binop_rm2<0x28, "pmuldq", X86pmuldq, v2i64, v4i32,
                                VR128, memopv2i64, i128mem,
                                SSE_INTMUL_ITINS_P, 1>;
}

// VEX forms of PMULLD/PCMPEQQ. Fixed to use loadv2i64/loadv4i64 instead of
// memopv2i64/memopv4i64: VEX encoding permits unaligned memory operands, and
// this matches the VPMIN*/VPMAX*/VPMULDQ definitions above.
let Predicates = [HasAVX, NoVLX] in {
  defm VPMULLD  : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128,
                                 loadv2i64, i128mem, 0, SSE_PMULLD_ITINS>,
                                 VEX_4V;
  defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128,
                                 loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                 VEX_4V;
}
let Predicates = [HasAVX2] in {
  defm VPMULLDY  : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256,
                                  loadv4i64, i256mem, 0, SSE_PMULLD_ITINS>,
                                  VEX_4V, VEX_L;
  defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256,
                                  loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V, VEX_L;
}

let Constraints = "$src1 = $dst" in {
  defm PMULLD  : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, VR128,
                                memopv2i64, i128mem, 1, SSE_PMULLD_ITINS>;
  // NOTE(review): PCMPEQQ uses SSE_INTALUQ_ITINS_P while VPCMPEQQ above uses
  // SSE_INTALU_ITINS_P -- confirm which itinerary is intended for both.
  defm PCMPEQQ : SS48I_binop_rm<0x29, "pcmpeqq", X86pcmpeq, v2i64, VR128,
                                memopv2i64, i128mem, 1, SSE_INTALUQ_ITINS_P>;
}
immediate 7172multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr, 7173 Intrinsic IntId, RegisterClass RC, PatFrag memop_frag, 7174 X86MemOperand x86memop, bit Is2Addr = 1, 7175 OpndItins itins = DEFAULT_ITINS> { 7176 let isCommutable = 1 in 7177 def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst), 7178 (ins RC:$src1, RC:$src2, i8imm:$src3), 7179 !if(Is2Addr, 7180 !strconcat(OpcodeStr, 7181 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 7182 !strconcat(OpcodeStr, 7183 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 7184 [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))], itins.rr>, 7185 Sched<[itins.Sched]>; 7186 def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst), 7187 (ins RC:$src1, x86memop:$src2, i8imm:$src3), 7188 !if(Is2Addr, 7189 !strconcat(OpcodeStr, 7190 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 7191 !strconcat(OpcodeStr, 7192 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 7193 [(set RC:$dst, 7194 (IntId RC:$src1, 7195 (bitconvert (memop_frag addr:$src2)), imm:$src3))], itins.rm>, 7196 Sched<[itins.Sched.Folded, ReadAfterLd]>; 7197} 7198 7199let Predicates = [HasAVX] in { 7200 let isCommutable = 0 in { 7201 defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw, 7202 VR128, loadv2i64, i128mem, 0, 7203 DEFAULT_ITINS_MPSADSCHED>, VEX_4V; 7204 } 7205 7206 let ExeDomain = SSEPackedSingle in { 7207 defm VBLENDPS : SS41I_binop_rmi_int<0x0C, "vblendps", int_x86_sse41_blendps, 7208 VR128, loadv4f32, f128mem, 0, 7209 DEFAULT_ITINS_FBLENDSCHED>, VEX_4V; 7210 defm VBLENDPSY : SS41I_binop_rmi_int<0x0C, "vblendps", 7211 int_x86_avx_blend_ps_256, VR256, loadv8f32, 7212 f256mem, 0, DEFAULT_ITINS_FBLENDSCHED>, 7213 VEX_4V, VEX_L; 7214 } 7215 let ExeDomain = SSEPackedDouble in { 7216 defm VBLENDPD : SS41I_binop_rmi_int<0x0D, "vblendpd", int_x86_sse41_blendpd, 7217 VR128, loadv2f64, f128mem, 0, 7218 DEFAULT_ITINS_FBLENDSCHED>, VEX_4V; 7219 defm VBLENDPDY : SS41I_binop_rmi_int<0x0D, "vblendpd", 7220 
int_x86_avx_blend_pd_256,VR256, loadv4f64, 7221 f256mem, 0, DEFAULT_ITINS_FBLENDSCHED>, 7222 VEX_4V, VEX_L; 7223 } 7224 defm VPBLENDW : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_sse41_pblendw, 7225 VR128, loadv2i64, i128mem, 0, 7226 DEFAULT_ITINS_BLENDSCHED>, VEX_4V; 7227 7228 let ExeDomain = SSEPackedSingle in 7229 defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps, 7230 VR128, loadv4f32, f128mem, 0, 7231 SSE_DPPS_ITINS>, VEX_4V; 7232 let ExeDomain = SSEPackedDouble in 7233 defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd, 7234 VR128, loadv2f64, f128mem, 0, 7235 SSE_DPPS_ITINS>, VEX_4V; 7236 let ExeDomain = SSEPackedSingle in 7237 defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256, 7238 VR256, loadv8f32, i256mem, 0, 7239 SSE_DPPS_ITINS>, VEX_4V, VEX_L; 7240} 7241 7242let Predicates = [HasAVX2] in { 7243 let isCommutable = 0 in { 7244 defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw, 7245 VR256, loadv4i64, i256mem, 0, 7246 DEFAULT_ITINS_MPSADSCHED>, VEX_4V, VEX_L; 7247 } 7248 defm VPBLENDWY : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_avx2_pblendw, 7249 VR256, loadv4i64, i256mem, 0, 7250 DEFAULT_ITINS_BLENDSCHED>, VEX_4V, VEX_L; 7251} 7252 7253let Constraints = "$src1 = $dst" in { 7254 let isCommutable = 0 in { 7255 defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw, 7256 VR128, memopv2i64, i128mem, 7257 1, SSE_MPSADBW_ITINS>; 7258 } 7259 let ExeDomain = SSEPackedSingle in 7260 defm BLENDPS : SS41I_binop_rmi_int<0x0C, "blendps", int_x86_sse41_blendps, 7261 VR128, memopv4f32, f128mem, 7262 1, SSE_INTALU_ITINS_FBLEND_P>; 7263 let ExeDomain = SSEPackedDouble in 7264 defm BLENDPD : SS41I_binop_rmi_int<0x0D, "blendpd", int_x86_sse41_blendpd, 7265 VR128, memopv2f64, f128mem, 7266 1, SSE_INTALU_ITINS_FBLEND_P>; 7267 defm PBLENDW : SS41I_binop_rmi_int<0x0E, "pblendw", int_x86_sse41_pblendw, 7268 VR128, memopv2i64, i128mem, 7269 1, SSE_INTALU_ITINS_BLEND_P>; 7270 
let ExeDomain = SSEPackedSingle in 7271 defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps, 7272 VR128, memopv4f32, f128mem, 1, 7273 SSE_DPPS_ITINS>; 7274 let ExeDomain = SSEPackedDouble in 7275 defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd, 7276 VR128, memopv2f64, f128mem, 1, 7277 SSE_DPPD_ITINS>; 7278} 7279 7280/// SS41I_quaternary_int_avx - AVX SSE 4.1 with 4 operators 7281multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr, 7282 RegisterClass RC, X86MemOperand x86memop, 7283 PatFrag mem_frag, Intrinsic IntId, 7284 X86FoldableSchedWrite Sched> { 7285 def rr : Ii8<opc, MRMSrcReg, (outs RC:$dst), 7286 (ins RC:$src1, RC:$src2, RC:$src3), 7287 !strconcat(OpcodeStr, 7288 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), 7289 [(set RC:$dst, (IntId RC:$src1, RC:$src2, RC:$src3))], 7290 NoItinerary, SSEPackedInt>, TAPD, VEX_4V, VEX_I8IMM, 7291 Sched<[Sched]>; 7292 7293 def rm : Ii8<opc, MRMSrcMem, (outs RC:$dst), 7294 (ins RC:$src1, x86memop:$src2, RC:$src3), 7295 !strconcat(OpcodeStr, 7296 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), 7297 [(set RC:$dst, 7298 (IntId RC:$src1, (bitconvert (mem_frag addr:$src2)), 7299 RC:$src3))], 7300 NoItinerary, SSEPackedInt>, TAPD, VEX_4V, VEX_I8IMM, 7301 Sched<[Sched.Folded, ReadAfterLd]>; 7302} 7303 7304let Predicates = [HasAVX] in { 7305let ExeDomain = SSEPackedDouble in { 7306defm VBLENDVPD : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR128, f128mem, 7307 loadv2f64, int_x86_sse41_blendvpd, 7308 WriteFVarBlend>; 7309defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, f256mem, 7310 loadv4f64, int_x86_avx_blendv_pd_256, 7311 WriteFVarBlend>, VEX_L; 7312} // ExeDomain = SSEPackedDouble 7313let ExeDomain = SSEPackedSingle in { 7314defm VBLENDVPS : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR128, f128mem, 7315 loadv4f32, int_x86_sse41_blendvps, 7316 WriteFVarBlend>; 7317defm VBLENDVPSY : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR256, f256mem, 
7318 loadv8f32, int_x86_avx_blendv_ps_256, 7319 WriteFVarBlend>, VEX_L; 7320} // ExeDomain = SSEPackedSingle 7321defm VPBLENDVB : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR128, i128mem, 7322 loadv2i64, int_x86_sse41_pblendvb, 7323 WriteVarBlend>; 7324} 7325 7326let Predicates = [HasAVX2] in { 7327defm VPBLENDVBY : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR256, i256mem, 7328 loadv4i64, int_x86_avx2_pblendvb, 7329 WriteVarBlend>, VEX_L; 7330} 7331 7332let Predicates = [HasAVX] in { 7333 def : Pat<(v16i8 (vselect (v16i8 VR128:$mask), (v16i8 VR128:$src1), 7334 (v16i8 VR128:$src2))), 7335 (VPBLENDVBrr VR128:$src2, VR128:$src1, VR128:$mask)>; 7336 def : Pat<(v4i32 (vselect (v4i32 VR128:$mask), (v4i32 VR128:$src1), 7337 (v4i32 VR128:$src2))), 7338 (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>; 7339 def : Pat<(v4f32 (vselect (v4i32 VR128:$mask), (v4f32 VR128:$src1), 7340 (v4f32 VR128:$src2))), 7341 (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>; 7342 def : Pat<(v2i64 (vselect (v2i64 VR128:$mask), (v2i64 VR128:$src1), 7343 (v2i64 VR128:$src2))), 7344 (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>; 7345 def : Pat<(v2f64 (vselect (v2i64 VR128:$mask), (v2f64 VR128:$src1), 7346 (v2f64 VR128:$src2))), 7347 (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>; 7348 def : Pat<(v8i32 (vselect (v8i32 VR256:$mask), (v8i32 VR256:$src1), 7349 (v8i32 VR256:$src2))), 7350 (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>; 7351 def : Pat<(v8f32 (vselect (v8i32 VR256:$mask), (v8f32 VR256:$src1), 7352 (v8f32 VR256:$src2))), 7353 (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>; 7354 def : Pat<(v4i64 (vselect (v4i64 VR256:$mask), (v4i64 VR256:$src1), 7355 (v4i64 VR256:$src2))), 7356 (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>; 7357 def : Pat<(v4f64 (vselect (v4i64 VR256:$mask), (v4f64 VR256:$src1), 7358 (v4f64 VR256:$src2))), 7359 (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>; 7360 7361 def : Pat<(v8f32 (X86Blendi (v8f32 VR256:$src1), 
(v8f32 VR256:$src2), 7362 (imm:$mask))), 7363 (VBLENDPSYrri VR256:$src1, VR256:$src2, imm:$mask)>; 7364 def : Pat<(v4f64 (X86Blendi (v4f64 VR256:$src1), (v4f64 VR256:$src2), 7365 (imm:$mask))), 7366 (VBLENDPDYrri VR256:$src1, VR256:$src2, imm:$mask)>; 7367 7368 def : Pat<(v8i16 (X86Blendi (v8i16 VR128:$src1), (v8i16 VR128:$src2), 7369 (imm:$mask))), 7370 (VPBLENDWrri VR128:$src1, VR128:$src2, imm:$mask)>; 7371 def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$src1), (v4f32 VR128:$src2), 7372 (imm:$mask))), 7373 (VBLENDPSrri VR128:$src1, VR128:$src2, imm:$mask)>; 7374 def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$src1), (v2f64 VR128:$src2), 7375 (imm:$mask))), 7376 (VBLENDPDrri VR128:$src1, VR128:$src2, imm:$mask)>; 7377} 7378 7379let Predicates = [HasAVX2] in { 7380 def : Pat<(v32i8 (vselect (v32i8 VR256:$mask), (v32i8 VR256:$src1), 7381 (v32i8 VR256:$src2))), 7382 (VPBLENDVBYrr VR256:$src2, VR256:$src1, VR256:$mask)>; 7383 def : Pat<(v16i16 (X86Blendi (v16i16 VR256:$src1), (v16i16 VR256:$src2), 7384 (imm:$mask))), 7385 (VPBLENDWYrri VR256:$src1, VR256:$src2, imm:$mask)>; 7386} 7387 7388// Patterns 7389let Predicates = [UseAVX] in { 7390 let AddedComplexity = 15 in { 7391 // Move scalar to XMM zero-extended, zeroing a VR128 then do a 7392 // MOVS{S,D} to the lower bits. 7393 def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))), 7394 (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)>; 7395 def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))), 7396 (VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>; 7397 def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))), 7398 (VPBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>; 7399 def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))), 7400 (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)>; 7401 7402 // Move low f32 and clear high bits. 
7403 def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))), 7404 (VBLENDPSYrri (v8f32 (AVX_SET0)), VR256:$src, (i8 1))>; 7405 def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))), 7406 (VBLENDPSYrri (v8i32 (AVX_SET0)), VR256:$src, (i8 1))>; 7407 } 7408 7409 def : Pat<(v8f32 (X86vzmovl (insert_subvector undef, 7410 (v4f32 (scalar_to_vector FR32:$src)), (iPTR 0)))), 7411 (SUBREG_TO_REG (i32 0), 7412 (v4f32 (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)), 7413 sub_xmm)>; 7414 def : Pat<(v4f64 (X86vzmovl (insert_subvector undef, 7415 (v2f64 (scalar_to_vector FR64:$src)), (iPTR 0)))), 7416 (SUBREG_TO_REG (i64 0), 7417 (v2f64 (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)), 7418 sub_xmm)>; 7419 7420 // Move low f64 and clear high bits. 7421 def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))), 7422 (VBLENDPDYrri (v4f64 (AVX_SET0)), VR256:$src, (i8 1))>; 7423 7424 def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))), 7425 (VBLENDPDYrri (v4i64 (AVX_SET0)), VR256:$src, (i8 1))>; 7426} 7427 7428let Predicates = [UseSSE41] in { 7429 // With SSE41 we can use blends for these patterns. 
7430 def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))), 7431 (BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>; 7432 def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))), 7433 (PBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>; 7434 def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))), 7435 (BLENDPDrri (v2f64 (V_SET0)), VR128:$src, (i8 1))>; 7436} 7437 7438 7439/// SS41I_ternary_int - SSE 4.1 ternary operator 7440let Uses = [XMM0], Constraints = "$src1 = $dst" in { 7441 multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, PatFrag mem_frag, 7442 X86MemOperand x86memop, Intrinsic IntId, 7443 OpndItins itins = DEFAULT_ITINS> { 7444 def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst), 7445 (ins VR128:$src1, VR128:$src2), 7446 !strconcat(OpcodeStr, 7447 "\t{$src2, $dst|$dst, $src2}"), 7448 [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0))], 7449 itins.rr>, Sched<[itins.Sched]>; 7450 7451 def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst), 7452 (ins VR128:$src1, x86memop:$src2), 7453 !strconcat(OpcodeStr, 7454 "\t{$src2, $dst|$dst, $src2}"), 7455 [(set VR128:$dst, 7456 (IntId VR128:$src1, 7457 (bitconvert (mem_frag addr:$src2)), XMM0))], 7458 itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>; 7459 } 7460} 7461 7462let ExeDomain = SSEPackedDouble in 7463defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", memopv2f64, f128mem, 7464 int_x86_sse41_blendvpd, 7465 DEFAULT_ITINS_FBLENDSCHED>; 7466let ExeDomain = SSEPackedSingle in 7467defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", memopv4f32, f128mem, 7468 int_x86_sse41_blendvps, 7469 DEFAULT_ITINS_FBLENDSCHED>; 7470defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memopv2i64, i128mem, 7471 int_x86_sse41_pblendvb, 7472 DEFAULT_ITINS_VARBLENDSCHED>; 7473 7474// Aliases with the implicit xmm0 argument 7475def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}", 7476 (BLENDVPDrr0 VR128:$dst, VR128:$src2)>; 7477def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}", 7478 (BLENDVPDrm0 VR128:$dst, 
f128mem:$src2)>; 7479def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}", 7480 (BLENDVPSrr0 VR128:$dst, VR128:$src2)>; 7481def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}", 7482 (BLENDVPSrm0 VR128:$dst, f128mem:$src2)>; 7483def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}", 7484 (PBLENDVBrr0 VR128:$dst, VR128:$src2)>; 7485def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}", 7486 (PBLENDVBrm0 VR128:$dst, i128mem:$src2)>; 7487 7488let Predicates = [UseSSE41] in { 7489 def : Pat<(v16i8 (vselect (v16i8 XMM0), (v16i8 VR128:$src1), 7490 (v16i8 VR128:$src2))), 7491 (PBLENDVBrr0 VR128:$src2, VR128:$src1)>; 7492 def : Pat<(v4i32 (vselect (v4i32 XMM0), (v4i32 VR128:$src1), 7493 (v4i32 VR128:$src2))), 7494 (BLENDVPSrr0 VR128:$src2, VR128:$src1)>; 7495 def : Pat<(v4f32 (vselect (v4i32 XMM0), (v4f32 VR128:$src1), 7496 (v4f32 VR128:$src2))), 7497 (BLENDVPSrr0 VR128:$src2, VR128:$src1)>; 7498 def : Pat<(v2i64 (vselect (v2i64 XMM0), (v2i64 VR128:$src1), 7499 (v2i64 VR128:$src2))), 7500 (BLENDVPDrr0 VR128:$src2, VR128:$src1)>; 7501 def : Pat<(v2f64 (vselect (v2i64 XMM0), (v2f64 VR128:$src1), 7502 (v2f64 VR128:$src2))), 7503 (BLENDVPDrr0 VR128:$src2, VR128:$src1)>; 7504 7505 def : Pat<(v8i16 (X86Blendi (v8i16 VR128:$src1), (v8i16 VR128:$src2), 7506 (imm:$mask))), 7507 (PBLENDWrri VR128:$src1, VR128:$src2, imm:$mask)>; 7508 def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$src1), (v4f32 VR128:$src2), 7509 (imm:$mask))), 7510 (BLENDPSrri VR128:$src1, VR128:$src2, imm:$mask)>; 7511 def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$src1), (v2f64 VR128:$src2), 7512 (imm:$mask))), 7513 (BLENDPDrri VR128:$src1, VR128:$src2, imm:$mask)>; 7514 7515} 7516 7517let SchedRW = [WriteLoad] in { 7518let Predicates = [HasAVX] in 7519def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), 7520 "vmovntdqa\t{$src, $dst|$dst, $src}", 7521 [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>, 7522 VEX; 7523let Predicates = 
[HasAVX2] in 7524def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), 7525 "vmovntdqa\t{$src, $dst|$dst, $src}", 7526 [(set VR256:$dst, (int_x86_avx2_movntdqa addr:$src))]>, 7527 VEX, VEX_L; 7528def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), 7529 "movntdqa\t{$src, $dst|$dst, $src}", 7530 [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>; 7531} // SchedRW 7532 7533//===----------------------------------------------------------------------===// 7534// SSE4.2 - Compare Instructions 7535//===----------------------------------------------------------------------===// 7536 7537/// SS42I_binop_rm - Simple SSE 4.2 binary operator 7538multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, 7539 ValueType OpVT, RegisterClass RC, PatFrag memop_frag, 7540 X86MemOperand x86memop, bit Is2Addr = 1> { 7541 def rr : SS428I<opc, MRMSrcReg, (outs RC:$dst), 7542 (ins RC:$src1, RC:$src2), 7543 !if(Is2Addr, 7544 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 7545 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 7546 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>; 7547 def rm : SS428I<opc, MRMSrcMem, (outs RC:$dst), 7548 (ins RC:$src1, x86memop:$src2), 7549 !if(Is2Addr, 7550 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 7551 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 7552 [(set RC:$dst, 7553 (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>; 7554} 7555 7556let Predicates = [HasAVX] in 7557 defm VPCMPGTQ : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128, 7558 loadv2i64, i128mem, 0>, VEX_4V; 7559 7560let Predicates = [HasAVX2] in 7561 defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256, 7562 loadv4i64, i256mem, 0>, VEX_4V, VEX_L; 7563 7564let Constraints = "$src1 = $dst" in 7565 defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128, 7566 memopv2i64, i128mem>; 7567 
//===----------------------------------------------------------------------===//
// SSE4.2 - String/text Processing Instructions
//===----------------------------------------------------------------------===//

// Packed Compare Implicit Length Strings, Return Mask
// These pseudos carry the intrinsic pattern; they are expanded by the custom
// inserter (usesCustomInserter = 1 below) into the real XMM0-defining forms.
multiclass pseudo_pcmpistrm<string asm> {
  def REG : PseudoI<(outs VR128:$dst),
                    (ins VR128:$src1, VR128:$src2, i8imm:$src3),
    [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1, VR128:$src2,
                                                  imm:$src3))]>;
  def MEM : PseudoI<(outs VR128:$dst),
                    (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
    [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1,
                       (bc_v16i8 (memopv2i64 addr:$src2)), imm:$src3))]>;
}

let Defs = [EFLAGS], usesCustomInserter = 1 in {
  defm VPCMPISTRM128 : pseudo_pcmpistrm<"#VPCMPISTRM128">, Requires<[HasAVX]>;
  defm PCMPISTRM128 : pseudo_pcmpistrm<"#PCMPISTRM128">, Requires<[UseSSE42]>;
}

// Real (pattern-less) encodings; the result register XMM0 and EFLAGS are
// implicit defs (see the enclosing let below).
multiclass pcmpistrm_SS42AI<string asm> {
  def rr : SS42AI<0x62, MRMSrcReg, (outs),
            (ins VR128:$src1, VR128:$src2, i8imm:$src3),
            !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
            []>, Sched<[WritePCmpIStrM]>;
  let mayLoad = 1 in
  def rm : SS42AI<0x62, MRMSrcMem, (outs),
            (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
            !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
            []>, Sched<[WritePCmpIStrMLd, ReadAfterLd]>;
}

let Defs = [XMM0, EFLAGS], hasSideEffects = 0 in {
  let Predicates = [HasAVX] in
  defm VPCMPISTRM128 : pcmpistrm_SS42AI<"vpcmpistrm">, VEX;
  defm PCMPISTRM128 : pcmpistrm_SS42AI<"pcmpistrm">;
}

// Packed Compare Explicit Length Strings, Return Mask
// Explicit-length forms additionally read the string lengths from EAX/EDX.
multiclass pseudo_pcmpestrm<string asm> {
  def REG : PseudoI<(outs VR128:$dst),
                    (ins VR128:$src1, VR128:$src3, i8imm:$src5),
    [(set VR128:$dst, (int_x86_sse42_pcmpestrm128
                       VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>;
  def MEM : PseudoI<(outs VR128:$dst),
                    (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
    [(set VR128:$dst, (int_x86_sse42_pcmpestrm128 VR128:$src1, EAX,
                       (bc_v16i8 (memopv2i64 addr:$src3)), EDX, imm:$src5))]>;
}

let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in {
  defm VPCMPESTRM128 : pseudo_pcmpestrm<"#VPCMPESTRM128">, Requires<[HasAVX]>;
  defm PCMPESTRM128 : pseudo_pcmpestrm<"#PCMPESTRM128">, Requires<[UseSSE42]>;
}

multiclass SS42AI_pcmpestrm<string asm> {
  def rr : SS42AI<0x60, MRMSrcReg, (outs),
            (ins VR128:$src1, VR128:$src3, i8imm:$src5),
            !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
            []>, Sched<[WritePCmpEStrM]>;
  let mayLoad = 1 in
  def rm : SS42AI<0x60, MRMSrcMem, (outs),
            (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
            !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
            []>, Sched<[WritePCmpEStrMLd, ReadAfterLd]>;
}

let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
  let Predicates = [HasAVX] in
  defm VPCMPESTRM128 : SS42AI_pcmpestrm<"vpcmpestrm">, VEX;
  defm PCMPESTRM128 : SS42AI_pcmpestrm<"pcmpestrm">;
}

// Packed Compare Implicit Length Strings, Return Index
multiclass pseudo_pcmpistri<string asm> {
  def REG : PseudoI<(outs GR32:$dst),
                    (ins VR128:$src1, VR128:$src2, i8imm:$src3),
    [(set GR32:$dst, EFLAGS,
      (X86pcmpistri VR128:$src1, VR128:$src2, imm:$src3))]>;
  def MEM : PseudoI<(outs GR32:$dst),
                    (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
    [(set GR32:$dst, EFLAGS, (X86pcmpistri VR128:$src1,
      (bc_v16i8 (memopv2i64 addr:$src2)), imm:$src3))]>;
}

let Defs = [EFLAGS], usesCustomInserter = 1 in {
  defm VPCMPISTRI : pseudo_pcmpistri<"#VPCMPISTRI">, Requires<[HasAVX]>;
  defm PCMPISTRI : pseudo_pcmpistri<"#PCMPISTRI">, Requires<[UseSSE42]>;
}

multiclass SS42AI_pcmpistri<string asm> {
  def rr : SS42AI<0x63, MRMSrcReg, (outs),
            (ins VR128:$src1, VR128:$src2, i8imm:$src3),
            !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
            []>, Sched<[WritePCmpIStrI]>;
  let mayLoad = 1 in
  def rm : SS42AI<0x63, MRMSrcMem, (outs),
            (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
            !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
            []>, Sched<[WritePCmpIStrILd, ReadAfterLd]>;
}

let Defs = [ECX, EFLAGS], hasSideEffects = 0 in {
  let Predicates = [HasAVX] in
  defm VPCMPISTRI : SS42AI_pcmpistri<"vpcmpistri">, VEX;
  defm PCMPISTRI : SS42AI_pcmpistri<"pcmpistri">;
}

// Packed Compare Explicit Length Strings, Return Index
multiclass pseudo_pcmpestri<string asm> {
  def REG : PseudoI<(outs GR32:$dst),
                    (ins VR128:$src1, VR128:$src3, i8imm:$src5),
    [(set GR32:$dst, EFLAGS,
      (X86pcmpestri VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>;
  def MEM : PseudoI<(outs GR32:$dst),
                    (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
    [(set GR32:$dst, EFLAGS,
      (X86pcmpestri VR128:$src1, EAX, (bc_v16i8 (memopv2i64 addr:$src3)), EDX,
       imm:$src5))]>;
}

let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in {
  defm VPCMPESTRI : pseudo_pcmpestri<"#VPCMPESTRI">, Requires<[HasAVX]>;
  defm PCMPESTRI : pseudo_pcmpestri<"#PCMPESTRI">, Requires<[UseSSE42]>;
}

multiclass SS42AI_pcmpestri<string asm> {
  def rr : SS42AI<0x61, MRMSrcReg, (outs),
            (ins VR128:$src1, VR128:$src3, i8imm:$src5),
            !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
            []>, Sched<[WritePCmpEStrI]>;
  let mayLoad = 1 in
  def rm : SS42AI<0x61, MRMSrcMem, (outs),
            (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
            !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
            []>, Sched<[WritePCmpEStrILd, ReadAfterLd]>;
}

let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
  let Predicates = [HasAVX] in
  defm VPCMPESTRI : SS42AI_pcmpestri<"vpcmpestri">, VEX;
  defm PCMPESTRI : SS42AI_pcmpestri<"pcmpestri">;
}

//===----------------------------------------------------------------------===//
// SSE4.2 - CRC Instructions
//===----------------------------------------------------------------------===//

// No CRC instructions have AVX equivalents

// crc intrinsic instruction
// This set of instructions are only rm, the only difference is the size
// of r and m.
// NOTE(review): CRC32 reuses the WriteFAdd scheduling classes below —
// confirm this still matches the target scheduler models.
class SS42I_crc32r<bits<8> opc, string asm, RegisterClass RCOut,
                   RegisterClass RCIn, SDPatternOperator Int> :
       SS42FI<opc, MRMSrcReg, (outs RCOut:$dst), (ins RCOut:$src1, RCIn:$src2),
              !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
              [(set RCOut:$dst, (Int RCOut:$src1, RCIn:$src2))], IIC_CRC32_REG>,
       Sched<[WriteFAdd]>;

class SS42I_crc32m<bits<8> opc, string asm, RegisterClass RCOut,
                   X86MemOperand x86memop, SDPatternOperator Int> :
       SS42FI<opc, MRMSrcMem, (outs RCOut:$dst), (ins RCOut:$src1, x86memop:$src2),
              !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
              [(set RCOut:$dst, (Int RCOut:$src1, (load addr:$src2)))],
              IIC_CRC32_MEM>, Sched<[WriteFAddLd, ReadAfterLd]>;

let Constraints = "$src1 = $dst" in {
  def CRC32r32m8  : SS42I_crc32m<0xF0, "crc32{b}", GR32, i8mem,
                                 int_x86_sse42_crc32_32_8>;
  def CRC32r32r8  : SS42I_crc32r<0xF0, "crc32{b}", GR32, GR8,
                                 int_x86_sse42_crc32_32_8>;
  def CRC32r32m16 : SS42I_crc32m<0xF1, "crc32{w}", GR32, i16mem,
                                 int_x86_sse42_crc32_32_16>, OpSize16;
  def CRC32r32r16 : SS42I_crc32r<0xF1, "crc32{w}", GR32, GR16,
                                 int_x86_sse42_crc32_32_16>, OpSize16;
  def CRC32r32m32 : SS42I_crc32m<0xF1, "crc32{l}", GR32, i32mem,
                                 int_x86_sse42_crc32_32_32>, OpSize32;
  def CRC32r32r32 : SS42I_crc32r<0xF1, "crc32{l}", GR32, GR32,
                                 int_x86_sse42_crc32_32_32>, OpSize32;
  def CRC32r64m64 : SS42I_crc32m<0xF1, "crc32{q}", GR64, i64mem,
                                 int_x86_sse42_crc32_64_64>, REX_W;
  def CRC32r64r64 : SS42I_crc32r<0xF1, "crc32{q}", GR64, GR64,
                                 int_x86_sse42_crc32_64_64>, REX_W;
  // 64-bit destination with 8-bit source: no intrinsic, encodings only.
  let hasSideEffects = 0 in {
    let mayLoad = 1 in
    def CRC32r64m8 : SS42I_crc32m<0xF0, "crc32{b}", GR64, i8mem,
                                  null_frag>, REX_W;
    def CRC32r64r8 : SS42I_crc32r<0xF0, "crc32{b}", GR64, GR8,
                                  null_frag>, REX_W;
  }
}

//===----------------------------------------------------------------------===//
// SHA-NI Instructions
//===----------------------------------------------------------------------===//

// Binary SHA ops; when UsesXMM0 is set the intrinsic takes XMM0 as an extra
// implicit operand (see the Uses=[XMM0] instantiation below).
multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId,
                      bit UsesXMM0 = 0> {
  def rr : I<Opc, MRMSrcReg, (outs VR128:$dst),
             (ins VR128:$src1, VR128:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
             [!if(UsesXMM0,
                  (set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0)),
                  (set VR128:$dst, (IntId VR128:$src1, VR128:$src2)))]>, T8;

  def rm : I<Opc, MRMSrcMem, (outs VR128:$dst),
             (ins VR128:$src1, i128mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
             [!if(UsesXMM0,
                  (set VR128:$dst, (IntId VR128:$src1,
                    (bc_v4i32 (memopv2i64 addr:$src2)), XMM0)),
                  (set VR128:$dst, (IntId VR128:$src1,
                    (bc_v4i32 (memopv2i64 addr:$src2)))))]>, T8;
}

let Constraints = "$src1 = $dst", Predicates = [HasSHA] in {
  def SHA1RNDS4rri : Ii8<0xCC, MRMSrcReg, (outs VR128:$dst),
                         (ins VR128:$src1, VR128:$src2, i8imm:$src3),
                         "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                         [(set VR128:$dst,
                           (int_x86_sha1rnds4 VR128:$src1, VR128:$src2,
                            (i8 imm:$src3)))]>, TA;
  def SHA1RNDS4rmi : Ii8<0xCC, MRMSrcMem, (outs VR128:$dst),
                         (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
                         "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                         [(set VR128:$dst,
                           (int_x86_sha1rnds4 VR128:$src1,
                            (bc_v4i32 (memopv2i64 addr:$src2)),
                            (i8 imm:$src3)))]>, TA;

  defm SHA1NEXTE : SHAI_binop<0xC8, "sha1nexte", int_x86_sha1nexte>;
  defm SHA1MSG1  : SHAI_binop<0xC9, "sha1msg1", int_x86_sha1msg1>;
  defm SHA1MSG2  : SHAI_binop<0xCA, "sha1msg2", int_x86_sha1msg2>;

  let Uses=[XMM0] in
  defm SHA256RNDS2 : SHAI_binop<0xCB, "sha256rnds2", int_x86_sha256rnds2, 1>;

  defm SHA256MSG1 : SHAI_binop<0xCC, "sha256msg1", int_x86_sha256msg1>;
  defm SHA256MSG2 : SHAI_binop<0xCD, "sha256msg2", int_x86_sha256msg2>;
}

// Aliases with explicit %xmm0
def : InstAlias<"sha256rnds2\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                (SHA256RNDS2rr VR128:$dst, VR128:$src2)>;
def : InstAlias<"sha256rnds2\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                (SHA256RNDS2rm VR128:$dst, i128mem:$src2)>;

//===----------------------------------------------------------------------===//
// AES-NI Instructions
//===----------------------------------------------------------------------===//

// Is2Addr selects between the two-address (legacy SSE) and three-address
// (VEX) assembly string.
multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr,
                             Intrinsic IntId128, bit Is2Addr = 1> {
  def rr : AES8I<opc, MRMSrcReg, (outs VR128:$dst),
                 (ins VR128:$src1, VR128:$src2),
                 !if(Is2Addr,
                     !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                     !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                 [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
              Sched<[WriteAESDecEnc]>;
  def rm : AES8I<opc, MRMSrcMem, (outs VR128:$dst),
                 (ins VR128:$src1, i128mem:$src2),
                 !if(Is2Addr,
                     !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                     !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                 [(set VR128:$dst,
                   (IntId128 VR128:$src1, (memopv2i64 addr:$src2)))]>,
              Sched<[WriteAESDecEncLd, ReadAfterLd]>;
}

// Perform One Round of an AES Encryption/Decryption Flow
let Predicates = [HasAVX, HasAES] in {
  defm VAESENC     : AESI_binop_rm_int<0xDC, "vaesenc",
                                       int_x86_aesni_aesenc, 0>, VEX_4V;
  defm VAESENCLAST : AESI_binop_rm_int<0xDD, "vaesenclast",
                                       int_x86_aesni_aesenclast, 0>, VEX_4V;
  defm VAESDEC     : AESI_binop_rm_int<0xDE, "vaesdec",
                                       int_x86_aesni_aesdec, 0>, VEX_4V;
  defm VAESDECLAST : AESI_binop_rm_int<0xDF, "vaesdeclast",
                                       int_x86_aesni_aesdeclast, 0>, VEX_4V;
}

let Constraints = "$src1 = $dst" in {
  defm AESENC     : AESI_binop_rm_int<0xDC, "aesenc",
                                      int_x86_aesni_aesenc>;
  defm AESENCLAST : AESI_binop_rm_int<0xDD, "aesenclast",
                                      int_x86_aesni_aesenclast>;
  defm AESDEC     : AESI_binop_rm_int<0xDE, "aesdec",
                                      int_x86_aesni_aesdec>;
  defm AESDECLAST : AESI_binop_rm_int<0xDF, "aesdeclast",
                                      int_x86_aesni_aesdeclast>;
}

// Perform the AES InvMixColumn Transformation
let Predicates = [HasAVX, HasAES] in {
  def VAESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1),
      "vaesimc\t{$src1, $dst|$dst, $src1}",
      [(set VR128:$dst,
        (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>,
      VEX;
  def VAESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
      (ins i128mem:$src1),
      "vaesimc\t{$src1, $dst|$dst, $src1}",
      [(set VR128:$dst, (int_x86_aesni_aesimc (loadv2i64 addr:$src1)))]>,
      Sched<[WriteAESIMCLd]>, VEX;
}
def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
  (ins VR128:$src1),
  "aesimc\t{$src1, $dst|$dst, $src1}",
  [(set VR128:$dst,
    (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>;
def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
  (ins i128mem:$src1),
  "aesimc\t{$src1, $dst|$dst, $src1}",
  [(set VR128:$dst, (int_x86_aesni_aesimc (memopv2i64 addr:$src1)))]>,
  Sched<[WriteAESIMCLd]>;

// AES Round Key Generation Assist
let Predicates = [HasAVX, HasAES] in {
  def VAESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, i8imm:$src2),
      "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
      [(set VR128:$dst,
        (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>,
      Sched<[WriteAESKeyGen]>, VEX;
  def VAESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
      (ins i128mem:$src1, i8imm:$src2),
      "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
      [(set VR128:$dst,
        (int_x86_aesni_aeskeygenassist (loadv2i64 addr:$src1), imm:$src2))]>,
      Sched<[WriteAESKeyGenLd]>, VEX;
}
def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
  (ins VR128:$src1, i8imm:$src2),
  "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
  [(set VR128:$dst,
    (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>,
  Sched<[WriteAESKeyGen]>;
def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
  (ins i128mem:$src1, i8imm:$src2),
  "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
  [(set VR128:$dst,
    (int_x86_aesni_aeskeygenassist (memopv2i64 addr:$src1), imm:$src2))]>,
  Sched<[WriteAESKeyGenLd]>;

//===----------------------------------------------------------------------===//
// PCLMUL Instructions
//===----------------------------------------------------------------------===//

// AVX carry-less Multiplication instructions
def VPCLMULQDQrr : AVXPCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
           (ins VR128:$src1, VR128:$src2, i8imm:$src3),
           "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           [(set VR128:$dst,
             (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))]>,
           Sched<[WriteCLMul]>;

def VPCLMULQDQrm : AVXPCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
           (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
           "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1,
                              (loadv2i64 addr:$src2), imm:$src3))]>,
           Sched<[WriteCLMulLd, ReadAfterLd]>;

// Carry-less Multiplication instructions
let Constraints = "$src1 = $dst" in {
def PCLMULQDQrr : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
           (ins VR128:$src1, VR128:$src2, i8imm:$src3),
           "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
           [(set VR128:$dst,
             (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))],
           IIC_SSE_PCLMULQDQ_RR>, Sched<[WriteCLMul]>;

def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
           (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
           "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
           [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1,
                              (memopv2i64 addr:$src2), imm:$src3))],
           IIC_SSE_PCLMULQDQ_RM>,
           Sched<[WriteCLMulLd, ReadAfterLd]>;
} // Constraints = "$src1 = $dst"


// Assembler-only mnemonic aliases (e.g. pclmulhqhqdq) mapping to a fixed
// immediate selector.
multiclass pclmul_alias<string asm, int immop> {
  def : InstAlias<!strconcat("pclmul", asm, "dq {$src, $dst|$dst, $src}"),
                  (PCLMULQDQrr VR128:$dst, VR128:$src, immop), 0>;

  def : InstAlias<!strconcat("pclmul", asm, "dq {$src, $dst|$dst, $src}"),
                  (PCLMULQDQrm VR128:$dst, i128mem:$src, immop), 0>;

  def : InstAlias<!strconcat("vpclmul", asm,
                             "dq {$src2, $src1, $dst|$dst, $src1, $src2}"),
                  (VPCLMULQDQrr VR128:$dst, VR128:$src1, VR128:$src2, immop),
                  0>;

  def : InstAlias<!strconcat("vpclmul", asm,
                             "dq {$src2, $src1, $dst|$dst, $src1, $src2}"),
                  (VPCLMULQDQrm VR128:$dst, VR128:$src1, i128mem:$src2, immop),
                  0>;
}
defm : pclmul_alias<"hqhq", 0x11>;
defm : pclmul_alias<"hqlq", 0x01>;
defm : pclmul_alias<"lqhq", 0x10>;
defm : pclmul_alias<"lqlq", 0x00>;

//===----------------------------------------------------------------------===//
// SSE4A Instructions
//===----------------------------------------------------------------------===//

let Predicates = [HasSSE4A] in {

let Constraints = "$src = $dst" in {
def EXTRQI : Ii8<0x78, MRMXr, (outs VR128:$dst),
                 (ins VR128:$src, i8imm:$len, i8imm:$idx),
                 "extrq\t{$idx, $len, $src|$src, $len, $idx}",
                 [(set VR128:$dst, (int_x86_sse4a_extrqi VR128:$src, imm:$len,
                                    imm:$idx))]>, PD;
def EXTRQ  : I<0x79, MRMSrcReg, (outs VR128:$dst),
              (ins VR128:$src, VR128:$mask),
              "extrq\t{$mask, $src|$src, $mask}",
              [(set VR128:$dst, (int_x86_sse4a_extrq VR128:$src,
                                 VR128:$mask))]>, PD;

def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst),
                   (ins VR128:$src, VR128:$src2, i8imm:$len, i8imm:$idx),
                   "insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}",
                   [(set VR128:$dst, (int_x86_sse4a_insertqi VR128:$src,
                                      VR128:$src2, imm:$len, imm:$idx))]>, XD;
def INSERTQ  : I<0x79, MRMSrcReg, (outs VR128:$dst),
                 (ins VR128:$src, VR128:$mask),
                 "insertq\t{$mask, $src|$src, $mask}",
                 [(set VR128:$dst, (int_x86_sse4a_insertq VR128:$src,
                                    VR128:$mask))]>, XD;
}

def MOVNTSS : I<0x2B, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src),
                "movntss\t{$src, $dst|$dst, $src}",
                [(int_x86_sse4a_movnt_ss addr:$dst, VR128:$src)]>, XS;

def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                "movntsd\t{$src, $dst|$dst, $src}",
                [(int_x86_sse4a_movnt_sd addr:$dst, VR128:$src)]>, XD;
}

//===----------------------------------------------------------------------===//
// AVX Instructions
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
// VBROADCAST - Load from memory and broadcast to all elements of the
//              destination operand
//
class avx_broadcast<bits<8> opc, string OpcodeStr, RegisterClass RC,
                    X86MemOperand x86memop, Intrinsic Int, SchedWrite Sched> :
  AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
        !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
        [(set RC:$dst, (Int addr:$src))]>, Sched<[Sched]>, VEX;

class avx_broadcast_no_int<bits<8> opc, string OpcodeStr, RegisterClass RC,
                           X86MemOperand x86memop, ValueType VT,
                           PatFrag ld_frag, SchedWrite Sched> :
  AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
        !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
        [(set RC:$dst, (VT (X86VBroadcast (ld_frag addr:$src))))]>,
        Sched<[Sched]>, VEX {
  let mayLoad = 1;
}

// AVX2 adds register forms
class avx2_broadcast_reg<bits<8> opc, string OpcodeStr, RegisterClass RC,
                         Intrinsic Int, SchedWrite Sched> :
  AVX28I<opc, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
         !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
         [(set RC:$dst, (Int VR128:$src))]>, Sched<[Sched]>, VEX;

let ExeDomain = SSEPackedSingle in {
  def VBROADCASTSSrm  : avx_broadcast_no_int<0x18, "vbroadcastss", VR128,
                                             f32mem, v4f32, loadf32, WriteLoad>;
  def VBROADCASTSSYrm : avx_broadcast_no_int<0x18, "vbroadcastss", VR256,
                                             f32mem, v8f32, loadf32,
                                             WriteFShuffleLd>, VEX_L;
}
let ExeDomain = SSEPackedDouble in
def VBROADCASTSDYrm  : avx_broadcast_no_int<0x19, "vbroadcastsd", VR256, f64mem,
                                            v4f64, loadf64, WriteFShuffleLd>, VEX_L;
def VBROADCASTF128 : avx_broadcast<0x1A, "vbroadcastf128", VR256, f128mem,
                                   int_x86_avx_vbroadcastf128_pd_256,
                                   WriteFShuffleLd>, VEX_L;

let ExeDomain = SSEPackedSingle in {
  def VBROADCASTSSrr  : avx2_broadcast_reg<0x18, "vbroadcastss", VR128,
                                           int_x86_avx2_vbroadcast_ss_ps,
                                           WriteFShuffle>;
  def VBROADCASTSSYrr : avx2_broadcast_reg<0x18, "vbroadcastss", VR256,
                                           int_x86_avx2_vbroadcast_ss_ps_256,
                                           WriteFShuffle256>, VEX_L;
}
let ExeDomain = SSEPackedDouble in
def VBROADCASTSDYrr  : avx2_broadcast_reg<0x19, "vbroadcastsd", VR256,
                                          int_x86_avx2_vbroadcast_sd_pd_256,
                                          WriteFShuffle256>, VEX_L;

let Predicates = [HasAVX2] in
def VBROADCASTI128 : avx_broadcast<0x5A, "vbroadcasti128", VR256, i128mem,
                                   int_x86_avx2_vbroadcasti128, WriteLoad>,
                                   VEX_L;

let Predicates = [HasAVX] in
def : Pat<(int_x86_avx_vbroadcastf128_ps_256 addr:$src),
          (VBROADCASTF128 addr:$src)>;


//===----------------------------------------------------------------------===//
// VINSERTF128 - Insert packed floating-point values
//
let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst),
          (ins VR256:$src1, VR128:$src2, i8imm:$src3),
          "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          []>, Sched<[WriteFShuffle]>, VEX_4V, VEX_L;
let mayLoad = 1 in
def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst),
          (ins VR256:$src1, f128mem:$src2, i8imm:$src3),
          "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          []>, Sched<[WriteFShuffleLd, ReadAfterLd]>, VEX_4V, VEX_L;
}

let Predicates = [HasAVX] in {
def : Pat<(vinsert128_insert:$ins (v8f32 VR256:$src1), (v4f32 VR128:$src2),
                                   (iPTR imm)),
          (VINSERTF128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v4f64 VR256:$src1), (v2f64 VR128:$src2),
                                   (iPTR imm)),
          (VINSERTF128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;

def : Pat<(vinsert128_insert:$ins (v8f32 VR256:$src1), (loadv4f32 addr:$src2),
                                   (iPTR imm)),
          (VINSERTF128rm VR256:$src1, addr:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v4f64 VR256:$src1), (loadv2f64 addr:$src2),
                                   (iPTR imm)),
          (VINSERTF128rm VR256:$src1, addr:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
}

// Combine two consecutive 16-byte loads with a common destination register into
// one 32-byte load to that register.
let Predicates = [HasAVX, HasFastMem32] in {
  def : Pat<(insert_subvector
              (v8f32 (insert_subvector undef, (loadv4f32 addr:$src), (iPTR 0))),
              (loadv4f32 (add addr:$src, (iPTR 16))),
              (iPTR 4)),
            (VMOVUPSYrm addr:$src)>;

  def : Pat<(insert_subvector
              (v4f64 (insert_subvector undef, (loadv2f64 addr:$src), (iPTR 0))),
              (loadv2f64 (add addr:$src, (iPTR 16))),
              (iPTR 2)),
            (VMOVUPDYrm addr:$src)>;

  def : Pat<(insert_subvector
              (v32i8 (insert_subvector
                undef, (bc_v16i8 (loadv2i64 addr:$src)), (iPTR 0))),
              (bc_v16i8 (loadv2i64 (add addr:$src, (iPTR 16)))),
              (iPTR 16)),
            (VMOVDQUYrm addr:$src)>;

  def : Pat<(insert_subvector
              (v16i16 (insert_subvector
                undef, (bc_v8i16 (loadv2i64 addr:$src)), (iPTR 0))),
              (bc_v8i16 (loadv2i64 (add addr:$src, (iPTR 16)))),
              (iPTR 8)),
            (VMOVDQUYrm addr:$src)>;

  def : Pat<(insert_subvector
              (v8i32 (insert_subvector
                undef, (bc_v4i32 (loadv2i64 addr:$src)), (iPTR 0))),
              (bc_v4i32 (loadv2i64 (add addr:$src, (iPTR 16)))),
              (iPTR 4)),
            (VMOVDQUYrm addr:$src)>;

  def : Pat<(insert_subvector
              (v4i64 (insert_subvector undef, (loadv2i64 addr:$src), (iPTR 0))),
              (loadv2i64 (add addr:$src, (iPTR 16))),
              (iPTR 2)),
            (VMOVDQUYrm addr:$src)>;
}

// Without AVX2, integer 128-bit inserts also go through VINSERTF128.
let Predicates = [HasAVX1Only] in {
def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
                                   (iPTR imm)),
          (VINSERTF128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2),
                                   (iPTR imm)),
          (VINSERTF128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2),
                                   (iPTR imm)),
          (VINSERTF128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2),
                                   (iPTR imm)),
          (VINSERTF128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;

def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (loadv2i64 addr:$src2),
                                   (iPTR imm)),
          (VINSERTF128rm VR256:$src1, addr:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1),
                                   (bc_v4i32 (loadv2i64 addr:$src2)),
                                   (iPTR imm)),
          (VINSERTF128rm VR256:$src1, addr:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1),
                                   (bc_v16i8 (loadv2i64 addr:$src2)),
                                   (iPTR imm)),
          (VINSERTF128rm VR256:$src1, addr:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1),
                                   (bc_v8i16 (loadv2i64 addr:$src2)),
                                   (iPTR imm)),
          (VINSERTF128rm VR256:$src1, addr:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
}

//===----------------------------------------------------------------------===//
// VEXTRACTF128 - Extract packed floating-point values
//
let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
def VEXTRACTF128rr : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst),
          (ins VR256:$src1, i8imm:$src2),
          "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
          []>, Sched<[WriteFShuffle]>, VEX, VEX_L;
let mayStore = 1 in
def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs),
          (ins f128mem:$dst, VR256:$src1, i8imm:$src2),
          "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
          []>, Sched<[WriteStore]>, VEX, VEX_L;
}

// AVX1 patterns
let Predicates = [HasAVX] in {
def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
          (v4f32 (VEXTRACTF128rr
                    (v8f32 VR256:$src1),
                    (EXTRACT_get_vextract128_imm VR128:$ext)))>;
def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
          (v2f64 (VEXTRACTF128rr
                    (v4f64 VR256:$src1),
                    (EXTRACT_get_vextract128_imm VR128:$ext)))>;

def : Pat<(store (v4f32 (vextract128_extract:$ext (v8f32 VR256:$src1),
                         (iPTR imm))), addr:$dst),
          (VEXTRACTF128mr addr:$dst, VR256:$src1,
           (EXTRACT_get_vextract128_imm VR128:$ext))>;
def : Pat<(store (v2f64 (vextract128_extract:$ext (v4f64 VR256:$src1),
                         (iPTR imm))), addr:$dst),
          (VEXTRACTF128mr addr:$dst, VR256:$src1,
           (EXTRACT_get_vextract128_imm VR128:$ext))>;
}

let Predicates = [HasAVX1Only] in {
def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
          (v2i64 (VEXTRACTF128rr
                  (v4i64 VR256:$src1),
                  (EXTRACT_get_vextract128_imm VR128:$ext)))>;
def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
          (v4i32 (VEXTRACTF128rr
                  (v8i32 VR256:$src1),
                  (EXTRACT_get_vextract128_imm VR128:$ext)))>;
def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
          (v8i16 (VEXTRACTF128rr
                  (v16i16 VR256:$src1),
                  (EXTRACT_get_vextract128_imm VR128:$ext)))>;
def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
          (v16i8 (VEXTRACTF128rr
                  (v32i8 VR256:$src1),
                  (EXTRACT_get_vextract128_imm VR128:$ext)))>;

def : Pat<(alignedstore (v2i64 (vextract128_extract:$ext (v4i64 VR256:$src1),
                                (iPTR imm))), addr:$dst),
          (VEXTRACTF128mr addr:$dst, VR256:$src1,
           (EXTRACT_get_vextract128_imm VR128:$ext))>;
def : Pat<(alignedstore (v4i32 (vextract128_extract:$ext (v8i32 VR256:$src1),
                                (iPTR imm))), addr:$dst),
          (VEXTRACTF128mr addr:$dst, VR256:$src1,
           (EXTRACT_get_vextract128_imm VR128:$ext))>;
def : Pat<(alignedstore (v8i16 (vextract128_extract:$ext (v16i16 VR256:$src1),
                                (iPTR imm))), addr:$dst),
          (VEXTRACTF128mr addr:$dst, VR256:$src1,
           (EXTRACT_get_vextract128_imm VR128:$ext))>;
def : Pat<(alignedstore (v16i8 (vextract128_extract:$ext (v32i8 VR256:$src1),
                                (iPTR imm))), addr:$dst),
          (VEXTRACTF128mr addr:$dst, VR256:$src1,
           (EXTRACT_get_vextract128_imm VR128:$ext))>;
}

//===----------------------------------------------------------------------===//
// VMASKMOV - Conditional SIMD Packed Loads and Stores
//
multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr,
                          Intrinsic IntLd, Intrinsic IntLd256,
                          Intrinsic IntSt, Intrinsic IntSt256> {
  def rm  : AVX8I<opc_rm, MRMSrcMem, (outs VR128:$dst),
             (ins VR128:$src1, f128mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))]>,
             VEX_4V;
  def Yrm : AVX8I<opc_rm, MRMSrcMem, (outs VR256:$dst),
             (ins VR256:$src1, f256mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
             VEX_4V, VEX_L;
  def mr  : AVX8I<opc_mr, MRMDestMem, (outs),
             (ins f128mem:$dst, VR128:$src1, VR128:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(IntSt addr:$dst, VR128:$src1, VR128:$src2)]>, VEX_4V;
  def Ymr : AVX8I<opc_mr, MRMDestMem, (outs),
             (ins f256mem:$dst, VR256:$src1, VR256:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, VEX_4V, VEX_L;
}

let ExeDomain = SSEPackedSingle in
defm VMASKMOVPS : avx_movmask_rm<0x2C, 0x2E, "vmaskmovps",
                                 int_x86_avx_maskload_ps,
                                 int_x86_avx_maskload_ps_256,
                                 int_x86_avx_maskstore_ps,
                                 int_x86_avx_maskstore_ps_256>;
let ExeDomain = SSEPackedDouble in
defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd",
                                 int_x86_avx_maskload_pd,
                                 int_x86_avx_maskload_pd_256,
                                 int_x86_avx_maskstore_pd,
                                 int_x86_avx_maskstore_pd_256>;

//===----------------------------------------------------------------------===//
// VPERMIL - Permute Single and Double Floating-Point Values
//
// rr/rm are the variable-control (vector) forms; ri/mi take an immediate.
multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
                      RegisterClass RC, X86MemOperand x86memop_f,
                      X86MemOperand x86memop_i, PatFrag i_frag,
                      Intrinsic IntVar, ValueType vt> {
  def rr  : AVX8I<opc_rm, MRMSrcReg, (outs RC:$dst),
             (ins RC:$src1, RC:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set RC:$dst, (IntVar RC:$src1, RC:$src2))]>, VEX_4V,
             Sched<[WriteFShuffle]>;
  def rm  : AVX8I<opc_rm, MRMSrcMem, (outs RC:$dst),
             (ins RC:$src1, x86memop_i:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set RC:$dst, (IntVar RC:$src1,
                             (bitconvert (i_frag addr:$src2))))]>, VEX_4V,
             Sched<[WriteFShuffleLd, ReadAfterLd]>;

  def ri  : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst),
             (ins RC:$src1, i8imm:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set RC:$dst, (vt (X86VPermilpi RC:$src1, (i8 imm:$src2))))]>, VEX,
             Sched<[WriteFShuffle]>;
  def mi  : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst),
             (ins x86memop_f:$src1, i8imm:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set RC:$dst,
               (vt (X86VPermilpi (memop addr:$src1), (i8 imm:$src2))))]>, VEX,
             Sched<[WriteFShuffleLd]>;
}

let ExeDomain = SSEPackedSingle in {
  defm VPERMILPS  : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem,
                               loadv2i64, int_x86_avx_vpermilvar_ps, v4f32>;
  defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem,
                               loadv4i64, int_x86_avx_vpermilvar_ps_256, v8f32>, VEX_L;
}
let ExeDomain = SSEPackedDouble in {
  defm VPERMILPD  : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem,
                               loadv2i64, int_x86_avx_vpermilvar_pd, v2f64>;
  defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem,
                               loadv4i64, int_x86_avx_vpermilvar_pd_256, v4f64>, VEX_L;
}

let Predicates = [HasAVX] in {
def : Pat<(v8f32 (X86VPermilpv VR256:$src1, (v8i32 VR256:$src2))),
          (VPERMILPSYrr VR256:$src1, VR256:$src2)>;
def : Pat<(v8f32 (X86VPermilpv VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))),
          (VPERMILPSYrm VR256:$src1, addr:$src2)>;
def : Pat<(v4f64 (X86VPermilpv VR256:$src1, (v4i64 VR256:$src2))),
          (VPERMILPDYrr VR256:$src1, VR256:$src2)>;
def : Pat<(v4f64 (X86VPermilpv VR256:$src1, (loadv4i64 addr:$src2))),
          (VPERMILPDYrm VR256:$src1, addr:$src2)>;

def : Pat<(v8i32 (X86VPermilpi VR256:$src1, (i8 imm:$imm))),
          (VPERMILPSYri VR256:$src1, imm:$imm)>;
def : Pat<(v4i64 (X86VPermilpi VR256:$src1, (i8 imm:$imm))),
          (VPERMILPDYri VR256:$src1, imm:$imm)>;
def : Pat<(v8i32 (X86VPermilpi (bc_v8i32 (loadv4i64 addr:$src1)),
                               (i8 imm:$imm))),
          (VPERMILPSYmi addr:$src1, imm:$imm)>;
def : Pat<(v4i64 (X86VPermilpi (loadv4i64 addr:$src1), (i8 imm:$imm))),
          (VPERMILPDYmi addr:$src1, imm:$imm)>;

def : Pat<(v4f32 (X86VPermilpv VR128:$src1, (v4i32 VR128:$src2))),
          (VPERMILPSrr VR128:$src1, VR128:$src2)>;
def : Pat<(v4f32 (X86VPermilpv VR128:$src1, (bc_v4i32 (loadv2i64 addr:$src2)))),
          (VPERMILPSrm VR128:$src1, addr:$src2)>;
def : Pat<(v2f64 (X86VPermilpv VR128:$src1, (v2i64 VR128:$src2))),
          (VPERMILPDrr VR128:$src1, VR128:$src2)>;
def : Pat<(v2f64 (X86VPermilpv VR128:$src1, (loadv2i64 addr:$src2))),
          (VPERMILPDrm VR128:$src1, addr:$src2)>;

def : Pat<(v2i64 (X86VPermilpi VR128:$src1, (i8 imm:$imm))),
          (VPERMILPDri VR128:$src1, imm:$imm)>;
def : Pat<(v2i64 (X86VPermilpi (loadv2i64 addr:$src1), (i8 imm:$imm))),
          (VPERMILPDmi addr:$src1, imm:$imm)>;
}

//===----------------------------------------------------------------------===//
// VPERM2F128 - Permute Floating-Point Values in 128-bit chunks
//
let ExeDomain = SSEPackedSingle in {
def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst),
          (ins VR256:$src1, VR256:$src2, i8imm:$src3),
          "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          [(set VR256:$dst, (v8f32 (X86VPerm2x128 VR256:$src1, VR256:$src2,
                              (i8 imm:$src3))))]>, VEX_4V, VEX_L,
          Sched<[WriteFShuffle]>;
def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst),
          (ins VR256:$src1, f256mem:$src2, i8imm:$src3),
          "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv8f32 addr:$src2),
                             (i8 imm:$src3)))]>, VEX_4V, VEX_L,
          Sched<[WriteFShuffleLd, ReadAfterLd]>;
}

let Predicates = [HasAVX] in {
def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1,
                  (loadv4f64 addr:$src2), (i8 imm:$imm))),
          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
}

// Without AVX2, integer 256-bit permutes also lower to VPERM2F128.
let Predicates = [HasAVX1Only] in {
def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;

def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1,
                  (bc_v8i32 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1,
                  (loadv4i64 addr:$src2), (i8 imm:$imm))),
          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1,
                  (bc_v32i8 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1,
                  (bc_v16i16 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
}

//===----------------------------------------------------------------------===//
// VZERO - Zero YMM registers
//
let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
            YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15] in {
  // Zero All YMM registers
  def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall",
                   [(int_x86_avx_vzeroall)]>, PS, VEX, VEX_L, Requires<[HasAVX]>;

  // Zero Upper bits of YMM registers
  def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper",
                     [(int_x86_avx_vzeroupper)]>, PS, VEX, Requires<[HasAVX]>;
}

//===----------------------------------------------------------------------===//
// Half precision conversion instructions
//===----------------------------------------------------------------------===//
multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> {
  def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
             "vcvtph2ps\t{$src, $dst|$dst, $src}",
             [(set RC:$dst, (Int VR128:$src))]>,
             T8PD, VEX, Sched<[WriteCvtF2F]>;
  let hasSideEffects = 0, mayLoad = 1 in
  def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
             "vcvtph2ps\t{$src, $dst|$dst, $src}", []>, T8PD, VEX,
             Sched<[WriteCvtF2FLd]>;
}

multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> {
  def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst),
               (ins RC:$src1, i32i8imm:$src2),
               "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
               [(set VR128:$dst, (Int RC:$src1, imm:$src2))]>,
               TAPD, VEX, Sched<[WriteCvtF2F]>;
  let hasSideEffects = 0, mayStore = 1,
      SchedRW = [WriteCvtF2FLd, WriteRMW] in
  def mr : Ii8<0x1D, MRMDestMem, (outs),
               (ins x86memop:$dst, RC:$src1,
i32i8imm:$src2), 8479 "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, 8480 TAPD, VEX; 8481} 8482 8483let Predicates = [HasF16C] in { 8484 defm VCVTPH2PS : f16c_ph2ps<VR128, f64mem, int_x86_vcvtph2ps_128>; 8485 defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, int_x86_vcvtph2ps_256>, VEX_L; 8486 defm VCVTPS2PH : f16c_ps2ph<VR128, f64mem, int_x86_vcvtps2ph_128>; 8487 defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, int_x86_vcvtps2ph_256>, VEX_L; 8488 8489 // Pattern match vcvtph2ps of a scalar i64 load. 8490 def : Pat<(int_x86_vcvtph2ps_128 (vzmovl_v2i64 addr:$src)), 8491 (VCVTPH2PSrm addr:$src)>; 8492 def : Pat<(int_x86_vcvtph2ps_128 (vzload_v2i64 addr:$src)), 8493 (VCVTPH2PSrm addr:$src)>; 8494} 8495 8496// Patterns for matching conversions from float to half-float and vice versa. 8497let Predicates = [HasF16C] in { 8498 def : Pat<(fp_to_f16 FR32:$src), 8499 (i16 (EXTRACT_SUBREG (VMOVPDI2DIrr (VCVTPS2PHrr 8500 (COPY_TO_REGCLASS FR32:$src, VR128), 0)), sub_16bit))>; 8501 8502 def : Pat<(f16_to_fp GR16:$src), 8503 (f32 (COPY_TO_REGCLASS (VCVTPH2PSrr 8504 (COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128)), FR32)) >; 8505 8506 def : Pat<(f16_to_fp (i16 (fp_to_f16 FR32:$src))), 8507 (f32 (COPY_TO_REGCLASS (VCVTPH2PSrr 8508 (VCVTPS2PHrr (COPY_TO_REGCLASS FR32:$src, VR128), 0)), FR32)) >; 8509} 8510 8511//===----------------------------------------------------------------------===// 8512// AVX2 Instructions 8513//===----------------------------------------------------------------------===// 8514 8515/// AVX2_binop_rmi_int - AVX2 binary operator with 8-bit immediate 8516multiclass AVX2_binop_rmi_int<bits<8> opc, string OpcodeStr, 8517 Intrinsic IntId, RegisterClass RC, PatFrag memop_frag, 8518 X86MemOperand x86memop> { 8519 let isCommutable = 1 in 8520 def rri : AVX2AIi8<opc, MRMSrcReg, (outs RC:$dst), 8521 (ins RC:$src1, RC:$src2, i8imm:$src3), 8522 !strconcat(OpcodeStr, 8523 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), 8524 [(set RC:$dst, (IntId 
RC:$src1, RC:$src2, imm:$src3))]>, 8525 Sched<[WriteBlend]>, VEX_4V; 8526 def rmi : AVX2AIi8<opc, MRMSrcMem, (outs RC:$dst), 8527 (ins RC:$src1, x86memop:$src2, i8imm:$src3), 8528 !strconcat(OpcodeStr, 8529 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), 8530 [(set RC:$dst, 8531 (IntId RC:$src1, 8532 (bitconvert (memop_frag addr:$src2)), imm:$src3))]>, 8533 Sched<[WriteBlendLd, ReadAfterLd]>, VEX_4V; 8534} 8535 8536defm VPBLENDD : AVX2_binop_rmi_int<0x02, "vpblendd", int_x86_avx2_pblendd_128, 8537 VR128, loadv2i64, i128mem>; 8538defm VPBLENDDY : AVX2_binop_rmi_int<0x02, "vpblendd", int_x86_avx2_pblendd_256, 8539 VR256, loadv4i64, i256mem>, VEX_L; 8540 8541def : Pat<(v4i32 (X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), 8542 imm:$mask)), 8543 (VPBLENDDrri VR128:$src1, VR128:$src2, imm:$mask)>; 8544def : Pat<(v8i32 (X86Blendi (v8i32 VR256:$src1), (v8i32 VR256:$src2), 8545 imm:$mask)), 8546 (VPBLENDDYrri VR256:$src1, VR256:$src2, imm:$mask)>; 8547 8548//===----------------------------------------------------------------------===// 8549// VPBROADCAST - Load from memory and broadcast to all elements of the 8550// destination operand 8551// 8552multiclass avx2_broadcast<bits<8> opc, string OpcodeStr, 8553 X86MemOperand x86memop, PatFrag ld_frag, 8554 Intrinsic Int128, Intrinsic Int256> { 8555 def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 8556 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 8557 [(set VR128:$dst, (Int128 VR128:$src))]>, 8558 Sched<[WriteShuffle]>, VEX; 8559 def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src), 8560 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 8561 [(set VR128:$dst, 8562 (Int128 (scalar_to_vector (ld_frag addr:$src))))]>, 8563 Sched<[WriteLoad]>, VEX; 8564 def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), 8565 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 8566 [(set VR256:$dst, (Int256 VR128:$src))]>, 8567 Sched<[WriteShuffle256]>, VEX, VEX_L; 
8568 def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src), 8569 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 8570 [(set VR256:$dst, 8571 (Int256 (scalar_to_vector (ld_frag addr:$src))))]>, 8572 Sched<[WriteLoad]>, VEX, VEX_L; 8573} 8574 8575defm VPBROADCASTB : avx2_broadcast<0x78, "vpbroadcastb", i8mem, loadi8, 8576 int_x86_avx2_pbroadcastb_128, 8577 int_x86_avx2_pbroadcastb_256>; 8578defm VPBROADCASTW : avx2_broadcast<0x79, "vpbroadcastw", i16mem, loadi16, 8579 int_x86_avx2_pbroadcastw_128, 8580 int_x86_avx2_pbroadcastw_256>; 8581defm VPBROADCASTD : avx2_broadcast<0x58, "vpbroadcastd", i32mem, loadi32, 8582 int_x86_avx2_pbroadcastd_128, 8583 int_x86_avx2_pbroadcastd_256>; 8584defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, loadi64, 8585 int_x86_avx2_pbroadcastq_128, 8586 int_x86_avx2_pbroadcastq_256>; 8587 8588let Predicates = [HasAVX2] in { 8589 def : Pat<(v16i8 (X86VBroadcast (loadi8 addr:$src))), 8590 (VPBROADCASTBrm addr:$src)>; 8591 def : Pat<(v32i8 (X86VBroadcast (loadi8 addr:$src))), 8592 (VPBROADCASTBYrm addr:$src)>; 8593 def : Pat<(v8i16 (X86VBroadcast (loadi16 addr:$src))), 8594 (VPBROADCASTWrm addr:$src)>; 8595 def : Pat<(v16i16 (X86VBroadcast (loadi16 addr:$src))), 8596 (VPBROADCASTWYrm addr:$src)>; 8597 def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))), 8598 (VPBROADCASTDrm addr:$src)>; 8599 def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))), 8600 (VPBROADCASTDYrm addr:$src)>; 8601 def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))), 8602 (VPBROADCASTQrm addr:$src)>; 8603 def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))), 8604 (VPBROADCASTQYrm addr:$src)>; 8605 8606 def : Pat<(v16i8 (X86VBroadcast (v16i8 VR128:$src))), 8607 (VPBROADCASTBrr VR128:$src)>; 8608 def : Pat<(v32i8 (X86VBroadcast (v16i8 VR128:$src))), 8609 (VPBROADCASTBYrr VR128:$src)>; 8610 def : Pat<(v8i16 (X86VBroadcast (v8i16 VR128:$src))), 8611 (VPBROADCASTWrr VR128:$src)>; 8612 def : Pat<(v16i16 (X86VBroadcast (v8i16 
VR128:$src))), 8613 (VPBROADCASTWYrr VR128:$src)>; 8614 def : Pat<(v4i32 (X86VBroadcast (v4i32 VR128:$src))), 8615 (VPBROADCASTDrr VR128:$src)>; 8616 def : Pat<(v8i32 (X86VBroadcast (v4i32 VR128:$src))), 8617 (VPBROADCASTDYrr VR128:$src)>; 8618 def : Pat<(v2i64 (X86VBroadcast (v2i64 VR128:$src))), 8619 (VPBROADCASTQrr VR128:$src)>; 8620 def : Pat<(v4i64 (X86VBroadcast (v2i64 VR128:$src))), 8621 (VPBROADCASTQYrr VR128:$src)>; 8622 def : Pat<(v4f32 (X86VBroadcast (v4f32 VR128:$src))), 8623 (VBROADCASTSSrr VR128:$src)>; 8624 def : Pat<(v8f32 (X86VBroadcast (v4f32 VR128:$src))), 8625 (VBROADCASTSSYrr VR128:$src)>; 8626 def : Pat<(v2f64 (X86VBroadcast (v2f64 VR128:$src))), 8627 (VPBROADCASTQrr VR128:$src)>; 8628 def : Pat<(v4f64 (X86VBroadcast (v2f64 VR128:$src))), 8629 (VBROADCASTSDYrr VR128:$src)>; 8630 8631 // Provide aliases for broadcast from the same regitser class that 8632 // automatically does the extract. 8633 def : Pat<(v32i8 (X86VBroadcast (v32i8 VR256:$src))), 8634 (VPBROADCASTBYrr (v16i8 (EXTRACT_SUBREG (v32i8 VR256:$src), 8635 sub_xmm)))>; 8636 def : Pat<(v16i16 (X86VBroadcast (v16i16 VR256:$src))), 8637 (VPBROADCASTWYrr (v8i16 (EXTRACT_SUBREG (v16i16 VR256:$src), 8638 sub_xmm)))>; 8639 def : Pat<(v8i32 (X86VBroadcast (v8i32 VR256:$src))), 8640 (VPBROADCASTDYrr (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), 8641 sub_xmm)))>; 8642 def : Pat<(v4i64 (X86VBroadcast (v4i64 VR256:$src))), 8643 (VPBROADCASTQYrr (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), 8644 sub_xmm)))>; 8645 def : Pat<(v8f32 (X86VBroadcast (v8f32 VR256:$src))), 8646 (VBROADCASTSSYrr (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), 8647 sub_xmm)))>; 8648 def : Pat<(v4f64 (X86VBroadcast (v4f64 VR256:$src))), 8649 (VBROADCASTSDYrr (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), 8650 sub_xmm)))>; 8651 8652 // Provide fallback in case the load node that is used in the patterns above 8653 // is used by additional users, which prevents the pattern selection. 
8654 let AddedComplexity = 20 in { 8655 def : Pat<(v4f32 (X86VBroadcast FR32:$src)), 8656 (VBROADCASTSSrr (COPY_TO_REGCLASS FR32:$src, VR128))>; 8657 def : Pat<(v8f32 (X86VBroadcast FR32:$src)), 8658 (VBROADCASTSSYrr (COPY_TO_REGCLASS FR32:$src, VR128))>; 8659 def : Pat<(v4f64 (X86VBroadcast FR64:$src)), 8660 (VBROADCASTSDYrr (COPY_TO_REGCLASS FR64:$src, VR128))>; 8661 8662 def : Pat<(v4i32 (X86VBroadcast GR32:$src)), 8663 (VBROADCASTSSrr (COPY_TO_REGCLASS GR32:$src, VR128))>; 8664 def : Pat<(v8i32 (X86VBroadcast GR32:$src)), 8665 (VBROADCASTSSYrr (COPY_TO_REGCLASS GR32:$src, VR128))>; 8666 def : Pat<(v4i64 (X86VBroadcast GR64:$src)), 8667 (VBROADCASTSDYrr (COPY_TO_REGCLASS GR64:$src, VR128))>; 8668 8669 def : Pat<(v16i8 (X86VBroadcast GR8:$src)), 8670 (VPBROADCASTBrr (COPY_TO_REGCLASS 8671 (i32 (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit)), 8672 VR128))>; 8673 def : Pat<(v32i8 (X86VBroadcast GR8:$src)), 8674 (VPBROADCASTBYrr (COPY_TO_REGCLASS 8675 (i32 (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit)), 8676 VR128))>; 8677 8678 def : Pat<(v8i16 (X86VBroadcast GR16:$src)), 8679 (VPBROADCASTWrr (COPY_TO_REGCLASS 8680 (i32 (SUBREG_TO_REG (i32 0), GR16:$src, sub_16bit)), 8681 VR128))>; 8682 def : Pat<(v16i16 (X86VBroadcast GR16:$src)), 8683 (VPBROADCASTWYrr (COPY_TO_REGCLASS 8684 (i32 (SUBREG_TO_REG (i32 0), GR16:$src, sub_16bit)), 8685 VR128))>; 8686 8687 // The patterns for VPBROADCASTD are not needed because they would match 8688 // the exact same thing as VBROADCASTSS patterns. 8689 8690 def : Pat<(v2i64 (X86VBroadcast GR64:$src)), 8691 (VPBROADCASTQrr (COPY_TO_REGCLASS GR64:$src, VR128))>; 8692 // The v4i64 pattern is not needed because VBROADCASTSDYrr already match. 
8693 } 8694} 8695 8696// AVX1 broadcast patterns 8697let Predicates = [HasAVX1Only] in { 8698def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))), 8699 (VBROADCASTSSYrm addr:$src)>; 8700def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))), 8701 (VBROADCASTSDYrm addr:$src)>; 8702def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))), 8703 (VBROADCASTSSrm addr:$src)>; 8704} 8705 8706let Predicates = [HasAVX] in { 8707 // Provide fallback in case the load node that is used in the patterns above 8708 // is used by additional users, which prevents the pattern selection. 8709 let AddedComplexity = 20 in { 8710 // 128bit broadcasts: 8711 def : Pat<(v4f32 (X86VBroadcast FR32:$src)), 8712 (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0)>; 8713 def : Pat<(v8f32 (X86VBroadcast FR32:$src)), 8714 (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), 8715 (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0), sub_xmm), 8716 (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0), 1)>; 8717 def : Pat<(v4f64 (X86VBroadcast FR64:$src)), 8718 (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), 8719 (VPSHUFDri (COPY_TO_REGCLASS FR64:$src, VR128), 0x44), sub_xmm), 8720 (VPSHUFDri (COPY_TO_REGCLASS FR64:$src, VR128), 0x44), 1)>; 8721 8722 def : Pat<(v4i32 (X86VBroadcast GR32:$src)), 8723 (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0)>; 8724 def : Pat<(v8i32 (X86VBroadcast GR32:$src)), 8725 (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), 8726 (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0), sub_xmm), 8727 (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0), 1)>; 8728 def : Pat<(v4i64 (X86VBroadcast GR64:$src)), 8729 (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), 8730 (VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44), sub_xmm), 8731 (VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44), 1)>; 8732 } 8733 8734 def : Pat<(v2f64 (X86VBroadcast f64:$src)), 8735 (VMOVDDUPrr (COPY_TO_REGCLASS FR64:$src, VR128))>; 8736} 8737 
//===----------------------------------------------------------------------===//
// VPERM - Permute instructions
//

// Full-width variable permute (VPERMD/VPERMPS): YMM-only, control vector in
// $src2; selects through the X86VPermv DAG node.
multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
                     ValueType OpVT, X86FoldableSchedWrite Sched> {
  def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
                   (ins VR256:$src1, VR256:$src2),
                   !strconcat(OpcodeStr,
                       "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set VR256:$dst,
                     (OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>,
                   Sched<[Sched]>, VEX_4V, VEX_L;
  def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
                   (ins VR256:$src1, i256mem:$src2),
                   !strconcat(OpcodeStr,
                       "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set VR256:$dst,
                     (OpVT (X86VPermv VR256:$src1,
                            (bitconvert (mem_frag addr:$src2)))))]>,
                   Sched<[Sched.Folded, ReadAfterLd]>, VEX_4V, VEX_L;
}

defm VPERMD : avx2_perm<0x36, "vpermd", loadv4i64, v8i32, WriteShuffle256>;
let ExeDomain = SSEPackedSingle in
defm VPERMPS : avx2_perm<0x16, "vpermps", loadv8f32, v8f32, WriteFShuffle256>;

// Full-width immediate permute (VPERMQ/VPERMPD): 64-bit elements selected by
// an 8-bit immediate; selects through the X86VPermi DAG node.
multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
                         ValueType OpVT, X86FoldableSchedWrite Sched> {
  def Yri : AVX2AIi8<opc, MRMSrcReg, (outs VR256:$dst),
                     (ins VR256:$src1, i8imm:$src2),
                     !strconcat(OpcodeStr,
                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                     [(set VR256:$dst,
                       (OpVT (X86VPermi VR256:$src1, (i8 imm:$src2))))]>,
                     Sched<[Sched]>, VEX, VEX_L;
  def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst),
                     (ins i256mem:$src1, i8imm:$src2),
                     !strconcat(OpcodeStr,
                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                     [(set VR256:$dst,
                       (OpVT (X86VPermi (mem_frag addr:$src1),
                              (i8 imm:$src2))))]>,
                     Sched<[Sched.Folded, ReadAfterLd]>, VEX, VEX_L;
}

defm VPERMQ : avx2_perm_imm<0x00, "vpermq", loadv4i64, v4i64,
                            WriteShuffle256>, VEX_W;
let ExeDomain = SSEPackedDouble in
defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64,
                             WriteFShuffle256>, VEX_W;

//===----------------------------------------------------------------------===//
// VPERM2I128 - Permute Floating-Point Values in 128-bit chunks
//
def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst),
          (ins VR256:$src1, VR256:$src2, i8imm:$src3),
          "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          [(set VR256:$dst, (v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2,
                              (i8 imm:$src3))))]>, Sched<[WriteShuffle256]>,
          VEX_4V, VEX_L;
// NOTE(review): the memory operand is f256mem although this is an integer
// instruction and the pattern loads via loadv4i64 — confirm whether i256mem
// was intended.
def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst),
          (ins VR256:$src1, f256mem:$src2, i8imm:$src3),
          "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4i64 addr:$src2),
                             (i8 imm:$src3)))]>,
          Sched<[WriteShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L;

// v4i64 is matched by the instruction patterns above; other integer element
// widths are added here.
let Predicates = [HasAVX2] in {
def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;
def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;
def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;

def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, (bc_v32i8 (loadv4i64 addr:$src2)),
                  (i8 imm:$imm))),
          (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1,
                   (bc_v16i16 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
          (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)),
                  (i8 imm:$imm))),
          (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
}


//===----------------------------------------------------------------------===//
// VINSERTI128 - Insert packed integer values
//
// The instructions themselves carry no patterns; selection happens through
// the vinsert128_insert Pats below.
let hasSideEffects = 0 in {
def VINSERTI128rr : AVX2AIi8<0x38, MRMSrcReg, (outs VR256:$dst),
                             (ins VR256:$src1, VR128:$src2, i8imm:$src3),
                             "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
                             []>, Sched<[WriteShuffle256]>, VEX_4V, VEX_L;
let mayLoad = 1 in
def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst),
                             (ins VR256:$src1, i128mem:$src2, i8imm:$src3),
                             "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
                             []>, Sched<[WriteShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L;
}

let Predicates = [HasAVX2] in {
def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
                                  (iPTR imm)),
          (VINSERTI128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2),
                                  (iPTR imm)),
          (VINSERTI128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2),
                                  (iPTR imm)),
          (VINSERTI128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2),
                                  (iPTR imm)),
          (VINSERTI128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;

def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (loadv2i64 addr:$src2),
                                  (iPTR imm)),
          (VINSERTI128rm VR256:$src1, addr:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1),
                                  (bc_v4i32 (loadv2i64 addr:$src2)),
                                  (iPTR imm)),
          (VINSERTI128rm VR256:$src1, addr:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1),
                                  (bc_v16i8 (loadv2i64 addr:$src2)),
                                  (iPTR imm)),
          (VINSERTI128rm VR256:$src1, addr:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1),
                                  (bc_v8i16 (loadv2i64 addr:$src2)),
                                  (iPTR imm)),
          (VINSERTI128rm VR256:$src1, addr:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
}

//===----------------------------------------------------------------------===//
// VEXTRACTI128 - Extract packed integer values
//
def VEXTRACTI128rr : AVX2AIi8<0x39, MRMDestReg, (outs VR128:$dst),
          (ins VR256:$src1, i8imm:$src2),
          "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
          [(set VR128:$dst,
            (int_x86_avx2_vextracti128 VR256:$src1, imm:$src2))]>,
          Sched<[WriteShuffle256]>, VEX, VEX_L;
let hasSideEffects = 0, mayStore = 1 in
def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs),
          (ins i128mem:$dst, VR256:$src1, i8imm:$src2),
          "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
          Sched<[WriteStore]>, VEX, VEX_L;

let Predicates = [HasAVX2] in {
def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
          (v2i64 (VEXTRACTI128rr
                  (v4i64 VR256:$src1),
                  (EXTRACT_get_vextract128_imm VR128:$ext)))>;
def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
          (v4i32 (VEXTRACTI128rr
                  (v8i32 VR256:$src1),
                  (EXTRACT_get_vextract128_imm VR128:$ext)))>;
def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
          (v8i16 (VEXTRACTI128rr
                  (v16i16 VR256:$src1),
                  (EXTRACT_get_vextract128_imm VR128:$ext)))>;
def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
          (v16i8 (VEXTRACTI128rr
                  (v32i8 VR256:$src1),
                  (EXTRACT_get_vextract128_imm VR128:$ext)))>;

// Extract-then-store folds directly into the memory form.
def : Pat<(store (v2i64 (vextract128_extract:$ext (v4i64 VR256:$src1),
                         (iPTR imm))), addr:$dst),
          (VEXTRACTI128mr addr:$dst, VR256:$src1,
                          (EXTRACT_get_vextract128_imm VR128:$ext))>;
def : Pat<(store (v4i32 (vextract128_extract:$ext (v8i32 VR256:$src1),
                         (iPTR imm))), addr:$dst),
          (VEXTRACTI128mr addr:$dst, VR256:$src1,
                          (EXTRACT_get_vextract128_imm VR128:$ext))>;
def : Pat<(store (v8i16 (vextract128_extract:$ext (v16i16 VR256:$src1),
                         (iPTR imm))), addr:$dst),
          (VEXTRACTI128mr addr:$dst, VR256:$src1,
                          (EXTRACT_get_vextract128_imm VR128:$ext))>;
def : Pat<(store (v16i8 (vextract128_extract:$ext (v32i8 VR256:$src1),
                         (iPTR imm))), addr:$dst),
          (VEXTRACTI128mr addr:$dst, VR256:$src1,
                          (EXTRACT_get_vextract128_imm VR128:$ext))>;
}

//===----------------------------------------------------------------------===//
// VPMASKMOV - Conditional SIMD Integer Packed Loads and Stores
//
// rm/Yrm are masked loads (mask in $src1), mr/Ymr are masked stores.
multiclass avx2_pmovmask<string OpcodeStr,
                         Intrinsic IntLd128, Intrinsic IntLd256,
                         Intrinsic IntSt128, Intrinsic IntSt256> {
  def rm  : AVX28I<0x8c, MRMSrcMem, (outs VR128:$dst),
             (ins VR128:$src1, i128mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR128:$dst, (IntLd128 addr:$src2, VR128:$src1))]>, VEX_4V;
  def Yrm : AVX28I<0x8c, MRMSrcMem, (outs VR256:$dst),
             (ins VR256:$src1, i256mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
             VEX_4V, VEX_L;
  def mr  : AVX28I<0x8e, MRMDestMem, (outs),
             (ins i128mem:$dst, VR128:$src1, VR128:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(IntSt128 addr:$dst, VR128:$src1, VR128:$src2)]>, VEX_4V;
  def Ymr : AVX28I<0x8e, MRMDestMem, (outs),
             (ins i256mem:$dst, VR256:$src1, VR256:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, VEX_4V, VEX_L;
}

defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd",
                                int_x86_avx2_maskload_d,
                                int_x86_avx2_maskload_d_256,
                                int_x86_avx2_maskstore_d,
                                int_x86_avx2_maskstore_d_256>;
defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq",
                                int_x86_avx2_maskload_q,
                                int_x86_avx2_maskload_q_256,
                                int_x86_avx2_maskstore_q,
                                int_x86_avx2_maskstore_q_256>, VEX_W;

// Selection for the generic masked_load/masked_store nodes.  Loads with an
// undef or all-zero passthru map straight to a maskmov (the hardware zeroes
// masked-off lanes); loads with a non-trivial passthru additionally blend
// the maskmov result with the passthru under the same mask.
// NOTE(review): these Pats are not guarded by a Predicates clause here —
// confirm they are only reachable when the VMASKMOV*/VPMASKMOV* defs exist.
def: Pat<(masked_store addr:$ptr, (v8i32 VR256:$mask), (v8f32 VR256:$src)),
         (VMASKMOVPSYmr addr:$ptr, VR256:$mask, VR256:$src)>;

def: Pat<(masked_store addr:$ptr, (v8i32 VR256:$mask), (v8i32 VR256:$src)),
         (VPMASKMOVDYmr addr:$ptr, VR256:$mask, VR256:$src)>;

def: Pat<(masked_store addr:$ptr, (v4i32 VR128:$mask), (v4f32 VR128:$src)),
         (VMASKMOVPSmr addr:$ptr, VR128:$mask, VR128:$src)>;

def: Pat<(masked_store addr:$ptr, (v4i32 VR128:$mask), (v4i32 VR128:$src)),
         (VPMASKMOVDmr addr:$ptr, VR128:$mask, VR128:$src)>;

def: Pat<(v8f32 (masked_load addr:$ptr, (v8i32 VR256:$mask), undef)),
         (VMASKMOVPSYrm VR256:$mask, addr:$ptr)>;

def: Pat<(v8f32 (masked_load addr:$ptr, (v8i32 VR256:$mask),
                 (bc_v8f32 (v8i32 immAllZerosV)))),
         (VMASKMOVPSYrm VR256:$mask, addr:$ptr)>;

def: Pat<(v8f32 (masked_load addr:$ptr, (v8i32 VR256:$mask), (v8f32 VR256:$src0))),
         (VBLENDVPSYrr VR256:$src0, (VMASKMOVPSYrm VR256:$mask, addr:$ptr),
                       VR256:$mask)>;

def: Pat<(v8i32 (masked_load addr:$ptr, (v8i32 VR256:$mask), undef)),
         (VPMASKMOVDYrm VR256:$mask, addr:$ptr)>;

def: Pat<(v8i32 (masked_load addr:$ptr, (v8i32 VR256:$mask), (v8i32 immAllZerosV))),
         (VPMASKMOVDYrm VR256:$mask, addr:$ptr)>;

def: Pat<(v8i32 (masked_load addr:$ptr, (v8i32 VR256:$mask), (v8i32 VR256:$src0))),
         (VBLENDVPSYrr VR256:$src0, (VPMASKMOVDYrm VR256:$mask, addr:$ptr),
                       VR256:$mask)>;

def: Pat<(v4f32 (masked_load addr:$ptr, (v4i32 VR128:$mask), undef)),
         (VMASKMOVPSrm VR128:$mask, addr:$ptr)>;

def: Pat<(v4f32 (masked_load addr:$ptr, (v4i32 VR128:$mask),
                 (bc_v4f32 (v4i32 immAllZerosV)))),
         (VMASKMOVPSrm VR128:$mask, addr:$ptr)>;

def: Pat<(v4f32 (masked_load addr:$ptr, (v4i32 VR128:$mask), (v4f32 VR128:$src0))),
         (VBLENDVPSrr VR128:$src0, (VMASKMOVPSrm VR128:$mask, addr:$ptr),
                      VR128:$mask)>;

def: Pat<(v4i32 (masked_load addr:$ptr, (v4i32 VR128:$mask), undef)),
         (VPMASKMOVDrm VR128:$mask, addr:$ptr)>;

def: Pat<(v4i32 (masked_load addr:$ptr, (v4i32 VR128:$mask), (v4i32 immAllZerosV))),
         (VPMASKMOVDrm VR128:$mask, addr:$ptr)>;

def: Pat<(v4i32 (masked_load addr:$ptr, (v4i32 VR128:$mask), (v4i32 VR128:$src0))),
         (VBLENDVPSrr VR128:$src0, (VPMASKMOVDrm VR128:$mask, addr:$ptr),
                      VR128:$mask)>;

def: Pat<(masked_store addr:$ptr, (v4i64 VR256:$mask), (v4f64 VR256:$src)),
         (VMASKMOVPDYmr addr:$ptr, VR256:$mask, VR256:$src)>;

def: Pat<(masked_store addr:$ptr, (v4i64 VR256:$mask), (v4i64 VR256:$src)),
         (VPMASKMOVQYmr addr:$ptr, VR256:$mask, VR256:$src)>;

def: Pat<(v4f64 (masked_load addr:$ptr, (v4i64 VR256:$mask), undef)),
         (VMASKMOVPDYrm VR256:$mask, addr:$ptr)>;

def: Pat<(v4f64 (masked_load addr:$ptr, (v4i64 VR256:$mask),
                 (v4f64 immAllZerosV))),
         (VMASKMOVPDYrm VR256:$mask, addr:$ptr)>;

def: Pat<(v4f64 (masked_load addr:$ptr, (v4i64 VR256:$mask), (v4f64 VR256:$src0))),
         (VBLENDVPDYrr VR256:$src0, (VMASKMOVPDYrm VR256:$mask, addr:$ptr),
                       VR256:$mask)>;

def: Pat<(v4i64 (masked_load addr:$ptr, (v4i64 VR256:$mask), undef)),
         (VPMASKMOVQYrm VR256:$mask, addr:$ptr)>;

def: Pat<(v4i64 (masked_load addr:$ptr, (v4i64 VR256:$mask),
                 (bc_v4i64 (v8i32 immAllZerosV)))),
         (VPMASKMOVQYrm VR256:$mask, addr:$ptr)>;

def: Pat<(v4i64 (masked_load addr:$ptr, (v4i64 VR256:$mask), (v4i64 VR256:$src0))),
         (VBLENDVPDYrr VR256:$src0, (VPMASKMOVQYrm VR256:$mask, addr:$ptr),
                       VR256:$mask)>;

def: Pat<(masked_store addr:$ptr, (v2i64 VR128:$mask), (v2f64 VR128:$src)),
         (VMASKMOVPDmr addr:$ptr, VR128:$mask, VR128:$src)>;

def: Pat<(masked_store addr:$ptr, (v2i64 VR128:$mask), (v2i64 VR128:$src)),
         (VPMASKMOVQmr addr:$ptr, VR128:$mask, VR128:$src)>;

def: Pat<(v2f64 (masked_load addr:$ptr, (v2i64 VR128:$mask), undef)),
         (VMASKMOVPDrm VR128:$mask, addr:$ptr)>;

def: Pat<(v2f64 (masked_load addr:$ptr, (v2i64 VR128:$mask),
                 (v2f64 immAllZerosV))),
         (VMASKMOVPDrm VR128:$mask, addr:$ptr)>;

def: Pat<(v2f64 (masked_load addr:$ptr, (v2i64 VR128:$mask), (v2f64 VR128:$src0))),
         (VBLENDVPDrr VR128:$src0, (VMASKMOVPDrm VR128:$mask, addr:$ptr),
                      VR128:$mask)>;

def: Pat<(v2i64 (masked_load addr:$ptr, (v2i64 VR128:$mask), undef)),
         (VPMASKMOVQrm VR128:$mask, addr:$ptr)>;

def: Pat<(v2i64 (masked_load addr:$ptr, (v2i64 VR128:$mask),
                 (bc_v2i64 (v4i32 immAllZerosV)))),
         (VPMASKMOVQrm VR128:$mask, addr:$ptr)>;

def: Pat<(v2i64 (masked_load addr:$ptr, (v2i64 VR128:$mask), (v2i64 VR128:$src0))),
         (VBLENDVPDrr VR128:$src0, (VPMASKMOVQrm VR128:$mask, addr:$ptr),
                      VR128:$mask)>;

//===----------------------------------------------------------------------===//
// Variable Bit Shifts
//
// Per-element shifts where the shift count comes from the corresponding
// element of $src2; selects directly on the generic shl/srl/sra nodes.
multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          ValueType vt128, ValueType vt256> {
  def rr  : AVX28I<opc, MRMSrcReg, (outs VR128:$dst),
             (ins VR128:$src1, VR128:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR128:$dst,
               (vt128 (OpNode VR128:$src1, (vt128 VR128:$src2))))]>,
             VEX_4V, Sched<[WriteVarVecShift]>;
  def rm  : AVX28I<opc, MRMSrcMem, (outs VR128:$dst),
             (ins VR128:$src1, i128mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR128:$dst,
               (vt128 (OpNode VR128:$src1,
                       (vt128 (bitconvert (loadv2i64 addr:$src2))))))]>,
             VEX_4V, Sched<[WriteVarVecShiftLd, ReadAfterLd]>;
  def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
             (ins VR256:$src1, VR256:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR256:$dst,
               (vt256 (OpNode VR256:$src1, (vt256 VR256:$src2))))]>,
             VEX_4V, VEX_L, Sched<[WriteVarVecShift]>;
  def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
             (ins VR256:$src1, i256mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR256:$dst,
               (vt256 (OpNode VR256:$src1,
                       (vt256 (bitconvert (loadv4i64 addr:$src2))))))]>,
             VEX_4V, VEX_L, Sched<[WriteVarVecShiftLd, ReadAfterLd]>;
}

defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", shl, v4i32, v8i32>;
defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", shl, v2i64, v4i64>, VEX_W;
defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", srl, v4i32, v8i32>;
defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", srl, v2i64, v4i64>, VEX_W;
defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", sra, v4i32, v8i32>;

//===----------------------------------------------------------------------===//
// VGATHER - GATHER Operations
// Gathers have no selection patterns; they are matched manually.  Each form
// produces both the gathered data ($dst) and the written-back mask
// ($mask_wb), tied to $src1/$mask by the Constraints below.
multiclass avx2_gather<bits<8> opc, string OpcodeStr, RegisterClass RC256,
                       X86MemOperand memop128, X86MemOperand memop256> {
  def rm  : AVX28I<opc, MRMSrcMem, (outs VR128:$dst, VR128:$mask_wb),
            (ins VR128:$src1, memop128:$src2, VR128:$mask),
            !strconcat(OpcodeStr,
              "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
            []>, VEX_4VOp3;
  def Yrm : AVX28I<opc, MRMSrcMem, (outs RC256:$dst, RC256:$mask_wb),
            (ins RC256:$src1, memop256:$src2, RC256:$mask),
            !strconcat(OpcodeStr,
              "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
            []>, VEX_4VOp3, VEX_L;
}

let mayLoad = 1, Constraints
  = "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb"
  in {
  defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", VR256, vx64mem, vx64mem>, VEX_W;
  defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", VR256, vx64mem, vy64mem>, VEX_W;
  defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", VR256, vx32mem, vy32mem>;
  defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", VR128, vx32mem, vy32mem>;

  let ExeDomain = SSEPackedDouble in {
    defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", VR256, vx64mem, vx64mem>, VEX_W;
    defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", VR256, vx64mem, vy64mem>, VEX_W;
  }

  let ExeDomain = SSEPackedSingle in {
    defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", VR256, vx32mem, vy32mem>;
    defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", VR128, vx32mem, vy32mem>;
  }
}