; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=avx2,fma | FileCheck %s --check-prefixes=CHECK,X64
; RUN: llc < %s -mtriple=i686-- -mattr=avx2,fma | FileCheck %s --check-prefixes=CHECK,X86

; Each test extracts lane 0 of a vector FP operation; the checks verify the
; backend narrows the vector op to the corresponding scalar instruction
; (e.g. vaddss/vaddsd) on both the 64-bit and 32-bit targets.

define float @fneg_v4f32(<4 x float> %x) nounwind {
; X64-LABEL: fneg_v4f32:
; X64: # %bb.0:
; X64-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; X64-NEXT: vxorps %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: fneg_v4f32:
; X86: # %bb.0:
; X86-NEXT: pushl %eax
; X86-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; X86-NEXT: vxorps %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: retl
  %v = fneg <4 x float> %x
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}

define double @fneg_v4f64(<4 x double> %x) nounwind {
; X64-LABEL: fneg_v4f64:
; X64: # %bb.0:
; X64-NEXT: vmovddup {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0]
; X64-NEXT: # xmm1 = mem[0,0]
; X64-NEXT: vxorps %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
;
; X86-LABEL: fneg_v4f64:
; X86: # %bb.0:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: vmovddup {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0]
; X86-NEXT: # xmm1 = mem[0,0]
; X86-NEXT: vxorps %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovlps %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: vzeroupper
; X86-NEXT: retl
  %v = fneg <4 x double> %x
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}

define float @fadd_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
; X64-LABEL: fadd_v4f32:
; X64: # %bb.0:
; X64-NEXT: vaddss %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: fadd_v4f32:
; X86: # %bb.0:
; X86-NEXT: pushl %eax
; X86-NEXT: vaddss %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: retl
  %v = fadd <4 x float> %x, %y
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}

define double @fadd_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
; X64-LABEL: fadd_v4f64:
; X64: # %bb.0:
; X64-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
;
; X86-LABEL: fadd_v4f64:
; X86: # %bb.0:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: vzeroupper
; X86-NEXT: retl
  %v = fadd <4 x double> %x, %y
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}

define float @fsub_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
; X64-LABEL: fsub_v4f32:
; X64: # %bb.0:
; X64-NEXT: vsubss %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: fsub_v4f32:
; X86: # %bb.0:
; X86-NEXT: pushl %eax
; X86-NEXT: vsubss %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: retl
  %v = fsub <4 x float> %x, %y
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}

define double @fsub_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
; X64-LABEL: fsub_v4f64:
; X64: # %bb.0:
; X64-NEXT: vsubsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
;
; X86-LABEL: fsub_v4f64:
; X86: # %bb.0:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: vsubsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: vzeroupper
; X86-NEXT: retl
  %v = fsub <4 x double> %x, %y
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}

define float @fmul_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
; X64-LABEL: fmul_v4f32:
; X64: # %bb.0:
; X64-NEXT: vmulss %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: fmul_v4f32:
; X86: # %bb.0:
; X86-NEXT: pushl %eax
; X86-NEXT: vmulss %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: retl
  %v = fmul <4 x float> %x, %y
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}

define double @fmul_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
; X64-LABEL: fmul_v4f64:
; X64: # %bb.0:
; X64-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
;
; X86-LABEL: fmul_v4f64:
; X86: # %bb.0:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: vzeroupper
; X86-NEXT: retl
  %v = fmul <4 x double> %x, %y
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}

define float @fdiv_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
; X64-LABEL: fdiv_v4f32:
; X64: # %bb.0:
; X64-NEXT: vdivss %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: fdiv_v4f32:
; X86: # %bb.0:
; X86-NEXT: pushl %eax
; X86-NEXT: vdivss %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: retl
  %v = fdiv <4 x float> %x, %y
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}

define double @fdiv_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
; X64-LABEL: fdiv_v4f64:
; X64: # %bb.0:
; X64-NEXT: vdivsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
;
; X86-LABEL: fdiv_v4f64:
; X86: # %bb.0:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: vdivsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: vzeroupper
; X86-NEXT: retl
  %v = fdiv <4 x double> %x, %y
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}

define float @frem_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
; X64-LABEL: frem_v4f32:
; X64: # %bb.0:
; X64-NEXT: jmp fmodf@PLT # TAILCALL
;
; X86-LABEL: frem_v4f32:
; X86: # %bb.0:
; X86-NEXT: subl $8, %esp
; X86-NEXT: vmovss %xmm1, {{[0-9]+}}(%esp)
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: calll fmodf
; X86-NEXT: addl $8, %esp
; X86-NEXT: retl
  %v = frem <4 x float> %x, %y
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}

define double @frem_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
; X64-LABEL: frem_v4f64:
; X64: # %bb.0:
; X64-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; X64-NEXT: # kill: def $xmm1 killed $xmm1 killed $ymm1
; X64-NEXT: vzeroupper
; X64-NEXT: jmp fmod@PLT # TAILCALL
;
; X86-LABEL: frem_v4f64:
; X86: # %bb.0:
; X86-NEXT: subl $16, %esp
; X86-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X86-NEXT: vmovups %xmm0, (%esp)
; X86-NEXT: vzeroupper
; X86-NEXT: calll fmod
; X86-NEXT: addl $16, %esp
; X86-NEXT: retl
  %v = frem <4 x double> %x, %y
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}

define i1 @fcmp_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
; CHECK-LABEL: fcmp_v4f32:
; CHECK: # %bb.0:
; CHECK-NEXT: vucomiss %xmm1, %xmm0
; CHECK-NEXT: seta %al
; CHECK-NEXT: ret{{[l|q]}}
  %v = fcmp ogt <4 x float> %x, %y
  %r = extractelement <4 x i1> %v, i32 0
  ret i1 %r
}

define i1 @fcmp_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
; CHECK-LABEL: fcmp_v4f64:
; CHECK: # %bb.0:
; CHECK-NEXT: vucomisd %xmm0, %xmm1
; CHECK-NEXT: setb %al
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
  %v = fcmp ugt <4 x double> %x, %y
  %r = extractelement <4 x i1> %v, i32 0
  ret i1 %r
}

; If we do the fcmp transform late, make sure we have the right types.
; https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=13700

define void @extsetcc(<4 x float> %x) {
; X64-LABEL: extsetcc:
; X64: # %bb.0:
; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT: vucomiss %xmm1, %xmm0
; X64-NEXT: setb (%rax)
; X64-NEXT: retq
;
; X86-LABEL: extsetcc:
; X86: # %bb.0:
; X86-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X86-NEXT: vucomiss %xmm1, %xmm0
; X86-NEXT: setb (%eax)
; X86-NEXT: retl
  %cmp = fcmp ult <4 x float> %x, zeroinitializer
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %e = extractelement <4 x i1> %cmp, i1 0
  store i1 %e, i1* undef
  ret void
}

; This used to crash by creating a setcc with an i64 condition on a 32-bit target.
319define <3 x double> @extvselectsetcc_crash(<2 x double> %x) { 320; X64-LABEL: extvselectsetcc_crash: 321; X64: # %bb.0: 322; X64-NEXT: vcmpeqpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 323; X64-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero 324; X64-NEXT: vandpd %xmm2, %xmm1, %xmm1 325; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 326; X64-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] 327; X64-NEXT: retq 328; 329; X86-LABEL: extvselectsetcc_crash: 330; X86: # %bb.0: 331; X86-NEXT: vcmpeqpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1 332; X86-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero 333; X86-NEXT: vandpd %xmm2, %xmm1, %xmm1 334; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 335; X86-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] 336; X86-NEXT: retl 337 %cmp = fcmp oeq <2 x double> %x, <double 5.0, double 5.0> 338 %s = select <2 x i1> %cmp, <2 x double> <double 1.0, double undef>, <2 x double> <double 0.0, double undef> 339 %r = shufflevector <2 x double> %s, <2 x double> %x, <3 x i32> <i32 0, i32 2, i32 3> 340 ret <3 x double> %r 341} 342 343define float @select_fcmp_v4f32(<4 x float> %x, <4 x float> %y, <4 x float> %z, <4 x float> %w) nounwind { 344; X64-LABEL: select_fcmp_v4f32: 345; X64: # %bb.0: 346; X64-NEXT: vcmpneq_oqss %xmm1, %xmm0, %xmm0 347; X64-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0 348; X64-NEXT: retq 349; 350; X86-LABEL: select_fcmp_v4f32: 351; X86: # %bb.0: 352; X86-NEXT: pushl %ebp 353; X86-NEXT: movl %esp, %ebp 354; X86-NEXT: andl $-16, %esp 355; X86-NEXT: subl $16, %esp 356; X86-NEXT: vmovaps 8(%ebp), %xmm3 357; X86-NEXT: vcmpneq_oqss %xmm1, %xmm0, %xmm0 358; X86-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0 359; X86-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) 360; X86-NEXT: flds {{[0-9]+}}(%esp) 361; X86-NEXT: movl %ebp, %esp 362; X86-NEXT: popl %ebp 363; X86-NEXT: retl 364 %c = fcmp one <4 x float> %x, %y 365 %s = select <4 x i1> %c, <4 x float> %z, <4 x float> %w 366 %r = extractelement <4 x float> %s, i32 0 367 ret float %r 368} 369 370define double 
@select_fcmp_v4f64(<4 x double> %x, <4 x double> %y, <4 x double> %z, <4 x double> %w) nounwind { 371; X64-LABEL: select_fcmp_v4f64: 372; X64: # %bb.0: 373; X64-NEXT: vcmpnltsd %xmm0, %xmm1, %xmm0 374; X64-NEXT: vblendvpd %xmm0, %xmm2, %xmm3, %xmm0 375; X64-NEXT: vzeroupper 376; X64-NEXT: retq 377; 378; X86-LABEL: select_fcmp_v4f64: 379; X86: # %bb.0: 380; X86-NEXT: pushl %ebp 381; X86-NEXT: movl %esp, %ebp 382; X86-NEXT: andl $-32, %esp 383; X86-NEXT: subl $32, %esp 384; X86-NEXT: vcmpnltsd %xmm0, %xmm1, %xmm0 385; X86-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero 386; X86-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0 387; X86-NEXT: vmovlpd %xmm0, {{[0-9]+}}(%esp) 388; X86-NEXT: fldl {{[0-9]+}}(%esp) 389; X86-NEXT: movl %ebp, %esp 390; X86-NEXT: popl %ebp 391; X86-NEXT: vzeroupper 392; X86-NEXT: retl 393 %c = fcmp ule <4 x double> %x, %y 394 %s = select <4 x i1> %c, <4 x double> %z, <4 x double> %w 395 %r = extractelement <4 x double> %s, i32 0 396 ret double %r 397} 398 399define float @fsqrt_v4f32(<4 x float> %x) nounwind { 400; X64-LABEL: fsqrt_v4f32: 401; X64: # %bb.0: 402; X64-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 403; X64-NEXT: retq 404; 405; X86-LABEL: fsqrt_v4f32: 406; X86: # %bb.0: 407; X86-NEXT: pushl %eax 408; X86-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 409; X86-NEXT: vmovss %xmm0, (%esp) 410; X86-NEXT: flds (%esp) 411; X86-NEXT: popl %eax 412; X86-NEXT: retl 413 %v = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x) 414 %r = extractelement <4 x float> %v, i32 0 415 ret float %r 416} 417 418define double @fsqrt_v4f64(<4 x double> %x) nounwind { 419; X64-LABEL: fsqrt_v4f64: 420; X64: # %bb.0: 421; X64-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 422; X64-NEXT: vzeroupper 423; X64-NEXT: retq 424; 425; X86-LABEL: fsqrt_v4f64: 426; X86: # %bb.0: 427; X86-NEXT: pushl %ebp 428; X86-NEXT: movl %esp, %ebp 429; X86-NEXT: andl $-8, %esp 430; X86-NEXT: subl $8, %esp 431; X86-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 432; X86-NEXT: vmovsd %xmm0, (%esp) 433; X86-NEXT: fldl (%esp) 434; X86-NEXT: movl 
%ebp, %esp 435; X86-NEXT: popl %ebp 436; X86-NEXT: vzeroupper 437; X86-NEXT: retl 438 %v = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %x) 439 %r = extractelement <4 x double> %v, i32 0 440 ret double %r 441} 442 443define float @fsin_v4f32(<4 x float> %x) nounwind { 444; X64-LABEL: fsin_v4f32: 445; X64: # %bb.0: 446; X64-NEXT: jmp sinf@PLT # TAILCALL 447; 448; X86-LABEL: fsin_v4f32: 449; X86: # %bb.0: 450; X86-NEXT: pushl %eax 451; X86-NEXT: vmovss %xmm0, (%esp) 452; X86-NEXT: calll sinf 453; X86-NEXT: popl %eax 454; X86-NEXT: retl 455 %v = call <4 x float> @llvm.sin.v4f32(<4 x float> %x) 456 %r = extractelement <4 x float> %v, i32 0 457 ret float %r 458} 459 460define double @fsin_v4f64(<4 x double> %x) nounwind { 461; X64-LABEL: fsin_v4f64: 462; X64: # %bb.0: 463; X64-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 464; X64-NEXT: vzeroupper 465; X64-NEXT: jmp sin@PLT # TAILCALL 466; 467; X86-LABEL: fsin_v4f64: 468; X86: # %bb.0: 469; X86-NEXT: subl $8, %esp 470; X86-NEXT: vmovlps %xmm0, (%esp) 471; X86-NEXT: vzeroupper 472; X86-NEXT: calll sin 473; X86-NEXT: addl $8, %esp 474; X86-NEXT: retl 475 %v = call <4 x double> @llvm.sin.v4f64(<4 x double> %x) 476 %r = extractelement <4 x double> %v, i32 0 477 ret double %r 478} 479 480define float @fma_v4f32(<4 x float> %x, <4 x float> %y, <4 x float> %z) nounwind { 481; X64-LABEL: fma_v4f32: 482; X64: # %bb.0: 483; X64-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 484; X64-NEXT: retq 485; 486; X86-LABEL: fma_v4f32: 487; X86: # %bb.0: 488; X86-NEXT: pushl %eax 489; X86-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 490; X86-NEXT: vmovss %xmm0, (%esp) 491; X86-NEXT: flds (%esp) 492; X86-NEXT: popl %eax 493; X86-NEXT: retl 494 %v = call <4 x float> @llvm.fma.v4f32(<4 x float> %x, <4 x float> %y, <4 x float> %z) 495 %r = extractelement <4 x float> %v, i32 0 496 ret float %r 497} 498 499define double @fma_v4f64(<4 x double> %x, <4 x double> %y, <4 x double> %z) nounwind { 500; X64-LABEL: fma_v4f64: 
501; X64: # %bb.0: 502; X64-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 503; X64-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 504; X64-NEXT: vzeroupper 505; X64-NEXT: retq 506; 507; X86-LABEL: fma_v4f64: 508; X86: # %bb.0: 509; X86-NEXT: pushl %ebp 510; X86-NEXT: movl %esp, %ebp 511; X86-NEXT: andl $-8, %esp 512; X86-NEXT: subl $8, %esp 513; X86-NEXT: vfmadd213sd {{.*#+}} xmm1 = (xmm0 * xmm1) + xmm2 514; X86-NEXT: vmovsd %xmm1, (%esp) 515; X86-NEXT: fldl (%esp) 516; X86-NEXT: movl %ebp, %esp 517; X86-NEXT: popl %ebp 518; X86-NEXT: vzeroupper 519; X86-NEXT: retl 520 %v = call <4 x double> @llvm.fma.v4f64(<4 x double> %x, <4 x double> %y, <4 x double> %z) 521 %r = extractelement <4 x double> %v, i32 0 522 ret double %r 523} 524 525define float @fabs_v4f32(<4 x float> %x) nounwind { 526; X64-LABEL: fabs_v4f32: 527; X64: # %bb.0: 528; X64-NEXT: vbroadcastss {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN] 529; X64-NEXT: vandps %xmm1, %xmm0, %xmm0 530; X64-NEXT: retq 531; 532; X86-LABEL: fabs_v4f32: 533; X86: # %bb.0: 534; X86-NEXT: pushl %eax 535; X86-NEXT: vbroadcastss {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN] 536; X86-NEXT: vandps %xmm1, %xmm0, %xmm0 537; X86-NEXT: vmovss %xmm0, (%esp) 538; X86-NEXT: flds (%esp) 539; X86-NEXT: popl %eax 540; X86-NEXT: retl 541 %v = call <4 x float> @llvm.fabs.v4f32(<4 x float> %x) 542 %r = extractelement <4 x float> %v, i32 0 543 ret float %r 544} 545 546define double @fabs_v4f64(<4 x double> %x) nounwind { 547; X64-LABEL: fabs_v4f64: 548; X64: # %bb.0: 549; X64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 550; X64-NEXT: vzeroupper 551; X64-NEXT: retq 552; 553; X86-LABEL: fabs_v4f64: 554; X86: # %bb.0: 555; X86-NEXT: pushl %ebp 556; X86-NEXT: movl %esp, %ebp 557; X86-NEXT: andl $-8, %esp 558; X86-NEXT: subl $8, %esp 559; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 560; X86-NEXT: vmovlps %xmm0, (%esp) 561; X86-NEXT: fldl (%esp) 562; X86-NEXT: movl %ebp, %esp 563; X86-NEXT: popl %ebp 564; X86-NEXT: vzeroupper 565; 
X86-NEXT: retl 566 %v = call <4 x double> @llvm.fabs.v4f64(<4 x double> %x) 567 %r = extractelement <4 x double> %v, i32 0 568 ret double %r 569} 570 571define float @fmaxnum_v4f32(<4 x float> %x, <4 x float> %y) nounwind { 572; X64-LABEL: fmaxnum_v4f32: 573; X64: # %bb.0: 574; X64-NEXT: vmaxss %xmm0, %xmm1, %xmm2 575; X64-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0 576; X64-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 577; X64-NEXT: retq 578; 579; X86-LABEL: fmaxnum_v4f32: 580; X86: # %bb.0: 581; X86-NEXT: pushl %eax 582; X86-NEXT: vmaxss %xmm0, %xmm1, %xmm2 583; X86-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0 584; X86-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 585; X86-NEXT: vmovss %xmm0, (%esp) 586; X86-NEXT: flds (%esp) 587; X86-NEXT: popl %eax 588; X86-NEXT: retl 589 %v = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %x, <4 x float> %y) 590 %r = extractelement <4 x float> %v, i32 0 591 ret float %r 592} 593 594define double @fmaxnum_v4f64(<4 x double> %x, <4 x double> %y) nounwind { 595; X64-LABEL: fmaxnum_v4f64: 596; X64: # %bb.0: 597; X64-NEXT: vmaxsd %xmm0, %xmm1, %xmm2 598; X64-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm0 599; X64-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0 600; X64-NEXT: vzeroupper 601; X64-NEXT: retq 602; 603; X86-LABEL: fmaxnum_v4f64: 604; X86: # %bb.0: 605; X86-NEXT: pushl %ebp 606; X86-NEXT: movl %esp, %ebp 607; X86-NEXT: andl $-8, %esp 608; X86-NEXT: subl $8, %esp 609; X86-NEXT: vmaxsd %xmm0, %xmm1, %xmm2 610; X86-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm0 611; X86-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0 612; X86-NEXT: vmovlpd %xmm0, (%esp) 613; X86-NEXT: fldl (%esp) 614; X86-NEXT: movl %ebp, %esp 615; X86-NEXT: popl %ebp 616; X86-NEXT: vzeroupper 617; X86-NEXT: retl 618 %v = call <4 x double> @llvm.maxnum.v4f64(<4 x double> %x, <4 x double> %y) 619 %r = extractelement <4 x double> %v, i32 0 620 ret double %r 621} 622 623define float @fminnum_v4f32(<4 x float> %x, <4 x float> %y) nounwind { 624; X64-LABEL: fminnum_v4f32: 625; X64: # %bb.0: 626; X64-NEXT: 
vminss %xmm0, %xmm1, %xmm2 627; X64-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0 628; X64-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 629; X64-NEXT: retq 630; 631; X86-LABEL: fminnum_v4f32: 632; X86: # %bb.0: 633; X86-NEXT: pushl %eax 634; X86-NEXT: vminss %xmm0, %xmm1, %xmm2 635; X86-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0 636; X86-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 637; X86-NEXT: vmovss %xmm0, (%esp) 638; X86-NEXT: flds (%esp) 639; X86-NEXT: popl %eax 640; X86-NEXT: retl 641 %v = call <4 x float> @llvm.minnum.v4f32(<4 x float> %x, <4 x float> %y) 642 %r = extractelement <4 x float> %v, i32 0 643 ret float %r 644} 645 646define double @fminnum_v4f64(<4 x double> %x, <4 x double> %y) nounwind { 647; X64-LABEL: fminnum_v4f64: 648; X64: # %bb.0: 649; X64-NEXT: vminsd %xmm0, %xmm1, %xmm2 650; X64-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm0 651; X64-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0 652; X64-NEXT: vzeroupper 653; X64-NEXT: retq 654; 655; X86-LABEL: fminnum_v4f64: 656; X86: # %bb.0: 657; X86-NEXT: pushl %ebp 658; X86-NEXT: movl %esp, %ebp 659; X86-NEXT: andl $-8, %esp 660; X86-NEXT: subl $8, %esp 661; X86-NEXT: vminsd %xmm0, %xmm1, %xmm2 662; X86-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm0 663; X86-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0 664; X86-NEXT: vmovlpd %xmm0, (%esp) 665; X86-NEXT: fldl (%esp) 666; X86-NEXT: movl %ebp, %esp 667; X86-NEXT: popl %ebp 668; X86-NEXT: vzeroupper 669; X86-NEXT: retl 670 %v = call <4 x double> @llvm.minnum.v4f64(<4 x double> %x, <4 x double> %y) 671 %r = extractelement <4 x double> %v, i32 0 672 ret double %r 673} 674 675;define float @fmaximum_v4f32(<4 x float> %x, <4 x float> %y) nounwind { 676; %v = call <4 x float> @llvm.maximum.v4f32(<4 x float> %x, <4 x float> %y) 677; %r = extractelement <4 x float> %v, i32 0 678; ret float %r 679;} 680 681;define double @fmaximum_v4f64(<4 x double> %x, <4 x double> %y) nounwind { 682; %v = call <4 x double> @llvm.maximum.v4f64(<4 x double> %x, <4 x double> %y) 683; %r = extractelement <4 x double> 
%v, i32 0 684; ret double %r 685;} 686 687;define float @fminimum_v4f32(<4 x float> %x, <4 x float> %y) nounwind { 688; %v = call <4 x float> @llvm.minimum.v4f32(<4 x float> %x, <4 x float> %y) 689; %r = extractelement <4 x float> %v, i32 0 690; ret float %r 691;} 692 693;define double @fminimum_v4f64(<4 x double> %x, <4 x double> %y) nounwind { 694; %v = call <4 x double> @llvm.minimum.v4f64(<4 x double> %x, <4 x double> %y) 695; %r = extractelement <4 x double> %v, i32 0 696; ret double %r 697;} 698 699define float @maxps_v4f32(<4 x float> %x, <4 x float> %y) nounwind { 700; X64-LABEL: maxps_v4f32: 701; X64: # %bb.0: 702; X64-NEXT: vmaxss %xmm1, %xmm0, %xmm0 703; X64-NEXT: retq 704; 705; X86-LABEL: maxps_v4f32: 706; X86: # %bb.0: 707; X86-NEXT: pushl %eax 708; X86-NEXT: vmaxss %xmm1, %xmm0, %xmm0 709; X86-NEXT: vmovss %xmm0, (%esp) 710; X86-NEXT: flds (%esp) 711; X86-NEXT: popl %eax 712; X86-NEXT: retl 713 %cmp = fcmp ogt <4 x float> %x, %y 714 %v = select <4 x i1> %cmp, <4 x float> %x, <4 x float> %y 715 %r = extractelement <4 x float> %v, i32 0 716 ret float %r 717} 718 719define double @maxpd_v4f64(<4 x double> %x, <4 x double> %y) nounwind { 720; X64-LABEL: maxpd_v4f64: 721; X64: # %bb.0: 722; X64-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 723; X64-NEXT: vzeroupper 724; X64-NEXT: retq 725; 726; X86-LABEL: maxpd_v4f64: 727; X86: # %bb.0: 728; X86-NEXT: pushl %ebp 729; X86-NEXT: movl %esp, %ebp 730; X86-NEXT: andl $-8, %esp 731; X86-NEXT: subl $8, %esp 732; X86-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 733; X86-NEXT: vmovsd %xmm0, (%esp) 734; X86-NEXT: fldl (%esp) 735; X86-NEXT: movl %ebp, %esp 736; X86-NEXT: popl %ebp 737; X86-NEXT: vzeroupper 738; X86-NEXT: retl 739 %cmp = fcmp ogt <4 x double> %x, %y 740 %v = select <4 x i1> %cmp, <4 x double> %x, <4 x double> %y 741 %r = extractelement <4 x double> %v, i32 0 742 ret double %r 743} 744 745define float @minps_v4f32(<4 x float> %x, <4 x float> %y) nounwind { 746; X64-LABEL: minps_v4f32: 747; X64: # %bb.0: 748; X64-NEXT: vminss 
%xmm1, %xmm0, %xmm0 749; X64-NEXT: retq 750; 751; X86-LABEL: minps_v4f32: 752; X86: # %bb.0: 753; X86-NEXT: pushl %eax 754; X86-NEXT: vminss %xmm1, %xmm0, %xmm0 755; X86-NEXT: vmovss %xmm0, (%esp) 756; X86-NEXT: flds (%esp) 757; X86-NEXT: popl %eax 758; X86-NEXT: retl 759 %cmp = fcmp olt <4 x float> %x, %y 760 %v = select <4 x i1> %cmp, <4 x float> %x, <4 x float> %y 761 %r = extractelement <4 x float> %v, i32 0 762 ret float %r 763} 764 765define double @minpd_v4f64(<4 x double> %x, <4 x double> %y) nounwind { 766; X64-LABEL: minpd_v4f64: 767; X64: # %bb.0: 768; X64-NEXT: vminsd %xmm1, %xmm0, %xmm0 769; X64-NEXT: vzeroupper 770; X64-NEXT: retq 771; 772; X86-LABEL: minpd_v4f64: 773; X86: # %bb.0: 774; X86-NEXT: pushl %ebp 775; X86-NEXT: movl %esp, %ebp 776; X86-NEXT: andl $-8, %esp 777; X86-NEXT: subl $8, %esp 778; X86-NEXT: vminsd %xmm1, %xmm0, %xmm0 779; X86-NEXT: vmovsd %xmm0, (%esp) 780; X86-NEXT: fldl (%esp) 781; X86-NEXT: movl %ebp, %esp 782; X86-NEXT: popl %ebp 783; X86-NEXT: vzeroupper 784; X86-NEXT: retl 785 %cmp = fcmp olt <4 x double> %x, %y 786 %v = select <4 x i1> %cmp, <4 x double> %x, <4 x double> %y 787 %r = extractelement <4 x double> %v, i32 0 788 ret double %r 789} 790 791define float @copysign_v4f32(<4 x float> %x, <4 x float> %y) nounwind { 792; X64-LABEL: copysign_v4f32: 793; X64: # %bb.0: 794; X64-NEXT: vbroadcastss {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] 795; X64-NEXT: vandps %xmm2, %xmm1, %xmm1 796; X64-NEXT: vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN] 797; X64-NEXT: vandps %xmm2, %xmm0, %xmm0 798; X64-NEXT: vorps %xmm1, %xmm0, %xmm0 799; X64-NEXT: retq 800; 801; X86-LABEL: copysign_v4f32: 802; X86: # %bb.0: 803; X86-NEXT: pushl %eax 804; X86-NEXT: vbroadcastss {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] 805; X86-NEXT: vandps %xmm2, %xmm1, %xmm1 806; X86-NEXT: vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN] 807; X86-NEXT: vandps %xmm2, %xmm0, %xmm0 808; X86-NEXT: vorps %xmm1, %xmm0, %xmm0 809; X86-NEXT: vmovss %xmm0, 
(%esp) 810; X86-NEXT: flds (%esp) 811; X86-NEXT: popl %eax 812; X86-NEXT: retl 813 %v = call <4 x float> @llvm.copysign.v4f32(<4 x float> %x, <4 x float> %y) 814 %r = extractelement <4 x float> %v, i32 0 815 ret float %r 816} 817 818define double @copysign_v4f64(<4 x double> %x, <4 x double> %y) nounwind { 819; X64-LABEL: copysign_v4f64: 820; X64: # %bb.0: 821; X64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 822; X64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 823; X64-NEXT: vorps %xmm1, %xmm0, %xmm0 824; X64-NEXT: vzeroupper 825; X64-NEXT: retq 826; 827; X86-LABEL: copysign_v4f64: 828; X86: # %bb.0: 829; X86-NEXT: pushl %ebp 830; X86-NEXT: movl %esp, %ebp 831; X86-NEXT: andl $-8, %esp 832; X86-NEXT: subl $8, %esp 833; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 834; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 835; X86-NEXT: vorps %xmm1, %xmm0, %xmm0 836; X86-NEXT: vmovlps %xmm0, (%esp) 837; X86-NEXT: fldl (%esp) 838; X86-NEXT: movl %ebp, %esp 839; X86-NEXT: popl %ebp 840; X86-NEXT: vzeroupper 841; X86-NEXT: retl 842 %v = call <4 x double> @llvm.copysign.v4f64(<4 x double> %x, <4 x double> %y) 843 %r = extractelement <4 x double> %v, i32 0 844 ret double %r 845} 846 847define float @floor_v4f32(<4 x float> %x) nounwind { 848; X64-LABEL: floor_v4f32: 849; X64: # %bb.0: 850; X64-NEXT: vroundss $9, %xmm0, %xmm0, %xmm0 851; X64-NEXT: retq 852; 853; X86-LABEL: floor_v4f32: 854; X86: # %bb.0: 855; X86-NEXT: pushl %eax 856; X86-NEXT: vroundss $9, %xmm0, %xmm0, %xmm0 857; X86-NEXT: vmovss %xmm0, (%esp) 858; X86-NEXT: flds (%esp) 859; X86-NEXT: popl %eax 860; X86-NEXT: retl 861 %v = call <4 x float> @llvm.floor.v4f32(<4 x float> %x) 862 %r = extractelement <4 x float> %v, i32 0 863 ret float %r 864} 865 866define double @floor_v4f64(<4 x double> %x) nounwind { 867; X64-LABEL: floor_v4f64: 868; X64: # %bb.0: 869; X64-NEXT: vroundsd $9, %xmm0, %xmm0, %xmm0 870; X64-NEXT: vzeroupper 871; X64-NEXT: retq 872; 873; X86-LABEL: 
floor_v4f64: 874; X86: # %bb.0: 875; X86-NEXT: pushl %ebp 876; X86-NEXT: movl %esp, %ebp 877; X86-NEXT: andl $-8, %esp 878; X86-NEXT: subl $8, %esp 879; X86-NEXT: vroundsd $9, %xmm0, %xmm0, %xmm0 880; X86-NEXT: vmovsd %xmm0, (%esp) 881; X86-NEXT: fldl (%esp) 882; X86-NEXT: movl %ebp, %esp 883; X86-NEXT: popl %ebp 884; X86-NEXT: vzeroupper 885; X86-NEXT: retl 886 %v = call <4 x double> @llvm.floor.v4f64(<4 x double> %x) 887 %r = extractelement <4 x double> %v, i32 0 888 ret double %r 889} 890 891define float @ceil_v4f32(<4 x float> %x) nounwind { 892; X64-LABEL: ceil_v4f32: 893; X64: # %bb.0: 894; X64-NEXT: vroundss $10, %xmm0, %xmm0, %xmm0 895; X64-NEXT: retq 896; 897; X86-LABEL: ceil_v4f32: 898; X86: # %bb.0: 899; X86-NEXT: pushl %eax 900; X86-NEXT: vroundss $10, %xmm0, %xmm0, %xmm0 901; X86-NEXT: vmovss %xmm0, (%esp) 902; X86-NEXT: flds (%esp) 903; X86-NEXT: popl %eax 904; X86-NEXT: retl 905 %v = call <4 x float> @llvm.ceil.v4f32(<4 x float> %x) 906 %r = extractelement <4 x float> %v, i32 0 907 ret float %r 908} 909 910define double @ceil_v4f64(<4 x double> %x) nounwind { 911; X64-LABEL: ceil_v4f64: 912; X64: # %bb.0: 913; X64-NEXT: vroundsd $10, %xmm0, %xmm0, %xmm0 914; X64-NEXT: vzeroupper 915; X64-NEXT: retq 916; 917; X86-LABEL: ceil_v4f64: 918; X86: # %bb.0: 919; X86-NEXT: pushl %ebp 920; X86-NEXT: movl %esp, %ebp 921; X86-NEXT: andl $-8, %esp 922; X86-NEXT: subl $8, %esp 923; X86-NEXT: vroundsd $10, %xmm0, %xmm0, %xmm0 924; X86-NEXT: vmovsd %xmm0, (%esp) 925; X86-NEXT: fldl (%esp) 926; X86-NEXT: movl %ebp, %esp 927; X86-NEXT: popl %ebp 928; X86-NEXT: vzeroupper 929; X86-NEXT: retl 930 %v = call <4 x double> @llvm.ceil.v4f64(<4 x double> %x) 931 %r = extractelement <4 x double> %v, i32 0 932 ret double %r 933} 934 935define float @trunc_v4f32(<4 x float> %x) nounwind { 936; X64-LABEL: trunc_v4f32: 937; X64: # %bb.0: 938; X64-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 939; X64-NEXT: retq 940; 941; X86-LABEL: trunc_v4f32: 942; X86: # %bb.0: 943; X86-NEXT: pushl 
%eax 944; X86-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 945; X86-NEXT: vmovss %xmm0, (%esp) 946; X86-NEXT: flds (%esp) 947; X86-NEXT: popl %eax 948; X86-NEXT: retl 949 %v = call <4 x float> @llvm.trunc.v4f32(<4 x float> %x) 950 %r = extractelement <4 x float> %v, i32 0 951 ret float %r 952} 953 954define double @trunc_v4f64(<4 x double> %x) nounwind { 955; X64-LABEL: trunc_v4f64: 956; X64: # %bb.0: 957; X64-NEXT: vroundsd $11, %xmm0, %xmm0, %xmm0 958; X64-NEXT: vzeroupper 959; X64-NEXT: retq 960; 961; X86-LABEL: trunc_v4f64: 962; X86: # %bb.0: 963; X86-NEXT: pushl %ebp 964; X86-NEXT: movl %esp, %ebp 965; X86-NEXT: andl $-8, %esp 966; X86-NEXT: subl $8, %esp 967; X86-NEXT: vroundsd $11, %xmm0, %xmm0, %xmm0 968; X86-NEXT: vmovsd %xmm0, (%esp) 969; X86-NEXT: fldl (%esp) 970; X86-NEXT: movl %ebp, %esp 971; X86-NEXT: popl %ebp 972; X86-NEXT: vzeroupper 973; X86-NEXT: retl 974 %v = call <4 x double> @llvm.trunc.v4f64(<4 x double> %x) 975 %r = extractelement <4 x double> %v, i32 0 976 ret double %r 977} 978 979define float @rint_v4f32(<4 x float> %x) nounwind { 980; X64-LABEL: rint_v4f32: 981; X64: # %bb.0: 982; X64-NEXT: vroundss $4, %xmm0, %xmm0, %xmm0 983; X64-NEXT: retq 984; 985; X86-LABEL: rint_v4f32: 986; X86: # %bb.0: 987; X86-NEXT: pushl %eax 988; X86-NEXT: vroundss $4, %xmm0, %xmm0, %xmm0 989; X86-NEXT: vmovss %xmm0, (%esp) 990; X86-NEXT: flds (%esp) 991; X86-NEXT: popl %eax 992; X86-NEXT: retl 993 %v = call <4 x float> @llvm.rint.v4f32(<4 x float> %x) 994 %r = extractelement <4 x float> %v, i32 0 995 ret float %r 996} 997 998define double @rint_v4f64(<4 x double> %x) nounwind { 999; X64-LABEL: rint_v4f64: 1000; X64: # %bb.0: 1001; X64-NEXT: vroundsd $4, %xmm0, %xmm0, %xmm0 1002; X64-NEXT: vzeroupper 1003; X64-NEXT: retq 1004; 1005; X86-LABEL: rint_v4f64: 1006; X86: # %bb.0: 1007; X86-NEXT: pushl %ebp 1008; X86-NEXT: movl %esp, %ebp 1009; X86-NEXT: andl $-8, %esp 1010; X86-NEXT: subl $8, %esp 1011; X86-NEXT: vroundsd $4, %xmm0, %xmm0, %xmm0 1012; X86-NEXT: vmovsd 
%xmm0, (%esp) 1013; X86-NEXT: fldl (%esp) 1014; X86-NEXT: movl %ebp, %esp 1015; X86-NEXT: popl %ebp 1016; X86-NEXT: vzeroupper 1017; X86-NEXT: retl 1018 %v = call <4 x double> @llvm.rint.v4f64(<4 x double> %x) 1019 %r = extractelement <4 x double> %v, i32 0 1020 ret double %r 1021} 1022 1023define float @nearbyint_v4f32(<4 x float> %x) nounwind { 1024; X64-LABEL: nearbyint_v4f32: 1025; X64: # %bb.0: 1026; X64-NEXT: vroundss $12, %xmm0, %xmm0, %xmm0 1027; X64-NEXT: retq 1028; 1029; X86-LABEL: nearbyint_v4f32: 1030; X86: # %bb.0: 1031; X86-NEXT: pushl %eax 1032; X86-NEXT: vroundss $12, %xmm0, %xmm0, %xmm0 1033; X86-NEXT: vmovss %xmm0, (%esp) 1034; X86-NEXT: flds (%esp) 1035; X86-NEXT: popl %eax 1036; X86-NEXT: retl 1037 %v = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %x) 1038 %r = extractelement <4 x float> %v, i32 0 1039 ret float %r 1040} 1041 1042define double @nearbyint_v4f64(<4 x double> %x) nounwind { 1043; X64-LABEL: nearbyint_v4f64: 1044; X64: # %bb.0: 1045; X64-NEXT: vroundsd $12, %xmm0, %xmm0, %xmm0 1046; X64-NEXT: vzeroupper 1047; X64-NEXT: retq 1048; 1049; X86-LABEL: nearbyint_v4f64: 1050; X86: # %bb.0: 1051; X86-NEXT: pushl %ebp 1052; X86-NEXT: movl %esp, %ebp 1053; X86-NEXT: andl $-8, %esp 1054; X86-NEXT: subl $8, %esp 1055; X86-NEXT: vroundsd $12, %xmm0, %xmm0, %xmm0 1056; X86-NEXT: vmovsd %xmm0, (%esp) 1057; X86-NEXT: fldl (%esp) 1058; X86-NEXT: movl %ebp, %esp 1059; X86-NEXT: popl %ebp 1060; X86-NEXT: vzeroupper 1061; X86-NEXT: retl 1062 %v = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %x) 1063 %r = extractelement <4 x double> %v, i32 0 1064 ret double %r 1065} 1066 1067define float @round_v4f32(<4 x float> %x) nounwind { 1068; X64-LABEL: round_v4f32: 1069; X64: # %bb.0: 1070; X64-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] 1071; X64-NEXT: vandps %xmm1, %xmm0, %xmm1 1072; X64-NEXT: vbroadcastss {{.*#+}} xmm2 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1] 1073; X64-NEXT: vorps %xmm1, %xmm2, %xmm1 
; X64-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; X64-NEXT:    vroundss $11, %xmm0, %xmm0, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: round_v4f32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %eax
; X86-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; X86-NEXT:    vandps %xmm1, %xmm0, %xmm1
; X86-NEXT:    vbroadcastss {{.*#+}} xmm2 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
; X86-NEXT:    vorps %xmm1, %xmm2, %xmm1
; X86-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; X86-NEXT:    vroundss $11, %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    retl
  %v = call <4 x float> @llvm.round.v4f32(<4 x float> %x)
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}

; NOTE(review): same round expansion for f64, but the sign mask is loaded from
; the constant pool (vandpd {{\.?LCPI...}}) rather than broadcast, and the 0.5
; splat uses vmovddup. The regex for the constant-pool label is deliberate —
; keep it if these checks are ever hand-edited.
define double @round_v4f64(<4 x double> %x) nounwind {
; X64-LABEL: round_v4f64:
; X64:       # %bb.0:
; X64-NEXT:    vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; X64-NEXT:    vmovddup {{.*#+}} xmm2 = [4.9999999999999994E-1,4.9999999999999994E-1]
; X64-NEXT:    # xmm2 = mem[0,0]
; X64-NEXT:    vorpd %xmm1, %xmm2, %xmm1
; X64-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vroundsd $11, %xmm0, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
;
; X86-LABEL: round_v4f64:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    vandpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1
; X86-NEXT:    vmovddup {{.*#+}} xmm2 = [4.9999999999999994E-1,4.9999999999999994E-1]
; X86-NEXT:    # xmm2 = mem[0,0]
; X86-NEXT:    vorpd %xmm1, %xmm2, %xmm1
; X86-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vroundsd $11, %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovsd %xmm0, (%esp)
; X86-NEXT:    fldl (%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
  %v = call <4 x double> @llvm.round.v4f64(<4 x
double> %x)
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}

; NOTE(review): extracting lane 0 of the 128-bit x86 rcp/rsqrt intrinsics
; narrows to the scalar forms (vrcpss/vrsqrtss). The 256-bit variants below
; stay full-width (vrcpps/vrsqrtps on %ymm0) — presumably conservative
; handling of the target intrinsic rather than a correctness requirement;
; TODO(review) confirm whether narrowing those is a missed optimization.
define float @rcp_v4f32(<4 x float> %x) nounwind {
; X64-LABEL: rcp_v4f32:
; X64:       # %bb.0:
; X64-NEXT:    vrcpss %xmm0, %xmm0, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: rcp_v4f32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %eax
; X86-NEXT:    vrcpss %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    retl
  %v = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %x)
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}

define float @rcp_v8f32(<8 x float> %x) nounwind {
; X64-LABEL: rcp_v8f32:
; X64:       # %bb.0:
; X64-NEXT:    vrcpps %ymm0, %ymm0
; X64-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
;
; X86-LABEL: rcp_v8f32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %eax
; X86-NEXT:    vrcpps %ymm0, %ymm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
  %v = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %x)
  %r = extractelement <8 x float> %v, i32 0
  ret float %r
}

define float @rsqrt_v4f32(<4 x float> %x) nounwind {
; X64-LABEL: rsqrt_v4f32:
; X64:       # %bb.0:
; X64-NEXT:    vrsqrtss %xmm0, %xmm0, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: rsqrt_v4f32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %eax
; X86-NEXT:    vrsqrtss %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    retl
  %v = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %x)
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}

define float @rsqrt_v8f32(<8 x float> %x) nounwind {
; X64-LABEL: rsqrt_v8f32:
; X64:       # %bb.0:
; X64-NEXT:    vrsqrtps %ymm0, %ymm0
; X64-NEXT:    #
kill: def $xmm0 killed $xmm0 killed $ymm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
;
; X86-LABEL: rsqrt_v8f32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %eax
; X86-NEXT:    vrsqrtps %ymm0, %ymm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
  %v = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %x)
  %r = extractelement <8 x float> %v, i32 0
  ret float %r
}

; Declarations for the intrinsics exercised by the tests above.
declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)
declare <4 x double> @llvm.sqrt.v4f64(<4 x double>)
declare <4 x float> @llvm.sin.v4f32(<4 x float>)
declare <4 x double> @llvm.sin.v4f64(<4 x double>)
declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>)
declare <4 x float> @llvm.fabs.v4f32(<4 x float>)
declare <4 x double> @llvm.fabs.v4f64(<4 x double>)
declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>)
declare <4 x double> @llvm.maxnum.v4f64(<4 x double>, <4 x double>)
declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>)
declare <4 x double> @llvm.minnum.v4f64(<4 x double>, <4 x double>)
declare <4 x float> @llvm.maximum.v4f32(<4 x float>, <4 x float>)
declare <4 x double> @llvm.maximum.v4f64(<4 x double>, <4 x double>)
declare <4 x float> @llvm.minimum.v4f32(<4 x float>, <4 x float>)
declare <4 x double> @llvm.minimum.v4f64(<4 x double>, <4 x double>)
declare <4 x float> @llvm.copysign.v4f32(<4 x float>, <4 x float>)
declare <4 x double> @llvm.copysign.v4f64(<4 x double>, <4 x double>)
declare <4 x float> @llvm.floor.v4f32(<4 x float>)
declare <4 x double> @llvm.floor.v4f64(<4 x double>)
declare <4 x float> @llvm.ceil.v4f32(<4 x float>)
declare <4 x double> @llvm.ceil.v4f64(<4 x double>)
declare <4 x float> @llvm.trunc.v4f32(<4 x float>)
declare <4 x double> @llvm.trunc.v4f64(<4 x double>)
declare <4 x float> @llvm.rint.v4f32(<4 x float>)
declare <4 x double> @llvm.rint.v4f64(<4 x double>)
declare <4 x float> @llvm.nearbyint.v4f32(<4 x float>)
declare <4 x double> @llvm.nearbyint.v4f64(<4 x double>)
declare <4 x float> @llvm.round.v4f32(<4 x float>)
declare <4 x double> @llvm.round.v4f64(<4 x double>)

; Target-specific (non-generic) intrinsics.
declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>)
declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>)
declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>)
declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>)