; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE41
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512fp16,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512FP16

define half @roundeven_f16(half %h) {
; SSE2-LABEL: roundeven_f16:
; SSE2:       ## %bb.0: ## %entry
; SSE2-NEXT:    pushq %rax
; SSE2-NEXT:    .cfi_def_cfa_offset 16
; SSE2-NEXT:    movzwl %di, %edi
; SSE2-NEXT:    callq ___extendhfsf2
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    callq ___truncsfhf2
; SSE2-NEXT:    popq %rcx
; SSE2-NEXT:    retq
;
; SSE41-LABEL: roundeven_f16:
; SSE41:       ## %bb.0: ## %entry
; SSE41-NEXT:    pushq %rax
; SSE41-NEXT:    .cfi_def_cfa_offset 16
; SSE41-NEXT:    movzwl %di, %edi
; SSE41-NEXT:    callq ___extendhfsf2
; SSE41-NEXT:    roundss $8, %xmm0, %xmm0
; SSE41-NEXT:    callq ___truncsfhf2
; SSE41-NEXT:    popq %rcx
; SSE41-NEXT:    retq
;
; AVX1-LABEL: roundeven_f16:
; AVX1:       ## %bb.0: ## %entry
; AVX1-NEXT:    pushq %rax
; AVX1-NEXT:    .cfi_def_cfa_offset 16
; AVX1-NEXT:    movzwl %di, %edi
; AVX1-NEXT:    callq ___extendhfsf2
; AVX1-NEXT:    vroundss $8, %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    callq ___truncsfhf2
; AVX1-NEXT:    popq %rcx
; AVX1-NEXT:    retq
;
; AVX512F-LABEL: roundeven_f16:
; AVX512F:       ## %bb.0: ## %entry
; AVX512F-NEXT:    movzwl %di, %eax
; AVX512F-NEXT:    vmovd %eax, %xmm0
; AVX512F-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT:    vroundss $8, %xmm0, %xmm0, %xmm0
; AVX512F-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
; AVX512F-NEXT:    vmovd %xmm0, %eax
; AVX512F-NEXT:    ## kill: def $ax killed $ax killed $eax
; AVX512F-NEXT:    retq
;
; AVX512FP16-LABEL: roundeven_f16:
; AVX512FP16:       ## %bb.0: ## %entry
; AVX512FP16-NEXT:    vrndscalesh $8, %xmm0, %xmm0, %xmm0
; AVX512FP16-NEXT:    retq
entry:
  %a = call half @llvm.roundeven.f16(half %h)
  ret half %a
}

define float @roundeven_f32(float %x) {
; SSE2-LABEL: roundeven_f32:
; SSE2:       ## %bb.0:
; SSE2-NEXT:    jmp _roundevenf ## TAILCALL
;
; SSE41-LABEL: roundeven_f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundss $8, %xmm0, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: roundeven_f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundss $8, %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = call float @llvm.roundeven.f32(float %x)
  ret float %a
}

define double @roundeven_f64(double %x) {
; SSE2-LABEL: roundeven_f64:
; SSE2:       ## %bb.0:
; SSE2-NEXT:    jmp _roundeven ## TAILCALL
;
; SSE41-LABEL: roundeven_f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundsd $8, %xmm0, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: roundeven_f64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundsd $8, %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = call double @llvm.roundeven.f64(double %x)
  ret double %a
}

define <4 x float> @roundeven_v4f32(<4 x float> %x) {
; SSE2-LABEL: roundeven_v4f32:
; SSE2:       ## %bb.0:
; SSE2-NEXT:    subq $56, %rsp
; SSE2-NEXT:    .cfi_def_cfa_offset 64
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    unpcklps (%rsp), %xmm0 ## 16-byte Folded Reload
; SSE2-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE2-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    unpcklpd (%rsp), %xmm1 ## 16-byte Folded Reload
; SSE2-NEXT:    ## xmm1 = xmm1[0],mem[0]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    addq $56, %rsp
; SSE2-NEXT:    retq
;
; SSE41-LABEL: roundeven_v4f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundps $8, %xmm0, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: roundeven_v4f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundps $8, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %x)
  ret <4 x float> %a
}

define <2 x double> @roundeven_v2f64(<2 x double> %x) {
; SSE2-LABEL: roundeven_v2f64:
; SSE2:       ## %bb.0:
; SSE2-NEXT:    subq $40, %rsp
; SSE2-NEXT:    .cfi_def_cfa_offset 48
; SSE2-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    callq _roundeven
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq _roundeven
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    addq $40, %rsp
; SSE2-NEXT:    retq
;
; SSE41-LABEL: roundeven_v2f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundpd $8, %xmm0, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: roundeven_v2f64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundpd $8, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = call <2 x double> @llvm.roundeven.v2f64(<2 x double> %x)
  ret <2 x double> %a
}

define <8 x float> @roundeven_v8f32(<8 x float> %x) {
; SSE2-LABEL: roundeven_v8f32:
; SSE2:       ## %bb.0:
; SSE2-NEXT:    subq $72, %rsp
; SSE2-NEXT:    .cfi_def_cfa_offset 80
; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
; SSE2-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload
; SSE2-NEXT:    ## xmm1 = xmm1[0],mem[0]
; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    unpcklps (%rsp), %xmm0 ## 16-byte Folded Reload
; SSE2-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE2-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    unpcklpd (%rsp), %xmm1 ## 16-byte Folded Reload
; SSE2-NEXT:    ## xmm1 = xmm1[0],mem[0]
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    addq $72, %rsp
; SSE2-NEXT:    retq
;
; SSE41-LABEL: roundeven_v8f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundps $8, %xmm0, %xmm0
; SSE41-NEXT:    roundps $8, %xmm1, %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: roundeven_v8f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundps $8, %ymm0, %ymm0
; AVX-NEXT:    retq
  %a = call <8 x float> @llvm.roundeven.v8f32(<8 x float> %x)
  ret <8 x float> %a
}

define <4 x double> @roundeven_v4f64(<4 x double> %x) {
; SSE2-LABEL: roundeven_v4f64:
; SSE2:       ## %bb.0:
; SSE2-NEXT:    subq $56, %rsp
; SSE2-NEXT:    .cfi_def_cfa_offset 64
; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    callq _roundeven
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq _roundeven
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    callq _roundeven
; SSE2-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq _roundeven
; SSE2-NEXT:    movaps (%rsp), %xmm1 ## 16-byte Reload
; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    addq $56, %rsp
; SSE2-NEXT:    retq
;
; SSE41-LABEL: roundeven_v4f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundpd $8, %xmm0, %xmm0
; SSE41-NEXT:    roundpd $8, %xmm1, %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: roundeven_v4f64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundpd $8, %ymm0, %ymm0
; AVX-NEXT:    retq
  %a = call <4 x double> @llvm.roundeven.v4f64(<4 x double> %x)
  ret <4 x double> %a
}

define <16 x float> @roundeven_v16f32(<16 x float> %x) {
; SSE2-LABEL: roundeven_v16f32:
; SSE2:       ## %bb.0:
; SSE2-NEXT:    subq $104, %rsp
; SSE2-NEXT:    .cfi_def_cfa_offset 112
; SSE2-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps %xmm1, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
; SSE2-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload
; SSE2-NEXT:    ## xmm1 = xmm1[0],mem[0]
; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
; SSE2-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload
; SSE2-NEXT:    ## xmm1 = xmm1[0],mem[0]
; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    unpcklps (%rsp), %xmm0 ## 16-byte Folded Reload
; SSE2-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    movaps (%rsp), %xmm1 ## 16-byte Reload
; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload
; SSE2-NEXT:    ## xmm1 = xmm1[0],mem[0]
; SSE2-NEXT:    movaps %xmm1, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
; SSE2-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 ## 16-byte Reload
; SSE2-NEXT:    unpcklps {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
; SSE2-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 ## 16-byte Folded Reload
; SSE2-NEXT:    ## xmm3 = xmm3[0],mem[0]
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
; SSE2-NEXT:    movaps (%rsp), %xmm2 ## 16-byte Reload
; SSE2-NEXT:    addq $104, %rsp
; SSE2-NEXT:    retq
;
; SSE41-LABEL: roundeven_v16f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundps $8, %xmm0, %xmm0
; SSE41-NEXT:    roundps $8, %xmm1, %xmm1
; SSE41-NEXT:    roundps $8, %xmm2, %xmm2
; SSE41-NEXT:    roundps $8, %xmm3, %xmm3
; SSE41-NEXT:    retq
;
; AVX1-LABEL: roundeven_v16f32:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vroundps $8, %ymm0, %ymm0
; AVX1-NEXT:    vroundps $8, %ymm1, %ymm1
; AVX1-NEXT:    retq
;
; AVX512-LABEL: roundeven_v16f32:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vrndscaleps $8, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %a = call <16 x float> @llvm.roundeven.v16f32(<16 x float> %x)
  ret <16 x float> %a
}

define <8 x double> @roundeven_v8f64(<8 x double> %x) {
; SSE2-LABEL: roundeven_v8f64:
; SSE2:       ## %bb.0:
; SSE2-NEXT:    subq $88, %rsp
; SSE2-NEXT:    .cfi_def_cfa_offset 96
; SSE2-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps %xmm1, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    callq _roundeven
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq _roundeven
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    callq _roundeven
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq _roundeven
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    callq _roundeven
; SSE2-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq _roundeven
; SSE2-NEXT:    movaps (%rsp), %xmm1 ## 16-byte Reload
; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    movaps %xmm1, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    callq _roundeven
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq _roundeven
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 ## 16-byte Reload
; SSE2-NEXT:    movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0]
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
; SSE2-NEXT:    movaps (%rsp), %xmm2 ## 16-byte Reload
; SSE2-NEXT:    addq $88, %rsp
; SSE2-NEXT:    retq
;
; SSE41-LABEL: roundeven_v8f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundpd $8, %xmm0, %xmm0
; SSE41-NEXT:    roundpd $8, %xmm1, %xmm1
; SSE41-NEXT:    roundpd $8, %xmm2, %xmm2
; SSE41-NEXT:    roundpd $8, %xmm3, %xmm3
; SSE41-NEXT:    retq
;
; AVX1-LABEL: roundeven_v8f64:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vroundpd $8, %ymm0, %ymm0
; AVX1-NEXT:    vroundpd $8, %ymm1, %ymm1
; AVX1-NEXT:    retq
;
; AVX512-LABEL: roundeven_v8f64:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vrndscalepd $8, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %a = call <8 x double> @llvm.roundeven.v8f64(<8 x double> %x)
  ret <8 x double> %a
}

declare half @llvm.roundeven.f16(half)
declare float @llvm.roundeven.f32(float)
declare double @llvm.roundeven.f64(double)
declare <4 x float> @llvm.roundeven.v4f32(<4 x float>)
declare <2 x double> @llvm.roundeven.v2f64(<2 x double>)
declare <8 x float> @llvm.roundeven.v8f32(<8 x float>)
declare <4 x double> @llvm.roundeven.v4f64(<4 x double>)
declare <16 x float> @llvm.roundeven.v16f32(<16 x float>)
declare <8 x double> @llvm.roundeven.v8f64(<8 x double>)
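
; The $8 immediate on the ROUND*/VROUND*/VRNDSCALE* instructions above encodes
; imm8[1:0] = 00b (round to nearest, ties to even), imm8[2] = 0 (use the
; immediate rounding mode rather than MXCSR.RC), and imm8[3] = 1 (suppress
; precision/inexact exceptions), matching the semantics of @llvm.roundeven.*.
;
; Without SSE4.1 the scalar and per-element lowerings fall back to the libm
; roundeven functions, as the _roundeven/_roundevenf calls above show. A
; minimal C equivalent of @roundeven_f32, assuming a C23-conforming libm that
; provides roundevenf, is:
;
;   #include <math.h>
;   float roundeven_f32(float x) { return roundevenf(x); }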