; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE41
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512
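; llvm.round rounds halfway cases away from zero, a mode the SSE4.1/AVX
; round* instructions do not provide, so the lowerings checked below
; rebuild it as trunc(x + copysign(C, x)): the sign of x is extracted by
; ANDing with -0.0, ORed into the magic constant C, the result is added
; to x, and the sum is truncated with round* immediate 11 (0b1011: round
; toward zero, suppress the precision exception). C is the largest value
; strictly below 0.5 (4.9999997E-1 for float, 4.9999999999999994E-1 for
; double); using 0.5 itself would be wrong for inputs just under 0.5,
; e.g. 0.49999997 + 0.5 rounds up to 1.0 before the truncation, while
; 0.49999997 + C = 0.99999994 truncates to the correct 0.0. Without
; SSE4.1 there is no round* instruction at all, so the SSE2 run falls
; back to the libm roundf/round calls.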

define float @round_f32(float %x) {
; SSE2-LABEL: round_f32:
; SSE2:       ## %bb.0:
; SSE2-NEXT:    jmp _roundf ## TAILCALL
;
; SSE41-LABEL: round_f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    movaps {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; SSE41-NEXT:    andps %xmm0, %xmm1
; SSE41-NEXT:    orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    addss %xmm0, %xmm1
; SSE41-NEXT:    xorps %xmm0, %xmm0
; SSE41-NEXT:    roundss $11, %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: round_f32:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
; AVX1-NEXT:    vorps %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vroundss $11, %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: round_f32:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; AVX512-NEXT:    vandps %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm2 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
; AVX512-NEXT:    vorps %xmm1, %xmm2, %xmm1
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vroundss $11, %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %a = call float @llvm.round.f32(float %x)
  ret float %a
}

define double @round_f64(double %x) {
; SSE2-LABEL: round_f64:
; SSE2:       ## %bb.0:
; SSE2-NEXT:    jmp _round ## TAILCALL
;
; SSE41-LABEL: round_f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    movapd {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0]
; SSE41-NEXT:    andpd %xmm0, %xmm1
; SSE41-NEXT:    orpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    addsd %xmm0, %xmm1
; SSE41-NEXT:    xorps %xmm0, %xmm0
; SSE41-NEXT:    roundsd $11, %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: round_f64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX-NEXT:    vmovddup {{.*#+}} xmm2 = [4.9999999999999994E-1,4.9999999999999994E-1]
; AVX-NEXT:    ## xmm2 = mem[0,0]
; AVX-NEXT:    vorpd %xmm1, %xmm2, %xmm1
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vroundsd $11, %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = call double @llvm.round.f64(double %x)
  ret double %a
}
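; SSE2 has no round* instruction and no vector libcall, so the vector
; cases below are expected to scalarize: each element is shuffled into
; xmm0, passed to roundf/round, and the results are reassembled with
; unpcklps/unpcklpd/movlhps around stack spills and reloads.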

define <4 x float> @round_v4f32(<4 x float> %x) {
; SSE2-LABEL: round_v4f32:
; SSE2:       ## %bb.0:
; SSE2-NEXT:    subq $56, %rsp
; SSE2-NEXT:    .cfi_def_cfa_offset 64
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    callq _roundf
; SSE2-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq _roundf
; SSE2-NEXT:    unpcklps (%rsp), %xmm0 ## 16-byte Folded Reload
; SSE2-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE2-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    callq _roundf
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT:    callq _roundf
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    unpcklpd (%rsp), %xmm1 ## 16-byte Folded Reload
; SSE2-NEXT:    ## xmm1 = xmm1[0],mem[0]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    addq $56, %rsp
; SSE2-NEXT:    retq
;
; SSE41-LABEL: round_v4f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    movaps {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; SSE41-NEXT:    andps %xmm0, %xmm1
; SSE41-NEXT:    orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    addps %xmm0, %xmm1
; SSE41-NEXT:    roundps $11, %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: round_v4f32:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX1-NEXT:    vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vroundps $11, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: round_v4f32:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; AVX512-NEXT:    vandps %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm2 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
; AVX512-NEXT:    vorps %xmm1, %xmm2, %xmm1
; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vroundps $11, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %a = call <4 x float> @llvm.round.v4f32(<4 x float> %x)
  ret <4 x float> %a
}

define <2 x double> @round_v2f64(<2 x double> %x) {
; SSE2-LABEL: round_v2f64:
; SSE2:       ## %bb.0:
; SSE2-NEXT:    subq $40, %rsp
; SSE2-NEXT:    .cfi_def_cfa_offset 48
; SSE2-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    callq _round
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq _round
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    addq $40, %rsp
; SSE2-NEXT:    retq
;
; SSE41-LABEL: round_v2f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    movapd {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0]
; SSE41-NEXT:    andpd %xmm0, %xmm1
; SSE41-NEXT:    orpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    addpd %xmm0, %xmm1
; SSE41-NEXT:    roundpd $11, %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: round_v2f64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX-NEXT:    vorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vroundpd $11, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = call <2 x double> @llvm.round.v2f64(<2 x double> %x)
  ret <2 x double> %a
}
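; The 256-bit cases repeat the same expansion per register: SSE4.1
; processes two xmm halves, while AVX1 loads the sign mask and magic
; constant as whole ymm values from the constant pool. Note the AVX512
; run still uses vroundps/vroundpd on ymm here, presumably because plain
; AVX512F (without AVX512VL) only provides vrndscale* for zmm.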

define <8 x float> @round_v8f32(<8 x float> %x) {
; SSE2-LABEL: round_v8f32:
; SSE2:       ## %bb.0:
; SSE2-NEXT:    subq $72, %rsp
; SSE2-NEXT:    .cfi_def_cfa_offset 80
; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    callq _roundf
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq _roundf
; SSE2-NEXT:    unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
; SSE2-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    callq _roundf
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT:    callq _roundf
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload
; SSE2-NEXT:    ## xmm1 = xmm1[0],mem[0]
; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    callq _roundf
; SSE2-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq _roundf
; SSE2-NEXT:    unpcklps (%rsp), %xmm0 ## 16-byte Folded Reload
; SSE2-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE2-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    callq _roundf
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT:    callq _roundf
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    unpcklpd (%rsp), %xmm1 ## 16-byte Folded Reload
; SSE2-NEXT:    ## xmm1 = xmm1[0],mem[0]
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    addq $72, %rsp
; SSE2-NEXT:    retq
;
; SSE41-LABEL: round_v8f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    movaps {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; SSE41-NEXT:    movaps %xmm0, %xmm3
; SSE41-NEXT:    andps %xmm2, %xmm3
; SSE41-NEXT:    movaps {{.*#+}} xmm4 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
; SSE41-NEXT:    orps %xmm4, %xmm3
; SSE41-NEXT:    addps %xmm0, %xmm3
; SSE41-NEXT:    roundps $11, %xmm3, %xmm0
; SSE41-NEXT:    andps %xmm1, %xmm2
; SSE41-NEXT:    orps %xmm4, %xmm2
; SSE41-NEXT:    addps %xmm1, %xmm2
; SSE41-NEXT:    roundps $11, %xmm2, %xmm1
; SSE41-NEXT:    retq
;
; AVX1-LABEL: round_v8f32:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
; AVX1-NEXT:    vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX1-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vroundps $11, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: round_v8f32:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vbroadcastss {{.*#+}} ymm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; AVX512-NEXT:    vandps %ymm1, %ymm0, %ymm1
; AVX512-NEXT:    vbroadcastss {{.*#+}} ymm2 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
; AVX512-NEXT:    vorps %ymm1, %ymm2, %ymm1
; AVX512-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vroundps $11, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %a = call <8 x float> @llvm.round.v8f32(<8 x float> %x)
  ret <8 x float> %a
}

define <4 x double> @round_v4f64(<4 x double> %x) {
; SSE2-LABEL: round_v4f64:
; SSE2:       ## %bb.0:
; SSE2-NEXT:    subq $56, %rsp
; SSE2-NEXT:    .cfi_def_cfa_offset 64
; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    callq _round
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq _round
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    callq _round
; SSE2-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq _round
; SSE2-NEXT:    movaps (%rsp), %xmm1 ## 16-byte Reload
; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    addq $56, %rsp
; SSE2-NEXT:    retq
;
; SSE41-LABEL: round_v4f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    movapd {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0]
; SSE41-NEXT:    movapd %xmm0, %xmm3
; SSE41-NEXT:    andpd %xmm2, %xmm3
; SSE41-NEXT:    movapd {{.*#+}} xmm4 = [4.9999999999999994E-1,4.9999999999999994E-1]
; SSE41-NEXT:    orpd %xmm4, %xmm3
; SSE41-NEXT:    addpd %xmm0, %xmm3
; SSE41-NEXT:    roundpd $11, %xmm3, %xmm0
; SSE41-NEXT:    andpd %xmm1, %xmm2
; SSE41-NEXT:    orpd %xmm4, %xmm2
; SSE41-NEXT:    addpd %xmm1, %xmm2
; SSE41-NEXT:    roundpd $11, %xmm2, %xmm1
; SSE41-NEXT:    retq
;
; AVX1-LABEL: round_v4f64:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
; AVX1-NEXT:    vorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX1-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vroundpd $11, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: round_v4f64:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; AVX512-NEXT:    vandpd %ymm1, %ymm0, %ymm1
; AVX512-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1]
; AVX512-NEXT:    vorpd %ymm1, %ymm2, %ymm1
; AVX512-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vroundpd $11, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %a = call <4 x double> @llvm.round.v4f64(<4 x double> %x)
  ret <4 x double> %a
}
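; For 512-bit vectors AVX512F folds the sign-extraction AND and the OR
; with the magic constant into a single vpternlogd/vpternlogq: truth
; table 248 (0xF8) computes A | (B & C), here magic | (x & -0.0), with
; the sign mask supplied as an embedded broadcast. vrndscale* with the
; same immediate 11 then truncates the whole zmm register.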

define <16 x float> @round_v16f32(<16 x float> %x) {
; SSE2-LABEL: round_v16f32:
; SSE2:       ## %bb.0:
; SSE2-NEXT:    subq $104, %rsp
; SSE2-NEXT:    .cfi_def_cfa_offset 112
; SSE2-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps %xmm1, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    callq _roundf
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq _roundf
; SSE2-NEXT:    unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
; SSE2-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    callq _roundf
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT:    callq _roundf
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload
; SSE2-NEXT:    ## xmm1 = xmm1[0],mem[0]
; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    callq _roundf
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq _roundf
; SSE2-NEXT:    unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
; SSE2-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    callq _roundf
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT:    callq _roundf
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload
; SSE2-NEXT:    ## xmm1 = xmm1[0],mem[0]
; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    callq _roundf
; SSE2-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq _roundf
; SSE2-NEXT:    unpcklps (%rsp), %xmm0 ## 16-byte Folded Reload
; SSE2-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    callq _roundf
; SSE2-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT:    callq _roundf
; SSE2-NEXT:    movaps (%rsp), %xmm1 ## 16-byte Reload
; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload
; SSE2-NEXT:    ## xmm1 = xmm1[0],mem[0]
; SSE2-NEXT:    movaps %xmm1, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    callq _roundf
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq _roundf
; SSE2-NEXT:    unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
; SSE2-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    callq _roundf
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT:    callq _roundf
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 ## 16-byte Reload
; SSE2-NEXT:    unpcklps {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
; SSE2-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 ## 16-byte Folded Reload
; SSE2-NEXT:    ## xmm3 = xmm3[0],mem[0]
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
; SSE2-NEXT:    movaps (%rsp), %xmm2 ## 16-byte Reload
; SSE2-NEXT:    addq $104, %rsp
; SSE2-NEXT:    retq
;
; SSE41-LABEL: round_v16f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    movaps {{.*#+}} xmm4 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; SSE41-NEXT:    movaps %xmm0, %xmm5
; SSE41-NEXT:    andps %xmm4, %xmm5
; SSE41-NEXT:    movaps {{.*#+}} xmm6 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
; SSE41-NEXT:    orps %xmm6, %xmm5
; SSE41-NEXT:    addps %xmm0, %xmm5
; SSE41-NEXT:    roundps $11, %xmm5, %xmm0
; SSE41-NEXT:    movaps %xmm1, %xmm5
; SSE41-NEXT:    andps %xmm4, %xmm5
; SSE41-NEXT:    orps %xmm6, %xmm5
; SSE41-NEXT:    addps %xmm1, %xmm5
; SSE41-NEXT:    roundps $11, %xmm5, %xmm1
; SSE41-NEXT:    movaps %xmm2, %xmm5
; SSE41-NEXT:    andps %xmm4, %xmm5
; SSE41-NEXT:    orps %xmm6, %xmm5
; SSE41-NEXT:    addps %xmm2, %xmm5
; SSE41-NEXT:    roundps $11, %xmm5, %xmm2
; SSE41-NEXT:    andps %xmm3, %xmm4
; SSE41-NEXT:    orps %xmm6, %xmm4
; SSE41-NEXT:    addps %xmm3, %xmm4
; SSE41-NEXT:    roundps $11, %xmm4, %xmm3
; SSE41-NEXT:    retq
;
; AVX1-LABEL: round_v16f32:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm3
; AVX1-NEXT:    vmovaps {{.*#+}} ymm4 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
; AVX1-NEXT:    vorps %ymm3, %ymm4, %ymm3
; AVX1-NEXT:    vaddps %ymm3, %ymm0, %ymm0
; AVX1-NEXT:    vroundps $11, %ymm0, %ymm0
; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm2
; AVX1-NEXT:    vorps %ymm2, %ymm4, %ymm2
; AVX1-NEXT:    vaddps %ymm2, %ymm1, %ymm1
; AVX1-NEXT:    vroundps $11, %ymm1, %ymm1
; AVX1-NEXT:    retq
;
; AVX512-LABEL: round_v16f32:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
; AVX512-NEXT:    vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm1
; AVX512-NEXT:    vaddps %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vrndscaleps $11, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %a = call <16 x float> @llvm.round.v16f32(<16 x float> %x)
  ret <16 x float> %a
}

define <8 x double> @round_v8f64(<8 x double> %x) {
; SSE2-LABEL: round_v8f64:
; SSE2:       ## %bb.0:
; SSE2-NEXT:    subq $88, %rsp
; SSE2-NEXT:    .cfi_def_cfa_offset 96
; SSE2-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps %xmm1, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    callq _round
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq _round
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    callq _round
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq _round
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    callq _round
; SSE2-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq _round
; SSE2-NEXT:    movaps (%rsp), %xmm1 ## 16-byte Reload
; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    movaps %xmm1, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    callq _round
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq _round
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 ## 16-byte Reload
; SSE2-NEXT:    movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0]
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
; SSE2-NEXT:    movaps (%rsp), %xmm2 ## 16-byte Reload
; SSE2-NEXT:    addq $88, %rsp
; SSE2-NEXT:    retq
;
; SSE41-LABEL: round_v8f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    movapd {{.*#+}} xmm4 = [-0.0E+0,-0.0E+0]
; SSE41-NEXT:    movapd %xmm0, %xmm5
; SSE41-NEXT:    andpd %xmm4, %xmm5
; SSE41-NEXT:    movapd {{.*#+}} xmm6 = [4.9999999999999994E-1,4.9999999999999994E-1]
; SSE41-NEXT:    orpd %xmm6, %xmm5
; SSE41-NEXT:    addpd %xmm0, %xmm5
; SSE41-NEXT:    roundpd $11, %xmm5, %xmm0
; SSE41-NEXT:    movapd %xmm1, %xmm5
; SSE41-NEXT:    andpd %xmm4, %xmm5
; SSE41-NEXT:    orpd %xmm6, %xmm5
; SSE41-NEXT:    addpd %xmm1, %xmm5
; SSE41-NEXT:    roundpd $11, %xmm5, %xmm1
; SSE41-NEXT:    movapd %xmm2, %xmm5
; SSE41-NEXT:    andpd %xmm4, %xmm5
; SSE41-NEXT:    orpd %xmm6, %xmm5
; SSE41-NEXT:    addpd %xmm2, %xmm5
; SSE41-NEXT:    roundpd $11, %xmm5, %xmm2
; SSE41-NEXT:    andpd %xmm3, %xmm4
; SSE41-NEXT:    orpd %xmm6, %xmm4
; SSE41-NEXT:    addpd %xmm3, %xmm4
; SSE41-NEXT:    roundpd $11, %xmm4, %xmm3
; SSE41-NEXT:    retq
;
; AVX1-LABEL: round_v8f64:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vmovapd {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; AVX1-NEXT:    vandpd %ymm2, %ymm0, %ymm3
; AVX1-NEXT:    vmovapd {{.*#+}} ymm4 = [4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1]
; AVX1-NEXT:    vorpd %ymm3, %ymm4, %ymm3
; AVX1-NEXT:    vaddpd %ymm3, %ymm0, %ymm0
; AVX1-NEXT:    vroundpd $11, %ymm0, %ymm0
; AVX1-NEXT:    vandpd %ymm2, %ymm1, %ymm2
; AVX1-NEXT:    vorpd %ymm2, %ymm4, %ymm2
; AVX1-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
; AVX1-NEXT:    vroundpd $11, %ymm1, %ymm1
; AVX1-NEXT:    retq
;
; AVX512-LABEL: round_v8f64:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1]
; AVX512-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm1
; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vrndscalepd $11, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %a = call <8 x double> @llvm.round.v8f64(<8 x double> %x)
  ret <8 x double> %a
}

declare float @llvm.round.f32(float)
declare double @llvm.round.f64(double)
declare <4 x float> @llvm.round.v4f32(<4 x float>)
declare <2 x double> @llvm.round.v2f64(<2 x double>)
declare <8 x float> @llvm.round.v8f32(<8 x float>)
declare <4 x double> @llvm.round.v4f64(<4 x double>)
declare <16 x float> @llvm.round.v16f32(<16 x float>)
declare <8 x double> @llvm.round.v8f64(<8 x double>)