1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE 3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX-RECIP 4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma | FileCheck %s --check-prefixes=AVX,FMA-RECIP 5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 | FileCheck %s --check-prefixes=AVX,BDVER2 6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 | FileCheck %s --check-prefixes=AVX,BTVER2 7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,SANDY 8; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell | FileCheck %s --check-prefixes=AVX,HASWELL 9; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell -mattr=-fma | FileCheck %s --check-prefixes=AVX,HASWELL-NO-FMA 10; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl | FileCheck %s --check-prefixes=AVX,AVX512,KNL 11; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx | FileCheck %s --check-prefixes=AVX,AVX512,SKX 12 13; If the target's divss/divps instructions are substantially 14; slower than rcpss/rcpps with a Newton-Raphson refinement, 15; we should generate the estimate sequence. 16 17; See PR21385 ( http://llvm.org/bugs/show_bug.cgi?id=21385 ) 18; for details about the accuracy, speed, and implementation 19; differences of x86 reciprocal estimates. 
20 21define float @f32_no_estimate(float %x) #0 { 22; SSE-LABEL: f32_no_estimate: 23; SSE: # %bb.0: 24; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero 25; SSE-NEXT: divss %xmm0, %xmm1 26; SSE-NEXT: movaps %xmm1, %xmm0 27; SSE-NEXT: retq 28; 29; AVX-LABEL: f32_no_estimate: 30; AVX: # %bb.0: 31; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero 32; AVX-NEXT: vdivss %xmm0, %xmm1, %xmm0 33; AVX-NEXT: retq 34 %div = fdiv fast float 1.0, %x 35 ret float %div 36} 37 38define float @f32_one_step(float %x) #1 { 39; SSE-LABEL: f32_one_step: 40; SSE: # %bb.0: 41; SSE-NEXT: rcpss %xmm0, %xmm2 42; SSE-NEXT: mulss %xmm2, %xmm0 43; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero 44; SSE-NEXT: subss %xmm0, %xmm1 45; SSE-NEXT: mulss %xmm2, %xmm1 46; SSE-NEXT: addss %xmm2, %xmm1 47; SSE-NEXT: movaps %xmm1, %xmm0 48; SSE-NEXT: retq 49; 50; AVX-RECIP-LABEL: f32_one_step: 51; AVX-RECIP: # %bb.0: 52; AVX-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1 53; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm0 54; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero 55; AVX-RECIP-NEXT: vsubss %xmm0, %xmm2, %xmm0 56; AVX-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0 57; AVX-RECIP-NEXT: vaddss %xmm0, %xmm1, %xmm0 58; AVX-RECIP-NEXT: retq 59; 60; FMA-RECIP-LABEL: f32_one_step: 61; FMA-RECIP: # %bb.0: 62; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1 63; FMA-RECIP-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - mem 64; FMA-RECIP-NEXT: vfnmadd132ss {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1 65; FMA-RECIP-NEXT: retq 66; 67; BDVER2-LABEL: f32_one_step: 68; BDVER2: # %bb.0: 69; BDVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 70; BDVER2-NEXT: vfmsubss {{.*#+}} xmm0 = (xmm0 * xmm1) - mem 71; BDVER2-NEXT: vfnmaddss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm1 72; BDVER2-NEXT: retq 73; 74; BTVER2-LABEL: f32_one_step: 75; BTVER2: # %bb.0: 76; BTVER2-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero 77; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 78; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0 79; BTVER2-NEXT: vsubss %xmm0, %xmm2, 
%xmm0 80; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 81; BTVER2-NEXT: vaddss %xmm0, %xmm1, %xmm0 82; BTVER2-NEXT: retq 83; 84; SANDY-LABEL: f32_one_step: 85; SANDY: # %bb.0: 86; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 87; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 88; SANDY-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero 89; SANDY-NEXT: vsubss %xmm0, %xmm2, %xmm0 90; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 91; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 92; SANDY-NEXT: retq 93; 94; HASWELL-LABEL: f32_one_step: 95; HASWELL: # %bb.0: 96; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 97; HASWELL-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - mem 98; HASWELL-NEXT: vfnmadd132ss {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1 99; HASWELL-NEXT: retq 100; 101; HASWELL-NO-FMA-LABEL: f32_one_step: 102; HASWELL-NO-FMA: # %bb.0: 103; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1 104; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0 105; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero 106; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm2, %xmm0 107; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 108; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0 109; HASWELL-NO-FMA-NEXT: retq 110; 111; AVX512-LABEL: f32_one_step: 112; AVX512: # %bb.0: 113; AVX512-NEXT: vrcpss %xmm0, %xmm0, %xmm1 114; AVX512-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - mem 115; AVX512-NEXT: vfnmadd132ss {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1 116; AVX512-NEXT: retq 117 %div = fdiv fast float 1.0, %x 118 ret float %div 119} 120 121define float @f32_one_step_variables(float %x, float %y) #1 { 122; SSE-LABEL: f32_one_step_variables: 123; SSE: # %bb.0: 124; SSE-NEXT: rcpss %xmm1, %xmm2 125; SSE-NEXT: movaps %xmm0, %xmm3 126; SSE-NEXT: mulss %xmm2, %xmm3 127; SSE-NEXT: mulss %xmm3, %xmm1 128; SSE-NEXT: subss %xmm1, %xmm0 129; SSE-NEXT: mulss %xmm2, %xmm0 130; SSE-NEXT: addss %xmm3, %xmm0 131; SSE-NEXT: retq 132; 133; AVX-RECIP-LABEL: f32_one_step_variables: 134; AVX-RECIP: # %bb.0: 135; AVX-RECIP-NEXT: vrcpss %xmm1, %xmm1, 
%xmm2 136; AVX-RECIP-NEXT: vmulss %xmm2, %xmm0, %xmm3 137; AVX-RECIP-NEXT: vmulss %xmm3, %xmm1, %xmm1 138; AVX-RECIP-NEXT: vsubss %xmm1, %xmm0, %xmm0 139; AVX-RECIP-NEXT: vmulss %xmm0, %xmm2, %xmm0 140; AVX-RECIP-NEXT: vaddss %xmm0, %xmm3, %xmm0 141; AVX-RECIP-NEXT: retq 142; 143; FMA-RECIP-LABEL: f32_one_step_variables: 144; FMA-RECIP: # %bb.0: 145; FMA-RECIP-NEXT: vrcpss %xmm1, %xmm1, %xmm2 146; FMA-RECIP-NEXT: vmulss %xmm2, %xmm0, %xmm3 147; FMA-RECIP-NEXT: vfmsub231ss {{.*#+}} xmm0 = (xmm3 * xmm1) - xmm0 148; FMA-RECIP-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3 149; FMA-RECIP-NEXT: retq 150; 151; BDVER2-LABEL: f32_one_step_variables: 152; BDVER2: # %bb.0: 153; BDVER2-NEXT: vrcpss %xmm1, %xmm1, %xmm2 154; BDVER2-NEXT: vmulss %xmm2, %xmm0, %xmm3 155; BDVER2-NEXT: vfmsubss {{.*#+}} xmm0 = (xmm1 * xmm3) - xmm0 156; BDVER2-NEXT: vfnmaddss {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3 157; BDVER2-NEXT: retq 158; 159; BTVER2-LABEL: f32_one_step_variables: 160; BTVER2: # %bb.0: 161; BTVER2-NEXT: vrcpss %xmm1, %xmm1, %xmm2 162; BTVER2-NEXT: vmulss %xmm2, %xmm0, %xmm3 163; BTVER2-NEXT: vmulss %xmm3, %xmm1, %xmm1 164; BTVER2-NEXT: vsubss %xmm1, %xmm0, %xmm0 165; BTVER2-NEXT: vmulss %xmm0, %xmm2, %xmm0 166; BTVER2-NEXT: vaddss %xmm0, %xmm3, %xmm0 167; BTVER2-NEXT: retq 168; 169; SANDY-LABEL: f32_one_step_variables: 170; SANDY: # %bb.0: 171; SANDY-NEXT: vrcpss %xmm1, %xmm1, %xmm2 172; SANDY-NEXT: vmulss %xmm2, %xmm0, %xmm3 173; SANDY-NEXT: vmulss %xmm3, %xmm1, %xmm1 174; SANDY-NEXT: vsubss %xmm1, %xmm0, %xmm0 175; SANDY-NEXT: vmulss %xmm0, %xmm2, %xmm0 176; SANDY-NEXT: vaddss %xmm0, %xmm3, %xmm0 177; SANDY-NEXT: retq 178; 179; HASWELL-LABEL: f32_one_step_variables: 180; HASWELL: # %bb.0: 181; HASWELL-NEXT: vrcpss %xmm1, %xmm1, %xmm2 182; HASWELL-NEXT: vmulss %xmm2, %xmm0, %xmm3 183; HASWELL-NEXT: vfmsub231ss {{.*#+}} xmm0 = (xmm3 * xmm1) - xmm0 184; HASWELL-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3 185; HASWELL-NEXT: retq 186; 187; HASWELL-NO-FMA-LABEL: 
f32_one_step_variables: 188; HASWELL-NO-FMA: # %bb.0: 189; HASWELL-NO-FMA-NEXT: vrcpss %xmm1, %xmm1, %xmm2 190; HASWELL-NO-FMA-NEXT: vmulss %xmm2, %xmm0, %xmm3 191; HASWELL-NO-FMA-NEXT: vmulss %xmm3, %xmm1, %xmm1 192; HASWELL-NO-FMA-NEXT: vsubss %xmm1, %xmm0, %xmm0 193; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm2, %xmm0 194; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm3, %xmm0 195; HASWELL-NO-FMA-NEXT: retq 196; 197; AVX512-LABEL: f32_one_step_variables: 198; AVX512: # %bb.0: 199; AVX512-NEXT: vrcpss %xmm1, %xmm1, %xmm2 200; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm3 201; AVX512-NEXT: vfmsub231ss {{.*#+}} xmm0 = (xmm3 * xmm1) - xmm0 202; AVX512-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3 203; AVX512-NEXT: retq 204 %div = fdiv fast float %x, %y 205 ret float %div 206} 207 208define float @f32_two_step(float %x) #2 { 209; SSE-LABEL: f32_two_step: 210; SSE: # %bb.0: 211; SSE-NEXT: rcpss %xmm0, %xmm2 212; SSE-NEXT: movaps %xmm0, %xmm3 213; SSE-NEXT: mulss %xmm2, %xmm3 214; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero 215; SSE-NEXT: movaps %xmm1, %xmm4 216; SSE-NEXT: subss %xmm3, %xmm4 217; SSE-NEXT: mulss %xmm2, %xmm4 218; SSE-NEXT: addss %xmm2, %xmm4 219; SSE-NEXT: mulss %xmm4, %xmm0 220; SSE-NEXT: subss %xmm0, %xmm1 221; SSE-NEXT: mulss %xmm4, %xmm1 222; SSE-NEXT: addss %xmm4, %xmm1 223; SSE-NEXT: movaps %xmm1, %xmm0 224; SSE-NEXT: retq 225; 226; AVX-RECIP-LABEL: f32_two_step: 227; AVX-RECIP: # %bb.0: 228; AVX-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1 229; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm2 230; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero 231; AVX-RECIP-NEXT: vsubss %xmm2, %xmm3, %xmm2 232; AVX-RECIP-NEXT: vmulss %xmm2, %xmm1, %xmm2 233; AVX-RECIP-NEXT: vaddss %xmm2, %xmm1, %xmm1 234; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm0 235; AVX-RECIP-NEXT: vsubss %xmm0, %xmm3, %xmm0 236; AVX-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0 237; AVX-RECIP-NEXT: vaddss %xmm0, %xmm1, %xmm0 238; AVX-RECIP-NEXT: retq 239; 240; FMA-RECIP-LABEL: f32_two_step: 
241; FMA-RECIP: # %bb.0: 242; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1 243; FMA-RECIP-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero 244; FMA-RECIP-NEXT: vmovaps %xmm1, %xmm3 245; FMA-RECIP-NEXT: vfmsub213ss {{.*#+}} xmm3 = (xmm0 * xmm3) - xmm2 246; FMA-RECIP-NEXT: vfnmadd132ss {{.*#+}} xmm3 = -(xmm3 * xmm1) + xmm1 247; FMA-RECIP-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm3 * xmm0) - xmm2 248; FMA-RECIP-NEXT: vfnmadd132ss {{.*#+}} xmm0 = -(xmm0 * xmm3) + xmm3 249; FMA-RECIP-NEXT: retq 250; 251; BDVER2-LABEL: f32_two_step: 252; BDVER2: # %bb.0: 253; BDVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 254; BDVER2-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero 255; BDVER2-NEXT: vfmsubss {{.*#+}} xmm3 = (xmm0 * xmm1) - xmm2 256; BDVER2-NEXT: vfnmaddss {{.*#+}} xmm1 = -(xmm1 * xmm3) + xmm1 257; BDVER2-NEXT: vfmsubss {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm2 258; BDVER2-NEXT: vfnmaddss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm1 259; BDVER2-NEXT: retq 260; 261; BTVER2-LABEL: f32_two_step: 262; BTVER2: # %bb.0: 263; BTVER2-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero 264; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 265; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm2 266; BTVER2-NEXT: vsubss %xmm2, %xmm3, %xmm2 267; BTVER2-NEXT: vmulss %xmm2, %xmm1, %xmm2 268; BTVER2-NEXT: vaddss %xmm2, %xmm1, %xmm1 269; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0 270; BTVER2-NEXT: vsubss %xmm0, %xmm3, %xmm0 271; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 272; BTVER2-NEXT: vaddss %xmm0, %xmm1, %xmm0 273; BTVER2-NEXT: retq 274; 275; SANDY-LABEL: f32_two_step: 276; SANDY: # %bb.0: 277; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 278; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm2 279; SANDY-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero 280; SANDY-NEXT: vsubss %xmm2, %xmm3, %xmm2 281; SANDY-NEXT: vmulss %xmm2, %xmm1, %xmm2 282; SANDY-NEXT: vaddss %xmm2, %xmm1, %xmm1 283; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 284; SANDY-NEXT: vsubss %xmm0, %xmm3, %xmm0 285; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 286; SANDY-NEXT: vaddss %xmm0, 
%xmm1, %xmm0 287; SANDY-NEXT: retq 288; 289; HASWELL-LABEL: f32_two_step: 290; HASWELL: # %bb.0: 291; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 292; HASWELL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero 293; HASWELL-NEXT: vmovaps %xmm1, %xmm3 294; HASWELL-NEXT: vfmsub213ss {{.*#+}} xmm3 = (xmm0 * xmm3) - xmm2 295; HASWELL-NEXT: vfnmadd132ss {{.*#+}} xmm3 = -(xmm3 * xmm1) + xmm1 296; HASWELL-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm3 * xmm0) - xmm2 297; HASWELL-NEXT: vfnmadd132ss {{.*#+}} xmm0 = -(xmm0 * xmm3) + xmm3 298; HASWELL-NEXT: retq 299; 300; HASWELL-NO-FMA-LABEL: f32_two_step: 301; HASWELL-NO-FMA: # %bb.0: 302; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1 303; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm2 304; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero 305; HASWELL-NO-FMA-NEXT: vsubss %xmm2, %xmm3, %xmm2 306; HASWELL-NO-FMA-NEXT: vmulss %xmm2, %xmm1, %xmm2 307; HASWELL-NO-FMA-NEXT: vaddss %xmm2, %xmm1, %xmm1 308; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0 309; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm3, %xmm0 310; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 311; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0 312; HASWELL-NO-FMA-NEXT: retq 313; 314; AVX512-LABEL: f32_two_step: 315; AVX512: # %bb.0: 316; AVX512-NEXT: vrcpss %xmm0, %xmm0, %xmm1 317; AVX512-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero 318; AVX512-NEXT: vmovaps %xmm1, %xmm3 319; AVX512-NEXT: vfmsub213ss {{.*#+}} xmm3 = (xmm0 * xmm3) - xmm2 320; AVX512-NEXT: vfnmadd132ss {{.*#+}} xmm3 = -(xmm3 * xmm1) + xmm1 321; AVX512-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm3 * xmm0) - xmm2 322; AVX512-NEXT: vfnmadd132ss {{.*#+}} xmm0 = -(xmm0 * xmm3) + xmm3 323; AVX512-NEXT: retq 324 %div = fdiv fast float 1.0, %x 325 ret float %div 326} 327 328define <4 x float> @v4f32_no_estimate(<4 x float> %x) #0 { 329; SSE-LABEL: v4f32_no_estimate: 330; SSE: # %bb.0: 331; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 332; SSE-NEXT: divps %xmm0, %xmm1 333; SSE-NEXT: 
movaps %xmm1, %xmm0 334; SSE-NEXT: retq 335; 336; AVX-RECIP-LABEL: v4f32_no_estimate: 337; AVX-RECIP: # %bb.0: 338; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 339; AVX-RECIP-NEXT: vdivps %xmm0, %xmm1, %xmm0 340; AVX-RECIP-NEXT: retq 341; 342; FMA-RECIP-LABEL: v4f32_no_estimate: 343; FMA-RECIP: # %bb.0: 344; FMA-RECIP-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 345; FMA-RECIP-NEXT: vdivps %xmm0, %xmm1, %xmm0 346; FMA-RECIP-NEXT: retq 347; 348; BDVER2-LABEL: v4f32_no_estimate: 349; BDVER2: # %bb.0: 350; BDVER2-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 351; BDVER2-NEXT: vdivps %xmm0, %xmm1, %xmm0 352; BDVER2-NEXT: retq 353; 354; BTVER2-LABEL: v4f32_no_estimate: 355; BTVER2: # %bb.0: 356; BTVER2-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 357; BTVER2-NEXT: vdivps %xmm0, %xmm1, %xmm0 358; BTVER2-NEXT: retq 359; 360; SANDY-LABEL: v4f32_no_estimate: 361; SANDY: # %bb.0: 362; SANDY-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 363; SANDY-NEXT: vdivps %xmm0, %xmm1, %xmm0 364; SANDY-NEXT: retq 365; 366; HASWELL-LABEL: v4f32_no_estimate: 367; HASWELL: # %bb.0: 368; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 369; HASWELL-NEXT: vdivps %xmm0, %xmm1, %xmm0 370; HASWELL-NEXT: retq 371; 372; HASWELL-NO-FMA-LABEL: v4f32_no_estimate: 373; HASWELL-NO-FMA: # %bb.0: 374; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 375; HASWELL-NO-FMA-NEXT: vdivps %xmm0, %xmm1, %xmm0 376; HASWELL-NO-FMA-NEXT: retq 377; 378; AVX512-LABEL: v4f32_no_estimate: 379; AVX512: # %bb.0: 380; AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 381; AVX512-NEXT: vdivps %xmm0, %xmm1, %xmm0 382; AVX512-NEXT: retq 383 %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x 384 ret <4 x float> %div 385} 386 387define <4 x float> @v4f32_one_step(<4 x float> %x) #1 { 388; SSE-LABEL: v4f32_one_step: 389; SSE: # %bb.0: 390; 
SSE-NEXT: rcpps %xmm0, %xmm2 391; SSE-NEXT: mulps %xmm2, %xmm0 392; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 393; SSE-NEXT: subps %xmm0, %xmm1 394; SSE-NEXT: mulps %xmm2, %xmm1 395; SSE-NEXT: addps %xmm2, %xmm1 396; SSE-NEXT: movaps %xmm1, %xmm0 397; SSE-NEXT: retq 398; 399; AVX-RECIP-LABEL: v4f32_one_step: 400; AVX-RECIP: # %bb.0: 401; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1 402; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm0 403; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 404; AVX-RECIP-NEXT: vsubps %xmm0, %xmm2, %xmm0 405; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0 406; AVX-RECIP-NEXT: vaddps %xmm0, %xmm1, %xmm0 407; AVX-RECIP-NEXT: retq 408; 409; FMA-RECIP-LABEL: v4f32_one_step: 410; FMA-RECIP: # %bb.0: 411; FMA-RECIP-NEXT: vrcpps %xmm0, %xmm1 412; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - mem 413; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1 414; FMA-RECIP-NEXT: retq 415; 416; BDVER2-LABEL: v4f32_one_step: 417; BDVER2: # %bb.0: 418; BDVER2-NEXT: vrcpps %xmm0, %xmm1 419; BDVER2-NEXT: vfmsubps {{.*#+}} xmm0 = (xmm0 * xmm1) - mem 420; BDVER2-NEXT: vfnmaddps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm1 421; BDVER2-NEXT: retq 422; 423; BTVER2-LABEL: v4f32_one_step: 424; BTVER2: # %bb.0: 425; BTVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 426; BTVER2-NEXT: vrcpps %xmm0, %xmm1 427; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 428; BTVER2-NEXT: vsubps %xmm0, %xmm2, %xmm0 429; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0 430; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 431; BTVER2-NEXT: retq 432; 433; SANDY-LABEL: v4f32_one_step: 434; SANDY: # %bb.0: 435; SANDY-NEXT: vrcpps %xmm0, %xmm1 436; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 437; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 438; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0 439; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 440; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 441; SANDY-NEXT: retq 442; 443; HASWELL-LABEL: v4f32_one_step: 
444; HASWELL: # %bb.0: 445; HASWELL-NEXT: vrcpps %xmm0, %xmm2 446; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 447; HASWELL-NEXT: vfmsub231ps {{.*#+}} xmm1 = (xmm2 * xmm0) - xmm1 448; HASWELL-NEXT: vfnmadd132ps {{.*#+}} xmm1 = -(xmm1 * xmm2) + xmm2 449; HASWELL-NEXT: vmovaps %xmm1, %xmm0 450; HASWELL-NEXT: retq 451; 452; HASWELL-NO-FMA-LABEL: v4f32_one_step: 453; HASWELL-NO-FMA: # %bb.0: 454; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1 455; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 456; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 457; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0 458; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 459; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0 460; HASWELL-NO-FMA-NEXT: retq 461; 462; KNL-LABEL: v4f32_one_step: 463; KNL: # %bb.0: 464; KNL-NEXT: vrcpps %xmm0, %xmm2 465; KNL-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 466; KNL-NEXT: vfmsub231ps {{.*#+}} xmm1 = (xmm2 * xmm0) - xmm1 467; KNL-NEXT: vfnmadd132ps {{.*#+}} xmm1 = -(xmm1 * xmm2) + xmm2 468; KNL-NEXT: vmovaps %xmm1, %xmm0 469; KNL-NEXT: retq 470; 471; SKX-LABEL: v4f32_one_step: 472; SKX: # %bb.0: 473; SKX-NEXT: vrcpps %xmm0, %xmm1 474; SKX-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - mem 475; SKX-NEXT: vfnmadd132ps {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1 476; SKX-NEXT: retq 477 %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x 478 ret <4 x float> %div 479} 480 481define <4 x float> @v4f32_one_step_variables(<4 x float> %x, <4 x float> %y) #1 { 482; SSE-LABEL: v4f32_one_step_variables: 483; SSE: # %bb.0: 484; SSE-NEXT: rcpps %xmm1, %xmm2 485; SSE-NEXT: movaps %xmm0, %xmm3 486; SSE-NEXT: mulps %xmm2, %xmm3 487; SSE-NEXT: mulps %xmm3, %xmm1 488; SSE-NEXT: subps %xmm1, %xmm0 489; SSE-NEXT: mulps %xmm2, %xmm0 490; SSE-NEXT: addps %xmm3, %xmm0 491; SSE-NEXT: retq 492; 493; AVX-RECIP-LABEL: v4f32_one_step_variables: 494; AVX-RECIP: # %bb.0: 495; 
AVX-RECIP-NEXT: vrcpps %xmm1, %xmm2 496; AVX-RECIP-NEXT: vmulps %xmm2, %xmm0, %xmm3 497; AVX-RECIP-NEXT: vmulps %xmm3, %xmm1, %xmm1 498; AVX-RECIP-NEXT: vsubps %xmm1, %xmm0, %xmm0 499; AVX-RECIP-NEXT: vmulps %xmm0, %xmm2, %xmm0 500; AVX-RECIP-NEXT: vaddps %xmm0, %xmm3, %xmm0 501; AVX-RECIP-NEXT: retq 502; 503; FMA-RECIP-LABEL: v4f32_one_step_variables: 504; FMA-RECIP: # %bb.0: 505; FMA-RECIP-NEXT: vrcpps %xmm1, %xmm2 506; FMA-RECIP-NEXT: vmulps %xmm2, %xmm0, %xmm3 507; FMA-RECIP-NEXT: vfmsub231ps {{.*#+}} xmm0 = (xmm3 * xmm1) - xmm0 508; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3 509; FMA-RECIP-NEXT: retq 510; 511; BDVER2-LABEL: v4f32_one_step_variables: 512; BDVER2: # %bb.0: 513; BDVER2-NEXT: vrcpps %xmm1, %xmm2 514; BDVER2-NEXT: vmulps %xmm2, %xmm0, %xmm3 515; BDVER2-NEXT: vfmsubps {{.*#+}} xmm0 = (xmm1 * xmm3) - xmm0 516; BDVER2-NEXT: vfnmaddps {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3 517; BDVER2-NEXT: retq 518; 519; BTVER2-LABEL: v4f32_one_step_variables: 520; BTVER2: # %bb.0: 521; BTVER2-NEXT: vrcpps %xmm1, %xmm2 522; BTVER2-NEXT: vmulps %xmm2, %xmm0, %xmm3 523; BTVER2-NEXT: vmulps %xmm3, %xmm1, %xmm1 524; BTVER2-NEXT: vsubps %xmm1, %xmm0, %xmm0 525; BTVER2-NEXT: vmulps %xmm0, %xmm2, %xmm0 526; BTVER2-NEXT: vaddps %xmm0, %xmm3, %xmm0 527; BTVER2-NEXT: retq 528; 529; SANDY-LABEL: v4f32_one_step_variables: 530; SANDY: # %bb.0: 531; SANDY-NEXT: vrcpps %xmm1, %xmm2 532; SANDY-NEXT: vmulps %xmm2, %xmm0, %xmm3 533; SANDY-NEXT: vmulps %xmm3, %xmm1, %xmm1 534; SANDY-NEXT: vsubps %xmm1, %xmm0, %xmm0 535; SANDY-NEXT: vmulps %xmm0, %xmm2, %xmm0 536; SANDY-NEXT: vaddps %xmm0, %xmm3, %xmm0 537; SANDY-NEXT: retq 538; 539; HASWELL-LABEL: v4f32_one_step_variables: 540; HASWELL: # %bb.0: 541; HASWELL-NEXT: vrcpps %xmm1, %xmm2 542; HASWELL-NEXT: vmulps %xmm2, %xmm0, %xmm3 543; HASWELL-NEXT: vfmsub231ps {{.*#+}} xmm0 = (xmm3 * xmm1) - xmm0 544; HASWELL-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3 545; HASWELL-NEXT: retq 546; 547; 
HASWELL-NO-FMA-LABEL: v4f32_one_step_variables: 548; HASWELL-NO-FMA: # %bb.0: 549; HASWELL-NO-FMA-NEXT: vrcpps %xmm1, %xmm2 550; HASWELL-NO-FMA-NEXT: vmulps %xmm2, %xmm0, %xmm3 551; HASWELL-NO-FMA-NEXT: vmulps %xmm3, %xmm1, %xmm1 552; HASWELL-NO-FMA-NEXT: vsubps %xmm1, %xmm0, %xmm0 553; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm2, %xmm0 554; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm3, %xmm0 555; HASWELL-NO-FMA-NEXT: retq 556; 557; AVX512-LABEL: v4f32_one_step_variables: 558; AVX512: # %bb.0: 559; AVX512-NEXT: vrcpps %xmm1, %xmm2 560; AVX512-NEXT: vmulps %xmm2, %xmm0, %xmm3 561; AVX512-NEXT: vfmsub231ps {{.*#+}} xmm0 = (xmm3 * xmm1) - xmm0 562; AVX512-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3 563; AVX512-NEXT: retq 564 %div = fdiv fast <4 x float> %x, %y 565 ret <4 x float> %div 566} 567 568define <4 x float> @v4f32_two_step(<4 x float> %x) #2 { 569; SSE-LABEL: v4f32_two_step: 570; SSE: # %bb.0: 571; SSE-NEXT: rcpps %xmm0, %xmm2 572; SSE-NEXT: movaps %xmm0, %xmm3 573; SSE-NEXT: mulps %xmm2, %xmm3 574; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 575; SSE-NEXT: movaps %xmm1, %xmm4 576; SSE-NEXT: subps %xmm3, %xmm4 577; SSE-NEXT: mulps %xmm2, %xmm4 578; SSE-NEXT: addps %xmm2, %xmm4 579; SSE-NEXT: mulps %xmm4, %xmm0 580; SSE-NEXT: subps %xmm0, %xmm1 581; SSE-NEXT: mulps %xmm4, %xmm1 582; SSE-NEXT: addps %xmm4, %xmm1 583; SSE-NEXT: movaps %xmm1, %xmm0 584; SSE-NEXT: retq 585; 586; AVX-RECIP-LABEL: v4f32_two_step: 587; AVX-RECIP: # %bb.0: 588; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1 589; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm2 590; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 591; AVX-RECIP-NEXT: vsubps %xmm2, %xmm3, %xmm2 592; AVX-RECIP-NEXT: vmulps %xmm2, %xmm1, %xmm2 593; AVX-RECIP-NEXT: vaddps %xmm2, %xmm1, %xmm1 594; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm0 595; AVX-RECIP-NEXT: vsubps %xmm0, %xmm3, %xmm0 596; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0 597; AVX-RECIP-NEXT: vaddps %xmm0, %xmm1, %xmm0 598; 
AVX-RECIP-NEXT: retq 599; 600; FMA-RECIP-LABEL: v4f32_two_step: 601; FMA-RECIP: # %bb.0: 602; FMA-RECIP-NEXT: vrcpps %xmm0, %xmm1 603; FMA-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 604; FMA-RECIP-NEXT: vmovaps %xmm1, %xmm3 605; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} xmm3 = (xmm0 * xmm3) - xmm2 606; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} xmm3 = -(xmm3 * xmm1) + xmm1 607; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm3 * xmm0) - xmm2 608; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} xmm0 = -(xmm0 * xmm3) + xmm3 609; FMA-RECIP-NEXT: retq 610; 611; BDVER2-LABEL: v4f32_two_step: 612; BDVER2: # %bb.0: 613; BDVER2-NEXT: vrcpps %xmm0, %xmm1 614; BDVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 615; BDVER2-NEXT: vfmsubps {{.*#+}} xmm3 = (xmm0 * xmm1) - xmm2 616; BDVER2-NEXT: vfnmaddps {{.*#+}} xmm1 = -(xmm1 * xmm3) + xmm1 617; BDVER2-NEXT: vfmsubps {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm2 618; BDVER2-NEXT: vfnmaddps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm1 619; BDVER2-NEXT: retq 620; 621; BTVER2-LABEL: v4f32_two_step: 622; BTVER2: # %bb.0: 623; BTVER2-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 624; BTVER2-NEXT: vrcpps %xmm0, %xmm1 625; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm2 626; BTVER2-NEXT: vsubps %xmm2, %xmm3, %xmm2 627; BTVER2-NEXT: vmulps %xmm2, %xmm1, %xmm2 628; BTVER2-NEXT: vaddps %xmm2, %xmm1, %xmm1 629; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 630; BTVER2-NEXT: vsubps %xmm0, %xmm3, %xmm0 631; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0 632; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 633; BTVER2-NEXT: retq 634; 635; SANDY-LABEL: v4f32_two_step: 636; SANDY: # %bb.0: 637; SANDY-NEXT: vrcpps %xmm0, %xmm1 638; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm2 639; SANDY-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 640; SANDY-NEXT: vsubps %xmm2, %xmm3, %xmm2 641; SANDY-NEXT: vmulps %xmm2, %xmm1, %xmm2 642; SANDY-NEXT: vaddps %xmm2, %xmm1, %xmm1 643; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 644; SANDY-NEXT: vsubps %xmm0, %xmm3, %xmm0 
645; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 646; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 647; SANDY-NEXT: retq 648; 649; HASWELL-LABEL: v4f32_two_step: 650; HASWELL: # %bb.0: 651; HASWELL-NEXT: vrcpps %xmm0, %xmm1 652; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 653; HASWELL-NEXT: vmovaps %xmm1, %xmm3 654; HASWELL-NEXT: vfmsub213ps {{.*#+}} xmm3 = (xmm0 * xmm3) - xmm2 655; HASWELL-NEXT: vfnmadd132ps {{.*#+}} xmm3 = -(xmm3 * xmm1) + xmm1 656; HASWELL-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm3 * xmm0) - xmm2 657; HASWELL-NEXT: vfnmadd132ps {{.*#+}} xmm0 = -(xmm0 * xmm3) + xmm3 658; HASWELL-NEXT: retq 659; 660; HASWELL-NO-FMA-LABEL: v4f32_two_step: 661; HASWELL-NO-FMA: # %bb.0: 662; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1 663; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm2 664; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 665; HASWELL-NO-FMA-NEXT: vsubps %xmm2, %xmm3, %xmm2 666; HASWELL-NO-FMA-NEXT: vmulps %xmm2, %xmm1, %xmm2 667; HASWELL-NO-FMA-NEXT: vaddps %xmm2, %xmm1, %xmm1 668; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 669; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm3, %xmm0 670; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 671; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0 672; HASWELL-NO-FMA-NEXT: retq 673; 674; AVX512-LABEL: v4f32_two_step: 675; AVX512: # %bb.0: 676; AVX512-NEXT: vrcpps %xmm0, %xmm1 677; AVX512-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 678; AVX512-NEXT: vmovaps %xmm1, %xmm3 679; AVX512-NEXT: vfmsub213ps {{.*#+}} xmm3 = (xmm0 * xmm3) - xmm2 680; AVX512-NEXT: vfnmadd132ps {{.*#+}} xmm3 = -(xmm3 * xmm1) + xmm1 681; AVX512-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm3 * xmm0) - xmm2 682; AVX512-NEXT: vfnmadd132ps {{.*#+}} xmm0 = -(xmm0 * xmm3) + xmm3 683; AVX512-NEXT: retq 684 %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x 685 ret <4 x float> %div 686} 687 688define <8 x float> @v8f32_no_estimate(<8 x float> %x) #0 { 689; SSE-LABEL: 
v8f32_no_estimate: 690; SSE: # %bb.0: 691; SSE-NEXT: movaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 692; SSE-NEXT: movaps %xmm2, %xmm3 693; SSE-NEXT: divps %xmm0, %xmm3 694; SSE-NEXT: divps %xmm1, %xmm2 695; SSE-NEXT: movaps %xmm3, %xmm0 696; SSE-NEXT: movaps %xmm2, %xmm1 697; SSE-NEXT: retq 698; 699; AVX-RECIP-LABEL: v8f32_no_estimate: 700; AVX-RECIP: # %bb.0: 701; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 702; AVX-RECIP-NEXT: vdivps %ymm0, %ymm1, %ymm0 703; AVX-RECIP-NEXT: retq 704; 705; FMA-RECIP-LABEL: v8f32_no_estimate: 706; FMA-RECIP: # %bb.0: 707; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 708; FMA-RECIP-NEXT: vdivps %ymm0, %ymm1, %ymm0 709; FMA-RECIP-NEXT: retq 710; 711; BDVER2-LABEL: v8f32_no_estimate: 712; BDVER2: # %bb.0: 713; BDVER2-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 714; BDVER2-NEXT: vdivps %ymm0, %ymm1, %ymm0 715; BDVER2-NEXT: retq 716; 717; BTVER2-LABEL: v8f32_no_estimate: 718; BTVER2: # %bb.0: 719; BTVER2-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 720; BTVER2-NEXT: vdivps %ymm0, %ymm1, %ymm0 721; BTVER2-NEXT: retq 722; 723; SANDY-LABEL: v8f32_no_estimate: 724; SANDY: # %bb.0: 725; SANDY-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 726; SANDY-NEXT: vdivps %ymm0, %ymm1, %ymm0 727; SANDY-NEXT: retq 728; 729; HASWELL-LABEL: v8f32_no_estimate: 730; HASWELL: # %bb.0: 731; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 732; HASWELL-NEXT: vdivps %ymm0, %ymm1, %ymm0 733; HASWELL-NEXT: retq 734; 735; HASWELL-NO-FMA-LABEL: v8f32_no_estimate: 736; HASWELL-NO-FMA: # %bb.0: 737; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 738; HASWELL-NO-FMA-NEXT: vdivps %ymm0, %ymm1, %ymm0 739; 
HASWELL-NO-FMA-NEXT: retq 740; 741; AVX512-LABEL: v8f32_no_estimate: 742; AVX512: # %bb.0: 743; AVX512-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 744; AVX512-NEXT: vdivps %ymm0, %ymm1, %ymm0 745; AVX512-NEXT: retq 746 %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x 747 ret <8 x float> %div 748} 749 750define <8 x float> @v8f32_one_step(<8 x float> %x) #1 { 751; SSE-LABEL: v8f32_one_step: 752; SSE: # %bb.0: 753; SSE-NEXT: rcpps %xmm0, %xmm4 754; SSE-NEXT: mulps %xmm4, %xmm0 755; SSE-NEXT: movaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 756; SSE-NEXT: movaps %xmm2, %xmm3 757; SSE-NEXT: subps %xmm0, %xmm3 758; SSE-NEXT: mulps %xmm4, %xmm3 759; SSE-NEXT: addps %xmm4, %xmm3 760; SSE-NEXT: rcpps %xmm1, %xmm0 761; SSE-NEXT: mulps %xmm0, %xmm1 762; SSE-NEXT: subps %xmm1, %xmm2 763; SSE-NEXT: mulps %xmm0, %xmm2 764; SSE-NEXT: addps %xmm0, %xmm2 765; SSE-NEXT: movaps %xmm3, %xmm0 766; SSE-NEXT: movaps %xmm2, %xmm1 767; SSE-NEXT: retq 768; 769; AVX-RECIP-LABEL: v8f32_one_step: 770; AVX-RECIP: # %bb.0: 771; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1 772; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm0 773; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 774; AVX-RECIP-NEXT: vsubps %ymm0, %ymm2, %ymm0 775; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0 776; AVX-RECIP-NEXT: vaddps %ymm0, %ymm1, %ymm0 777; AVX-RECIP-NEXT: retq 778; 779; FMA-RECIP-LABEL: v8f32_one_step: 780; FMA-RECIP: # %bb.0: 781; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1 782; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) - mem 783; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm1) + ymm1 784; FMA-RECIP-NEXT: retq 785; 786; BDVER2-LABEL: v8f32_one_step: 787; BDVER2: # %bb.0: 788; BDVER2-NEXT: vrcpps %ymm0, %ymm1 789; BDVER2-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm1) - mem 790; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm0 = -(ymm1 * ymm0) + 
ymm1 791; BDVER2-NEXT: retq 792; 793; BTVER2-LABEL: v8f32_one_step: 794; BTVER2: # %bb.0: 795; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 796; BTVER2-NEXT: vrcpps %ymm0, %ymm1 797; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 798; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0 799; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 800; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 801; BTVER2-NEXT: retq 802; 803; SANDY-LABEL: v8f32_one_step: 804; SANDY: # %bb.0: 805; SANDY-NEXT: vrcpps %ymm0, %ymm1 806; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 807; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 808; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0 809; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 810; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 811; SANDY-NEXT: retq 812; 813; HASWELL-LABEL: v8f32_one_step: 814; HASWELL: # %bb.0: 815; HASWELL-NEXT: vrcpps %ymm0, %ymm2 816; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 817; HASWELL-NEXT: vfmsub231ps {{.*#+}} ymm1 = (ymm2 * ymm0) - ymm1 818; HASWELL-NEXT: vfnmadd132ps {{.*#+}} ymm1 = -(ymm1 * ymm2) + ymm2 819; HASWELL-NEXT: vmovaps %ymm1, %ymm0 820; HASWELL-NEXT: retq 821; 822; HASWELL-NO-FMA-LABEL: v8f32_one_step: 823; HASWELL-NO-FMA: # %bb.0: 824; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 825; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 826; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 827; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm2, %ymm0 828; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 829; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0 830; HASWELL-NO-FMA-NEXT: retq 831; 832; KNL-LABEL: v8f32_one_step: 833; KNL: # %bb.0: 834; KNL-NEXT: vrcpps %ymm0, %ymm2 835; KNL-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 836; KNL-NEXT: vfmsub231ps {{.*#+}} ymm1 = (ymm2 * ymm0) - ymm1 837; KNL-NEXT: 
vfnmadd132ps {{.*#+}} ymm1 = -(ymm1 * ymm2) + ymm2 838; KNL-NEXT: vmovaps %ymm1, %ymm0 839; KNL-NEXT: retq 840; 841; SKX-LABEL: v8f32_one_step: 842; SKX: # %bb.0: 843; SKX-NEXT: vrcpps %ymm0, %ymm1 844; SKX-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) - mem 845; SKX-NEXT: vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm1) + ymm1 846; SKX-NEXT: retq 847 %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x 848 ret <8 x float> %div 849} 850 ; v8f32_two_step - fast 1.0 / %x on <8 x float> under attribute group #2. The checks expect the reciprocal estimate followed by two refinement rounds, with the splatted 1.0 constant held in a register and reused by both rounds. KNL and SKX share the common AVX512 prefix for this function. These assertions are autogenerated, so regenerate them with the update script rather than editing by hand. 851define <8 x float> @v8f32_two_step(<8 x float> %x) #2 { 852; SSE-LABEL: v8f32_two_step: 853; SSE: # %bb.0: 854; SSE-NEXT: movaps %xmm1, %xmm2 855; SSE-NEXT: rcpps %xmm0, %xmm3 856; SSE-NEXT: movaps %xmm0, %xmm4 857; SSE-NEXT: mulps %xmm3, %xmm4 858; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 859; SSE-NEXT: movaps %xmm1, %xmm5 860; SSE-NEXT: subps %xmm4, %xmm5 861; SSE-NEXT: mulps %xmm3, %xmm5 862; SSE-NEXT: addps %xmm3, %xmm5 863; SSE-NEXT: mulps %xmm5, %xmm0 864; SSE-NEXT: movaps %xmm1, %xmm3 865; SSE-NEXT: subps %xmm0, %xmm3 866; SSE-NEXT: mulps %xmm5, %xmm3 867; SSE-NEXT: addps %xmm5, %xmm3 868; SSE-NEXT: rcpps %xmm2, %xmm0 869; SSE-NEXT: movaps %xmm2, %xmm4 870; SSE-NEXT: mulps %xmm0, %xmm4 871; SSE-NEXT: movaps %xmm1, %xmm5 872; SSE-NEXT: subps %xmm4, %xmm5 873; SSE-NEXT: mulps %xmm0, %xmm5 874; SSE-NEXT: addps %xmm0, %xmm5 875; SSE-NEXT: mulps %xmm5, %xmm2 876; SSE-NEXT: subps %xmm2, %xmm1 877; SSE-NEXT: mulps %xmm5, %xmm1 878; SSE-NEXT: addps %xmm5, %xmm1 879; SSE-NEXT: movaps %xmm3, %xmm0 880; SSE-NEXT: retq 881; 882; AVX-RECIP-LABEL: v8f32_two_step: 883; AVX-RECIP: # %bb.0: 884; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1 885; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm2 886; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 887; AVX-RECIP-NEXT: vsubps %ymm2, %ymm3, %ymm2 888; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm2 889; AVX-RECIP-NEXT: vaddps %ymm2, %ymm1, %ymm1 890; AVX-RECIP-NEXT: vmulps 
%ymm1, %ymm0, %ymm0 891; AVX-RECIP-NEXT: vsubps %ymm0, %ymm3, %ymm0 892; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0 893; AVX-RECIP-NEXT: vaddps %ymm0, %ymm1, %ymm0 894; AVX-RECIP-NEXT: retq 895; 896; FMA-RECIP-LABEL: v8f32_two_step: 897; FMA-RECIP: # %bb.0: 898; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1 899; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 900; FMA-RECIP-NEXT: vmovaps %ymm1, %ymm3 901; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm3 = (ymm0 * ymm3) - ymm2 902; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm3 = -(ymm3 * ymm1) + ymm1 903; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm3 * ymm0) - ymm2 904; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm3) + ymm3 905; FMA-RECIP-NEXT: retq 906; 907; BDVER2-LABEL: v8f32_two_step: 908; BDVER2: # %bb.0: 909; BDVER2-NEXT: vrcpps %ymm0, %ymm1 910; BDVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 911; BDVER2-NEXT: vfmsubps {{.*#+}} ymm3 = (ymm0 * ymm1) - ymm2 912; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm1 = -(ymm1 * ymm3) + ymm1 913; BDVER2-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm1) - ymm2 914; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm1 915; BDVER2-NEXT: retq 916; 917; BTVER2-LABEL: v8f32_two_step: 918; BTVER2: # %bb.0: 919; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 920; BTVER2-NEXT: vrcpps %ymm0, %ymm1 921; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm2 922; BTVER2-NEXT: vsubps %ymm2, %ymm3, %ymm2 923; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm2 924; BTVER2-NEXT: vaddps %ymm2, %ymm1, %ymm1 925; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 926; BTVER2-NEXT: vsubps %ymm0, %ymm3, %ymm0 927; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 928; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 929; BTVER2-NEXT: retq 930; 931; SANDY-LABEL: v8f32_two_step: 932; SANDY: # %bb.0: 933; SANDY-NEXT: vrcpps %ymm0, %ymm1 934; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm2 935; SANDY-NEXT: vmovaps {{.*#+}} ymm3 
= [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 936; SANDY-NEXT: vsubps %ymm2, %ymm3, %ymm2 937; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm2 938; SANDY-NEXT: vaddps %ymm2, %ymm1, %ymm1 939; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 940; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0 941; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 942; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 943; SANDY-NEXT: retq 944; 945; HASWELL-LABEL: v8f32_two_step: 946; HASWELL: # %bb.0: 947; HASWELL-NEXT: vrcpps %ymm0, %ymm1 948; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 949; HASWELL-NEXT: vmovaps %ymm1, %ymm3 950; HASWELL-NEXT: vfmsub213ps {{.*#+}} ymm3 = (ymm0 * ymm3) - ymm2 951; HASWELL-NEXT: vfnmadd132ps {{.*#+}} ymm3 = -(ymm3 * ymm1) + ymm1 952; HASWELL-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm3 * ymm0) - ymm2 953; HASWELL-NEXT: vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm3) + ymm3 954; HASWELL-NEXT: retq 955; 956; HASWELL-NO-FMA-LABEL: v8f32_two_step: 957; HASWELL-NO-FMA: # %bb.0: 958; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 959; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm2 960; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 961; HASWELL-NO-FMA-NEXT: vsubps %ymm2, %ymm3, %ymm2 962; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm2 963; HASWELL-NO-FMA-NEXT: vaddps %ymm2, %ymm1, %ymm1 964; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 965; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm3, %ymm0 966; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 967; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0 968; HASWELL-NO-FMA-NEXT: retq 969; 970; AVX512-LABEL: v8f32_two_step: 971; AVX512: # %bb.0: 972; AVX512-NEXT: vrcpps %ymm0, %ymm1 973; AVX512-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 974; AVX512-NEXT: vmovaps %ymm1, %ymm3 975; AVX512-NEXT: vfmsub213ps {{.*#+}} ymm3 = (ymm0 * ymm3) - ymm2 976; AVX512-NEXT: vfnmadd132ps {{.*#+}} ymm3 = -(ymm3 * 
ymm1) + ymm1 977; AVX512-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm3 * ymm0) - ymm2 978; AVX512-NEXT: vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm3) + ymm3 979; AVX512-NEXT: retq 980 %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x 981 ret <8 x float> %div 982} 983 ; v16f32_no_estimate - fast 1.0 / %x on <16 x float> under attribute group #0. The checks expect real divides (divps or vdivps) and no reciprocal estimate - four xmm divides for the SSE prefix, two ymm divides for the AVX-class prefixes, and one zmm divide for the AVX512 prefix. These assertions are autogenerated, so regenerate them with the update script rather than editing by hand. 984define <16 x float> @v16f32_no_estimate(<16 x float> %x) #0 { 985; SSE-LABEL: v16f32_no_estimate: 986; SSE: # %bb.0: 987; SSE-NEXT: movaps {{.*#+}} xmm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 988; SSE-NEXT: movaps %xmm4, %xmm5 989; SSE-NEXT: divps %xmm0, %xmm5 990; SSE-NEXT: movaps %xmm4, %xmm6 991; SSE-NEXT: divps %xmm1, %xmm6 992; SSE-NEXT: movaps %xmm4, %xmm7 993; SSE-NEXT: divps %xmm2, %xmm7 994; SSE-NEXT: divps %xmm3, %xmm4 995; SSE-NEXT: movaps %xmm5, %xmm0 996; SSE-NEXT: movaps %xmm6, %xmm1 997; SSE-NEXT: movaps %xmm7, %xmm2 998; SSE-NEXT: movaps %xmm4, %xmm3 999; SSE-NEXT: retq 1000; 1001; AVX-RECIP-LABEL: v16f32_no_estimate: 1002; AVX-RECIP: # %bb.0: 1003; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 1004; AVX-RECIP-NEXT: vdivps %ymm0, %ymm2, %ymm0 1005; AVX-RECIP-NEXT: vdivps %ymm1, %ymm2, %ymm1 1006; AVX-RECIP-NEXT: retq 1007; 1008; FMA-RECIP-LABEL: v16f32_no_estimate: 1009; FMA-RECIP: # %bb.0: 1010; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 1011; FMA-RECIP-NEXT: vdivps %ymm0, %ymm2, %ymm0 1012; FMA-RECIP-NEXT: vdivps %ymm1, %ymm2, %ymm1 1013; FMA-RECIP-NEXT: retq 1014; 1015; BDVER2-LABEL: v16f32_no_estimate: 1016; BDVER2: # %bb.0: 1017; BDVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 1018; BDVER2-NEXT: vdivps %ymm0, %ymm2, %ymm0 1019; BDVER2-NEXT: vdivps %ymm1, %ymm2, %ymm1 1020; BDVER2-NEXT: retq 1021; 1022; BTVER2-LABEL: v16f32_no_estimate: 1023; BTVER2: # %bb.0: 1024; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 1025; 
BTVER2-NEXT: vdivps %ymm0, %ymm2, %ymm0 1026; BTVER2-NEXT: vdivps %ymm1, %ymm2, %ymm1 1027; BTVER2-NEXT: retq 1028; 1029; SANDY-LABEL: v16f32_no_estimate: 1030; SANDY: # %bb.0: 1031; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 1032; SANDY-NEXT: vdivps %ymm0, %ymm2, %ymm0 1033; SANDY-NEXT: vdivps %ymm1, %ymm2, %ymm1 1034; SANDY-NEXT: retq 1035; 1036; HASWELL-LABEL: v16f32_no_estimate: 1037; HASWELL: # %bb.0: 1038; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 1039; HASWELL-NEXT: vdivps %ymm0, %ymm2, %ymm0 1040; HASWELL-NEXT: vdivps %ymm1, %ymm2, %ymm1 1041; HASWELL-NEXT: retq 1042; 1043; HASWELL-NO-FMA-LABEL: v16f32_no_estimate: 1044; HASWELL-NO-FMA: # %bb.0: 1045; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 1046; HASWELL-NO-FMA-NEXT: vdivps %ymm0, %ymm2, %ymm0 1047; HASWELL-NO-FMA-NEXT: vdivps %ymm1, %ymm2, %ymm1 1048; HASWELL-NO-FMA-NEXT: retq 1049; 1050; AVX512-LABEL: v16f32_no_estimate: 1051; AVX512: # %bb.0: 1052; AVX512-NEXT: vbroadcastss {{.*#+}} zmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 1053; AVX512-NEXT: vdivps %zmm0, %zmm1, %zmm0 1054; AVX512-NEXT: retq 1055 %div = fdiv fast <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x 1056 ret <16 x float> %div 1057} 1058 ; v16f32_one_step - fast 1.0 / %x on <16 x float> under attribute group #1. The checks expect one estimate-plus-single-refinement sequence per register-sized piece (four xmm pieces for the SSE prefix, two ymm pieces for the AVX-class prefixes), while the AVX512 prefix expects a single zmm vrcp14ps estimate with one fused refinement round. These assertions are autogenerated, so regenerate them with the update script rather than editing by hand. 1059define <16 x float> @v16f32_one_step(<16 x float> %x) #1 { 1060; SSE-LABEL: v16f32_one_step: 1061; SSE: # %bb.0: 1062; SSE-NEXT: movaps %xmm3, %xmm4 1063; SSE-NEXT: movaps %xmm0, %xmm5 1064; SSE-NEXT: rcpps %xmm0, %xmm6 1065; SSE-NEXT: mulps %xmm6, %xmm5 1066; SSE-NEXT: movaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 1067; SSE-NEXT: movaps %xmm3, %xmm0 1068; SSE-NEXT: subps %xmm5, %xmm0 1069; SSE-NEXT: 
mulps %xmm6, %xmm0 1070; SSE-NEXT: addps %xmm6, %xmm0 1071; SSE-NEXT: rcpps %xmm1, %xmm6 1072; SSE-NEXT: mulps %xmm6, %xmm1 1073; SSE-NEXT: movaps %xmm3, %xmm5 1074; SSE-NEXT: subps %xmm1, %xmm5 1075; SSE-NEXT: mulps %xmm6, %xmm5 1076; SSE-NEXT: addps %xmm6, %xmm5 1077; SSE-NEXT: rcpps %xmm2, %xmm1 1078; SSE-NEXT: mulps %xmm1, %xmm2 1079; SSE-NEXT: movaps %xmm3, %xmm6 1080; SSE-NEXT: subps %xmm2, %xmm6 1081; SSE-NEXT: mulps %xmm1, %xmm6 1082; SSE-NEXT: addps %xmm1, %xmm6 1083; SSE-NEXT: rcpps %xmm4, %xmm1 1084; SSE-NEXT: mulps %xmm1, %xmm4 1085; SSE-NEXT: subps %xmm4, %xmm3 1086; SSE-NEXT: mulps %xmm1, %xmm3 1087; SSE-NEXT: addps %xmm1, %xmm3 1088; SSE-NEXT: movaps %xmm5, %xmm1 1089; SSE-NEXT: movaps %xmm6, %xmm2 1090; SSE-NEXT: retq 1091; 1092; AVX-RECIP-LABEL: v16f32_one_step: 1093; AVX-RECIP: # %bb.0: 1094; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm2 1095; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm0 1096; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 1097; AVX-RECIP-NEXT: vsubps %ymm0, %ymm3, %ymm0 1098; AVX-RECIP-NEXT: vmulps %ymm0, %ymm2, %ymm0 1099; AVX-RECIP-NEXT: vaddps %ymm0, %ymm2, %ymm0 1100; AVX-RECIP-NEXT: vrcpps %ymm1, %ymm2 1101; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm1 1102; AVX-RECIP-NEXT: vsubps %ymm1, %ymm3, %ymm1 1103; AVX-RECIP-NEXT: vmulps %ymm1, %ymm2, %ymm1 1104; AVX-RECIP-NEXT: vaddps %ymm1, %ymm2, %ymm1 1105; AVX-RECIP-NEXT: retq 1106; 1107; FMA-RECIP-LABEL: v16f32_one_step: 1108; FMA-RECIP: # %bb.0: 1109; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm2 1110; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 1111; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm3 1112; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm2) + ymm2 1113; FMA-RECIP-NEXT: vrcpps %ymm1, %ymm2 1114; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm1 = (ymm2 * ymm1) - ymm3 1115; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm1 = -(ymm1 * ymm2) + ymm2 1116; FMA-RECIP-NEXT: retq 
1117; 1118; BDVER2-LABEL: v16f32_one_step: 1119; BDVER2: # %bb.0: 1120; BDVER2-NEXT: vrcpps %ymm0, %ymm2 1121; BDVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 1122; BDVER2-NEXT: vrcpps %ymm1, %ymm4 1123; BDVER2-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm2) - ymm3 1124; BDVER2-NEXT: vfmsubps {{.*#+}} ymm1 = (ymm1 * ymm4) - ymm3 1125; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm2 1126; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm1 = -(ymm4 * ymm1) + ymm4 1127; BDVER2-NEXT: retq 1128; 1129; BTVER2-LABEL: v16f32_one_step: 1130; BTVER2: # %bb.0: 1131; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 1132; BTVER2-NEXT: vrcpps %ymm0, %ymm2 1133; BTVER2-NEXT: vrcpps %ymm1, %ymm4 1134; BTVER2-NEXT: vmulps %ymm2, %ymm0, %ymm0 1135; BTVER2-NEXT: vmulps %ymm4, %ymm1, %ymm1 1136; BTVER2-NEXT: vsubps %ymm0, %ymm3, %ymm0 1137; BTVER2-NEXT: vsubps %ymm1, %ymm3, %ymm1 1138; BTVER2-NEXT: vmulps %ymm0, %ymm2, %ymm0 1139; BTVER2-NEXT: vmulps %ymm1, %ymm4, %ymm1 1140; BTVER2-NEXT: vaddps %ymm0, %ymm2, %ymm0 1141; BTVER2-NEXT: vaddps %ymm1, %ymm4, %ymm1 1142; BTVER2-NEXT: retq 1143; 1144; SANDY-LABEL: v16f32_one_step: 1145; SANDY: # %bb.0: 1146; SANDY-NEXT: vrcpps %ymm0, %ymm2 1147; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm0 1148; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 1149; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0 1150; SANDY-NEXT: vmulps %ymm0, %ymm2, %ymm0 1151; SANDY-NEXT: vaddps %ymm0, %ymm2, %ymm0 1152; SANDY-NEXT: vrcpps %ymm1, %ymm2 1153; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm1 1154; SANDY-NEXT: vsubps %ymm1, %ymm3, %ymm1 1155; SANDY-NEXT: vmulps %ymm1, %ymm2, %ymm1 1156; SANDY-NEXT: vaddps %ymm1, %ymm2, %ymm1 1157; SANDY-NEXT: retq 1158; 1159; HASWELL-LABEL: v16f32_one_step: 1160; HASWELL: # %bb.0: 1161; HASWELL-NEXT: vrcpps %ymm0, %ymm2 1162; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm3 = 
[1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 1163; HASWELL-NEXT: vrcpps %ymm1, %ymm4 1164; HASWELL-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm3 1165; HASWELL-NEXT: vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm2) + ymm2 1166; HASWELL-NEXT: vfmsub213ps {{.*#+}} ymm1 = (ymm4 * ymm1) - ymm3 1167; HASWELL-NEXT: vfnmadd132ps {{.*#+}} ymm1 = -(ymm1 * ymm4) + ymm4 1168; HASWELL-NEXT: retq 1169; 1170; HASWELL-NO-FMA-LABEL: v16f32_one_step: 1171; HASWELL-NO-FMA: # %bb.0: 1172; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm2 1173; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm0, %ymm0 1174; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 1175; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm3, %ymm0 1176; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm2, %ymm0 1177; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm2, %ymm0 1178; HASWELL-NO-FMA-NEXT: vrcpps %ymm1, %ymm2 1179; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm1 1180; HASWELL-NO-FMA-NEXT: vsubps %ymm1, %ymm3, %ymm1 1181; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm2, %ymm1 1182; HASWELL-NO-FMA-NEXT: vaddps %ymm1, %ymm2, %ymm1 1183; HASWELL-NO-FMA-NEXT: retq 1184; 1185; AVX512-LABEL: v16f32_one_step: 1186; AVX512: # %bb.0: 1187; AVX512-NEXT: vrcp14ps %zmm0, %zmm1 1188; AVX512-NEXT: vfmsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) - mem 1189; AVX512-NEXT: vfnmadd132ps {{.*#+}} zmm0 = -(zmm0 * zmm1) + zmm1 1190; AVX512-NEXT: retq 1191 %div = fdiv fast <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x 1192 ret <16 x float> %div 1193} 1194 ; v16f32_two_step - fast 1.0 / %x on <16 x float> under attribute group #2. The checks expect every register-sized piece to get a reciprocal estimate followed by two refinement rounds, while the AVX512 prefix expects this once on a full zmm register starting from vrcp14ps. These assertions are autogenerated, so regenerate them with the update script rather than editing by hand. 1195define <16 x float> @v16f32_two_step(<16 x float> %x) #2 { 1196; SSE-LABEL: v16f32_two_step: 1197; SSE: # %bb.0: 1198; SSE-NEXT: movaps %xmm3, %xmm4 1199; SSE-NEXT: movaps %xmm1, %xmm5 1200; SSE-NEXT: movaps %xmm0, %xmm1 1201; SSE-NEXT: rcpps %xmm0, %xmm0 1202; SSE-NEXT: movaps %xmm1, %xmm6 1203; SSE-NEXT: mulps 
%xmm0, %xmm6 1204; SSE-NEXT: movaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 1205; SSE-NEXT: movaps %xmm3, %xmm7 1206; SSE-NEXT: subps %xmm6, %xmm7 1207; SSE-NEXT: mulps %xmm0, %xmm7 1208; SSE-NEXT: addps %xmm0, %xmm7 1209; SSE-NEXT: mulps %xmm7, %xmm1 1210; SSE-NEXT: movaps %xmm3, %xmm0 1211; SSE-NEXT: subps %xmm1, %xmm0 1212; SSE-NEXT: mulps %xmm7, %xmm0 1213; SSE-NEXT: addps %xmm7, %xmm0 1214; SSE-NEXT: rcpps %xmm5, %xmm1 1215; SSE-NEXT: movaps %xmm5, %xmm6 1216; SSE-NEXT: mulps %xmm1, %xmm6 1217; SSE-NEXT: movaps %xmm3, %xmm7 1218; SSE-NEXT: subps %xmm6, %xmm7 1219; SSE-NEXT: mulps %xmm1, %xmm7 1220; SSE-NEXT: addps %xmm1, %xmm7 1221; SSE-NEXT: mulps %xmm7, %xmm5 1222; SSE-NEXT: movaps %xmm3, %xmm1 1223; SSE-NEXT: subps %xmm5, %xmm1 1224; SSE-NEXT: mulps %xmm7, %xmm1 1225; SSE-NEXT: addps %xmm7, %xmm1 1226; SSE-NEXT: rcpps %xmm2, %xmm5 1227; SSE-NEXT: movaps %xmm2, %xmm6 1228; SSE-NEXT: mulps %xmm5, %xmm6 1229; SSE-NEXT: movaps %xmm3, %xmm7 1230; SSE-NEXT: subps %xmm6, %xmm7 1231; SSE-NEXT: mulps %xmm5, %xmm7 1232; SSE-NEXT: addps %xmm5, %xmm7 1233; SSE-NEXT: mulps %xmm7, %xmm2 1234; SSE-NEXT: movaps %xmm3, %xmm5 1235; SSE-NEXT: subps %xmm2, %xmm5 1236; SSE-NEXT: mulps %xmm7, %xmm5 1237; SSE-NEXT: addps %xmm7, %xmm5 1238; SSE-NEXT: rcpps %xmm4, %xmm2 1239; SSE-NEXT: movaps %xmm4, %xmm6 1240; SSE-NEXT: mulps %xmm2, %xmm6 1241; SSE-NEXT: movaps %xmm3, %xmm7 1242; SSE-NEXT: subps %xmm6, %xmm7 1243; SSE-NEXT: mulps %xmm2, %xmm7 1244; SSE-NEXT: addps %xmm2, %xmm7 1245; SSE-NEXT: mulps %xmm7, %xmm4 1246; SSE-NEXT: subps %xmm4, %xmm3 1247; SSE-NEXT: mulps %xmm7, %xmm3 1248; SSE-NEXT: addps %xmm7, %xmm3 1249; SSE-NEXT: movaps %xmm5, %xmm2 1250; SSE-NEXT: retq 1251; 1252; AVX-RECIP-LABEL: v16f32_two_step: 1253; AVX-RECIP: # %bb.0: 1254; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm2 1255; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm3 1256; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 1257; AVX-RECIP-NEXT: vsubps %ymm3, %ymm4, 
%ymm3 1258; AVX-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm3 1259; AVX-RECIP-NEXT: vaddps %ymm3, %ymm2, %ymm2 1260; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm0 1261; AVX-RECIP-NEXT: vsubps %ymm0, %ymm4, %ymm0 1262; AVX-RECIP-NEXT: vmulps %ymm0, %ymm2, %ymm0 1263; AVX-RECIP-NEXT: vaddps %ymm0, %ymm2, %ymm0 1264; AVX-RECIP-NEXT: vrcpps %ymm1, %ymm2 1265; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm3 1266; AVX-RECIP-NEXT: vsubps %ymm3, %ymm4, %ymm3 1267; AVX-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm3 1268; AVX-RECIP-NEXT: vaddps %ymm3, %ymm2, %ymm2 1269; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm1 1270; AVX-RECIP-NEXT: vsubps %ymm1, %ymm4, %ymm1 1271; AVX-RECIP-NEXT: vmulps %ymm1, %ymm2, %ymm1 1272; AVX-RECIP-NEXT: vaddps %ymm1, %ymm2, %ymm1 1273; AVX-RECIP-NEXT: retq 1274; 1275; FMA-RECIP-LABEL: v16f32_two_step: 1276; FMA-RECIP: # %bb.0: 1277; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm2 1278; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 1279; FMA-RECIP-NEXT: vmovaps %ymm2, %ymm4 1280; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm4 = (ymm0 * ymm4) - ymm3 1281; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm4 = -(ymm4 * ymm2) + ymm2 1282; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm4 * ymm0) - ymm3 1283; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm4) + ymm4 1284; FMA-RECIP-NEXT: vrcpps %ymm1, %ymm2 1285; FMA-RECIP-NEXT: vmovaps %ymm2, %ymm4 1286; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm4 = (ymm1 * ymm4) - ymm3 1287; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm4 = -(ymm4 * ymm2) + ymm2 1288; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm1 = (ymm4 * ymm1) - ymm3 1289; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm1 = -(ymm1 * ymm4) + ymm4 1290; FMA-RECIP-NEXT: retq 1291; 1292; BDVER2-LABEL: v16f32_two_step: 1293; BDVER2: # %bb.0: 1294; BDVER2-NEXT: vrcpps %ymm0, %ymm2 1295; BDVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 1296; BDVER2-NEXT: vfmsubps {{.*#+}} ymm4 = (ymm0 * ymm2) - ymm3 1297; BDVER2-NEXT: 
vfnmaddps {{.*#+}} ymm2 = -(ymm2 * ymm4) + ymm2 1298; BDVER2-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm2) - ymm3 1299; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm2 1300; BDVER2-NEXT: vrcpps %ymm1, %ymm2 1301; BDVER2-NEXT: vfmsubps {{.*#+}} ymm4 = (ymm1 * ymm2) - ymm3 1302; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm2 = -(ymm2 * ymm4) + ymm2 1303; BDVER2-NEXT: vfmsubps {{.*#+}} ymm1 = (ymm1 * ymm2) - ymm3 1304; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm2 1305; BDVER2-NEXT: retq 1306; 1307; BTVER2-LABEL: v16f32_two_step: 1308; BTVER2: # %bb.0: 1309; BTVER2-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 1310; BTVER2-NEXT: vrcpps %ymm0, %ymm2 1311; BTVER2-NEXT: vmulps %ymm2, %ymm0, %ymm3 1312; BTVER2-NEXT: vsubps %ymm3, %ymm4, %ymm3 1313; BTVER2-NEXT: vmulps %ymm3, %ymm2, %ymm3 1314; BTVER2-NEXT: vaddps %ymm3, %ymm2, %ymm2 1315; BTVER2-NEXT: vmulps %ymm2, %ymm0, %ymm0 1316; BTVER2-NEXT: vsubps %ymm0, %ymm4, %ymm0 1317; BTVER2-NEXT: vmulps %ymm0, %ymm2, %ymm0 1318; BTVER2-NEXT: vaddps %ymm0, %ymm2, %ymm0 1319; BTVER2-NEXT: vrcpps %ymm1, %ymm2 1320; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm3 1321; BTVER2-NEXT: vsubps %ymm3, %ymm4, %ymm3 1322; BTVER2-NEXT: vmulps %ymm3, %ymm2, %ymm3 1323; BTVER2-NEXT: vaddps %ymm3, %ymm2, %ymm2 1324; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm1 1325; BTVER2-NEXT: vsubps %ymm1, %ymm4, %ymm1 1326; BTVER2-NEXT: vmulps %ymm1, %ymm2, %ymm1 1327; BTVER2-NEXT: vaddps %ymm1, %ymm2, %ymm1 1328; BTVER2-NEXT: retq 1329; 1330; SANDY-LABEL: v16f32_two_step: 1331; SANDY: # %bb.0: 1332; SANDY-NEXT: vrcpps %ymm0, %ymm2 1333; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm3 1334; SANDY-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 1335; SANDY-NEXT: vsubps %ymm3, %ymm4, %ymm3 1336; SANDY-NEXT: vmulps %ymm3, %ymm2, %ymm3 1337; SANDY-NEXT: vaddps %ymm3, %ymm2, %ymm2 1338; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm0 1339; SANDY-NEXT: vsubps %ymm0, %ymm4, %ymm0 1340; 
SANDY-NEXT: vmulps %ymm0, %ymm2, %ymm0 1341; SANDY-NEXT: vaddps %ymm0, %ymm2, %ymm0 1342; SANDY-NEXT: vrcpps %ymm1, %ymm2 1343; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm3 1344; SANDY-NEXT: vsubps %ymm3, %ymm4, %ymm3 1345; SANDY-NEXT: vmulps %ymm3, %ymm2, %ymm3 1346; SANDY-NEXT: vaddps %ymm3, %ymm2, %ymm2 1347; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm1 1348; SANDY-NEXT: vsubps %ymm1, %ymm4, %ymm1 1349; SANDY-NEXT: vmulps %ymm1, %ymm2, %ymm1 1350; SANDY-NEXT: vaddps %ymm1, %ymm2, %ymm1 1351; SANDY-NEXT: retq 1352; 1353; HASWELL-LABEL: v16f32_two_step: 1354; HASWELL: # %bb.0: 1355; HASWELL-NEXT: vrcpps %ymm0, %ymm2 1356; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 1357; HASWELL-NEXT: vmovaps %ymm2, %ymm4 1358; HASWELL-NEXT: vfmsub213ps {{.*#+}} ymm4 = (ymm0 * ymm4) - ymm3 1359; HASWELL-NEXT: vfnmadd132ps {{.*#+}} ymm4 = -(ymm4 * ymm2) + ymm2 1360; HASWELL-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm4 * ymm0) - ymm3 1361; HASWELL-NEXT: vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm4) + ymm4 1362; HASWELL-NEXT: vrcpps %ymm1, %ymm2 1363; HASWELL-NEXT: vmovaps %ymm2, %ymm4 1364; HASWELL-NEXT: vfmsub213ps {{.*#+}} ymm4 = (ymm1 * ymm4) - ymm3 1365; HASWELL-NEXT: vfnmadd132ps {{.*#+}} ymm4 = -(ymm4 * ymm2) + ymm2 1366; HASWELL-NEXT: vfmsub213ps {{.*#+}} ymm1 = (ymm4 * ymm1) - ymm3 1367; HASWELL-NEXT: vfnmadd132ps {{.*#+}} ymm1 = -(ymm1 * ymm4) + ymm4 1368; HASWELL-NEXT: retq 1369; 1370; HASWELL-NO-FMA-LABEL: v16f32_two_step: 1371; HASWELL-NO-FMA: # %bb.0: 1372; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm2 1373; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm0, %ymm3 1374; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 1375; HASWELL-NO-FMA-NEXT: vsubps %ymm3, %ymm4, %ymm3 1376; HASWELL-NO-FMA-NEXT: vmulps %ymm3, %ymm2, %ymm3 1377; HASWELL-NO-FMA-NEXT: vaddps %ymm3, %ymm2, %ymm2 1378; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm0, %ymm0 1379; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm4, %ymm0 1380; 
HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm2, %ymm0 1381; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm2, %ymm0 1382; HASWELL-NO-FMA-NEXT: vrcpps %ymm1, %ymm2 1383; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm3 1384; HASWELL-NO-FMA-NEXT: vsubps %ymm3, %ymm4, %ymm3 1385; HASWELL-NO-FMA-NEXT: vmulps %ymm3, %ymm2, %ymm3 1386; HASWELL-NO-FMA-NEXT: vaddps %ymm3, %ymm2, %ymm2 1387; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm1 1388; HASWELL-NO-FMA-NEXT: vsubps %ymm1, %ymm4, %ymm1 1389; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm2, %ymm1 1390; HASWELL-NO-FMA-NEXT: vaddps %ymm1, %ymm2, %ymm1 1391; HASWELL-NO-FMA-NEXT: retq 1392; 1393; AVX512-LABEL: v16f32_two_step: 1394; AVX512: # %bb.0: 1395; AVX512-NEXT: vrcp14ps %zmm0, %zmm1 1396; AVX512-NEXT: vbroadcastss {{.*#+}} zmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 1397; AVX512-NEXT: vmovaps %zmm1, %zmm3 1398; AVX512-NEXT: vfmsub213ps {{.*#+}} zmm3 = (zmm0 * zmm3) - zmm2 1399; AVX512-NEXT: vfnmadd132ps {{.*#+}} zmm3 = -(zmm3 * zmm1) + zmm1 1400; AVX512-NEXT: vfmsub213ps {{.*#+}} zmm0 = (zmm3 * zmm0) - zmm2 1401; AVX512-NEXT: vfnmadd132ps {{.*#+}} zmm0 = -(zmm0 * zmm3) + zmm3 1402; AVX512-NEXT: retq 1403 %div = fdiv fast <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x 1404 ret <16 x float> %div 1405} 1406 ; Function attribute groups referenced by the test functions. All three set unsafe-fp-math and differ only in the reciprocal-estimates string - group #0 lists divf and vec-divf with a leading '!' (used by the *_no_estimate tests, whose checks expect real divides), group #1 lists them with no step count (used by the *_one_step tests, whose checks show one refinement round), and group #2 appends ':2' to each (used by the *_two_step tests, whose checks show two refinement rounds). 1407attributes #0 = { "unsafe-fp-math"="true" "reciprocal-estimates"="!divf,!vec-divf" } 1408attributes #1 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf,vec-divf" } 1409attributes #2 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf:2,vec-divf:2" } 1410 1411