1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2     | FileCheck %s --check-prefix=SSE
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx      | FileCheck %s --check-prefixes=AVX,AVX-RECIP
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma | FileCheck %s --check-prefixes=AVX,FMA-RECIP
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 | FileCheck %s --check-prefixes=AVX,BDVER2
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 | FileCheck %s --check-prefixes=AVX,BTVER2
7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,SANDY
8; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell | FileCheck %s --check-prefixes=AVX,HASWELL
9; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell -mattr=-fma | FileCheck %s --check-prefixes=AVX,HASWELL-NO-FMA
10; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl | FileCheck %s --check-prefixes=AVX,AVX512,KNL
11; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx | FileCheck %s --check-prefixes=AVX,AVX512,SKX
12
13; If the target's divss/divps instructions are substantially
14; slower than rcpss/rcpps with a Newton-Raphson refinement,
15; we should generate the estimate sequence.
16
17; See PR21385 ( http://llvm.org/bugs/show_bug.cgi?id=21385 )
18; for details about the accuracy, speed, and implementation
19; differences of x86 reciprocal estimates.
20
; Scalar 1.0/x with reciprocal estimates disabled — presumably attribute #0
; turns the recip estimate off (attribute defs are not in this chunk; confirm
; at file end). Expect a real divss/vdivss, no rcpss sequence.
define float @f32_no_estimate(float %x) #0 {
; SSE-LABEL: f32_no_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE-NEXT:    divss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: f32_no_estimate:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT:    vdivss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %div = fdiv fast float 1.0, %x
  ret float %div
}
37
; Scalar 1.0/x expected to lower to rcpss plus one Newton-Raphson refinement
; step (mul/sub/mul/add, or two FMAs on FMA-capable targets) — presumably
; attribute #1 requests one refinement iteration; attribute defs not visible
; in this chunk, confirm at file end.
define float @f32_one_step(float %x) #1 {
; SSE-LABEL: f32_one_step:
; SSE:       # %bb.0:
; SSE-NEXT:    rcpss %xmm0, %xmm2
; SSE-NEXT:    mulss %xmm2, %xmm0
; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE-NEXT:    subss %xmm0, %xmm1
; SSE-NEXT:    mulss %xmm2, %xmm1
; SSE-NEXT:    addss %xmm2, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-RECIP-LABEL: f32_one_step:
; AVX-RECIP:       # %bb.0:
; AVX-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
; AVX-RECIP-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-RECIP-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; AVX-RECIP-NEXT:    vsubss %xmm0, %xmm2, %xmm0
; AVX-RECIP-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT:    retq
;
; FMA-RECIP-LABEL: f32_one_step:
; FMA-RECIP:       # %bb.0:
; FMA-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
; FMA-RECIP-NEXT:    vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - mem
; FMA-RECIP-NEXT:    vfnmadd132ss {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1
; FMA-RECIP-NEXT:    retq
;
; BDVER2-LABEL: f32_one_step:
; BDVER2:       # %bb.0:
; BDVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
; BDVER2-NEXT:    vfmsubss {{.*#+}} xmm0 = (xmm0 * xmm1) - mem
; BDVER2-NEXT:    vfnmaddss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm1
; BDVER2-NEXT:    retq
;
; BTVER2-LABEL: f32_one_step:
; BTVER2:       # %bb.0:
; BTVER2-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; BTVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
; BTVER2-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; BTVER2-NEXT:    vsubss %xmm0, %xmm2, %xmm0
; BTVER2-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; BTVER2-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; BTVER2-NEXT:    retq
;
; SANDY-LABEL: f32_one_step:
; SANDY:       # %bb.0:
; SANDY-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
; SANDY-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; SANDY-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SANDY-NEXT:    vsubss %xmm0, %xmm2, %xmm0
; SANDY-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; SANDY-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; SANDY-NEXT:    retq
;
; HASWELL-LABEL: f32_one_step:
; HASWELL:       # %bb.0:
; HASWELL-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
; HASWELL-NEXT:    vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - mem
; HASWELL-NEXT:    vfnmadd132ss {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1
; HASWELL-NEXT:    retq
;
; HASWELL-NO-FMA-LABEL: f32_one_step:
; HASWELL-NO-FMA:       # %bb.0:
; HASWELL-NO-FMA-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
; HASWELL-NO-FMA-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; HASWELL-NO-FMA-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; HASWELL-NO-FMA-NEXT:    vsubss %xmm0, %xmm2, %xmm0
; HASWELL-NO-FMA-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; HASWELL-NO-FMA-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; HASWELL-NO-FMA-NEXT:    retq
;
; AVX512-LABEL: f32_one_step:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
; AVX512-NEXT:    vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - mem
; AVX512-NEXT:    vfnmadd132ss {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1
; AVX512-NEXT:    retq
  %div = fdiv fast float 1.0, %x
  ret float %div
}
120
; General x/y division (variable numerator, not 1.0): the one-step estimate
; sequence must fold the numerator into the refinement (x*e, then correct),
; rather than computing 1/y and multiplying afterwards.
define float @f32_one_step_variables(float %x, float %y) #1 {
; SSE-LABEL: f32_one_step_variables:
; SSE:       # %bb.0:
; SSE-NEXT:    rcpss %xmm1, %xmm2
; SSE-NEXT:    movaps %xmm0, %xmm3
; SSE-NEXT:    mulss %xmm2, %xmm3
; SSE-NEXT:    mulss %xmm3, %xmm1
; SSE-NEXT:    subss %xmm1, %xmm0
; SSE-NEXT:    mulss %xmm2, %xmm0
; SSE-NEXT:    addss %xmm3, %xmm0
; SSE-NEXT:    retq
;
; AVX-RECIP-LABEL: f32_one_step_variables:
; AVX-RECIP:       # %bb.0:
; AVX-RECIP-NEXT:    vrcpss %xmm1, %xmm1, %xmm2
; AVX-RECIP-NEXT:    vmulss %xmm2, %xmm0, %xmm3
; AVX-RECIP-NEXT:    vmulss %xmm3, %xmm1, %xmm1
; AVX-RECIP-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; AVX-RECIP-NEXT:    vmulss %xmm0, %xmm2, %xmm0
; AVX-RECIP-NEXT:    vaddss %xmm0, %xmm3, %xmm0
; AVX-RECIP-NEXT:    retq
;
; FMA-RECIP-LABEL: f32_one_step_variables:
; FMA-RECIP:       # %bb.0:
; FMA-RECIP-NEXT:    vrcpss %xmm1, %xmm1, %xmm2
; FMA-RECIP-NEXT:    vmulss %xmm2, %xmm0, %xmm3
; FMA-RECIP-NEXT:    vfmsub231ss {{.*#+}} xmm0 = (xmm3 * xmm1) - xmm0
; FMA-RECIP-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3
; FMA-RECIP-NEXT:    retq
;
; BDVER2-LABEL: f32_one_step_variables:
; BDVER2:       # %bb.0:
; BDVER2-NEXT:    vrcpss %xmm1, %xmm1, %xmm2
; BDVER2-NEXT:    vmulss %xmm2, %xmm0, %xmm3
; BDVER2-NEXT:    vfmsubss {{.*#+}} xmm0 = (xmm1 * xmm3) - xmm0
; BDVER2-NEXT:    vfnmaddss {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3
; BDVER2-NEXT:    retq
;
; BTVER2-LABEL: f32_one_step_variables:
; BTVER2:       # %bb.0:
; BTVER2-NEXT:    vrcpss %xmm1, %xmm1, %xmm2
; BTVER2-NEXT:    vmulss %xmm2, %xmm0, %xmm3
; BTVER2-NEXT:    vmulss %xmm3, %xmm1, %xmm1
; BTVER2-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; BTVER2-NEXT:    vmulss %xmm0, %xmm2, %xmm0
; BTVER2-NEXT:    vaddss %xmm0, %xmm3, %xmm0
; BTVER2-NEXT:    retq
;
; SANDY-LABEL: f32_one_step_variables:
; SANDY:       # %bb.0:
; SANDY-NEXT:    vrcpss %xmm1, %xmm1, %xmm2
; SANDY-NEXT:    vmulss %xmm2, %xmm0, %xmm3
; SANDY-NEXT:    vmulss %xmm3, %xmm1, %xmm1
; SANDY-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; SANDY-NEXT:    vmulss %xmm0, %xmm2, %xmm0
; SANDY-NEXT:    vaddss %xmm0, %xmm3, %xmm0
; SANDY-NEXT:    retq
;
; HASWELL-LABEL: f32_one_step_variables:
; HASWELL:       # %bb.0:
; HASWELL-NEXT:    vrcpss %xmm1, %xmm1, %xmm2
; HASWELL-NEXT:    vmulss %xmm2, %xmm0, %xmm3
; HASWELL-NEXT:    vfmsub231ss {{.*#+}} xmm0 = (xmm3 * xmm1) - xmm0
; HASWELL-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3
; HASWELL-NEXT:    retq
;
; HASWELL-NO-FMA-LABEL: f32_one_step_variables:
; HASWELL-NO-FMA:       # %bb.0:
; HASWELL-NO-FMA-NEXT:    vrcpss %xmm1, %xmm1, %xmm2
; HASWELL-NO-FMA-NEXT:    vmulss %xmm2, %xmm0, %xmm3
; HASWELL-NO-FMA-NEXT:    vmulss %xmm3, %xmm1, %xmm1
; HASWELL-NO-FMA-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; HASWELL-NO-FMA-NEXT:    vmulss %xmm0, %xmm2, %xmm0
; HASWELL-NO-FMA-NEXT:    vaddss %xmm0, %xmm3, %xmm0
; HASWELL-NO-FMA-NEXT:    retq
;
; AVX512-LABEL: f32_one_step_variables:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrcpss %xmm1, %xmm1, %xmm2
; AVX512-NEXT:    vmulss %xmm2, %xmm0, %xmm3
; AVX512-NEXT:    vfmsub231ss {{.*#+}} xmm0 = (xmm3 * xmm1) - xmm0
; AVX512-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3
; AVX512-NEXT:    retq
  %div = fdiv fast float %x, %y
  ret float %div
}
207
; Scalar 1.0/x expected to lower to rcpss plus two Newton-Raphson refinement
; steps — presumably attribute #2 requests two iterations (attribute defs not
; visible in this chunk; confirm at file end).
define float @f32_two_step(float %x) #2 {
; SSE-LABEL: f32_two_step:
; SSE:       # %bb.0:
; SSE-NEXT:    rcpss %xmm0, %xmm2
; SSE-NEXT:    movaps %xmm0, %xmm3
; SSE-NEXT:    mulss %xmm2, %xmm3
; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE-NEXT:    movaps %xmm1, %xmm4
; SSE-NEXT:    subss %xmm3, %xmm4
; SSE-NEXT:    mulss %xmm2, %xmm4
; SSE-NEXT:    addss %xmm2, %xmm4
; SSE-NEXT:    mulss %xmm4, %xmm0
; SSE-NEXT:    subss %xmm0, %xmm1
; SSE-NEXT:    mulss %xmm4, %xmm1
; SSE-NEXT:    addss %xmm4, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-RECIP-LABEL: f32_two_step:
; AVX-RECIP:       # %bb.0:
; AVX-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
; AVX-RECIP-NEXT:    vmulss %xmm1, %xmm0, %xmm2
; AVX-RECIP-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; AVX-RECIP-NEXT:    vsubss %xmm2, %xmm3, %xmm2
; AVX-RECIP-NEXT:    vmulss %xmm2, %xmm1, %xmm2
; AVX-RECIP-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX-RECIP-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-RECIP-NEXT:    vsubss %xmm0, %xmm3, %xmm0
; AVX-RECIP-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT:    retq
;
; FMA-RECIP-LABEL: f32_two_step:
; FMA-RECIP:       # %bb.0:
; FMA-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
; FMA-RECIP-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; FMA-RECIP-NEXT:    vmovaps %xmm1, %xmm3
; FMA-RECIP-NEXT:    vfmsub213ss {{.*#+}} xmm3 = (xmm0 * xmm3) - xmm2
; FMA-RECIP-NEXT:    vfnmadd132ss {{.*#+}} xmm3 = -(xmm3 * xmm1) + xmm1
; FMA-RECIP-NEXT:    vfmsub213ss {{.*#+}} xmm0 = (xmm3 * xmm0) - xmm2
; FMA-RECIP-NEXT:    vfnmadd132ss {{.*#+}} xmm0 = -(xmm0 * xmm3) + xmm3
; FMA-RECIP-NEXT:    retq
;
; BDVER2-LABEL: f32_two_step:
; BDVER2:       # %bb.0:
; BDVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
; BDVER2-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; BDVER2-NEXT:    vfmsubss {{.*#+}} xmm3 = (xmm0 * xmm1) - xmm2
; BDVER2-NEXT:    vfnmaddss {{.*#+}} xmm1 = -(xmm1 * xmm3) + xmm1
; BDVER2-NEXT:    vfmsubss {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm2
; BDVER2-NEXT:    vfnmaddss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm1
; BDVER2-NEXT:    retq
;
; BTVER2-LABEL: f32_two_step:
; BTVER2:       # %bb.0:
; BTVER2-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; BTVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
; BTVER2-NEXT:    vmulss %xmm1, %xmm0, %xmm2
; BTVER2-NEXT:    vsubss %xmm2, %xmm3, %xmm2
; BTVER2-NEXT:    vmulss %xmm2, %xmm1, %xmm2
; BTVER2-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; BTVER2-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; BTVER2-NEXT:    vsubss %xmm0, %xmm3, %xmm0
; BTVER2-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; BTVER2-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; BTVER2-NEXT:    retq
;
; SANDY-LABEL: f32_two_step:
; SANDY:       # %bb.0:
; SANDY-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
; SANDY-NEXT:    vmulss %xmm1, %xmm0, %xmm2
; SANDY-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SANDY-NEXT:    vsubss %xmm2, %xmm3, %xmm2
; SANDY-NEXT:    vmulss %xmm2, %xmm1, %xmm2
; SANDY-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; SANDY-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; SANDY-NEXT:    vsubss %xmm0, %xmm3, %xmm0
; SANDY-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; SANDY-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; SANDY-NEXT:    retq
;
; HASWELL-LABEL: f32_two_step:
; HASWELL:       # %bb.0:
; HASWELL-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
; HASWELL-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; HASWELL-NEXT:    vmovaps %xmm1, %xmm3
; HASWELL-NEXT:    vfmsub213ss {{.*#+}} xmm3 = (xmm0 * xmm3) - xmm2
; HASWELL-NEXT:    vfnmadd132ss {{.*#+}} xmm3 = -(xmm3 * xmm1) + xmm1
; HASWELL-NEXT:    vfmsub213ss {{.*#+}} xmm0 = (xmm3 * xmm0) - xmm2
; HASWELL-NEXT:    vfnmadd132ss {{.*#+}} xmm0 = -(xmm0 * xmm3) + xmm3
; HASWELL-NEXT:    retq
;
; HASWELL-NO-FMA-LABEL: f32_two_step:
; HASWELL-NO-FMA:       # %bb.0:
; HASWELL-NO-FMA-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
; HASWELL-NO-FMA-NEXT:    vmulss %xmm1, %xmm0, %xmm2
; HASWELL-NO-FMA-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; HASWELL-NO-FMA-NEXT:    vsubss %xmm2, %xmm3, %xmm2
; HASWELL-NO-FMA-NEXT:    vmulss %xmm2, %xmm1, %xmm2
; HASWELL-NO-FMA-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; HASWELL-NO-FMA-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; HASWELL-NO-FMA-NEXT:    vsubss %xmm0, %xmm3, %xmm0
; HASWELL-NO-FMA-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; HASWELL-NO-FMA-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; HASWELL-NO-FMA-NEXT:    retq
;
; AVX512-LABEL: f32_two_step:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
; AVX512-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; AVX512-NEXT:    vmovaps %xmm1, %xmm3
; AVX512-NEXT:    vfmsub213ss {{.*#+}} xmm3 = (xmm0 * xmm3) - xmm2
; AVX512-NEXT:    vfnmadd132ss {{.*#+}} xmm3 = -(xmm3 * xmm1) + xmm1
; AVX512-NEXT:    vfmsub213ss {{.*#+}} xmm0 = (xmm3 * xmm0) - xmm2
; AVX512-NEXT:    vfnmadd132ss {{.*#+}} xmm0 = -(xmm0 * xmm3) + xmm3
; AVX512-NEXT:    retq
  %div = fdiv fast float 1.0, %x
  ret float %div
}
327
; Vector <4 x float> 1.0/x with estimates disabled (attribute #0, presumably —
; defs not visible in this chunk): expect a real divps, no rcpps sequence.
define <4 x float> @v4f32_no_estimate(<4 x float> %x) #0 {
; SSE-LABEL: v4f32_no_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SSE-NEXT:    divps %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-RECIP-LABEL: v4f32_no_estimate:
; AVX-RECIP:       # %bb.0:
; AVX-RECIP-NEXT:    vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX-RECIP-NEXT:    vdivps %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT:    retq
;
; FMA-RECIP-LABEL: v4f32_no_estimate:
; FMA-RECIP:       # %bb.0:
; FMA-RECIP-NEXT:    vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA-RECIP-NEXT:    vdivps %xmm0, %xmm1, %xmm0
; FMA-RECIP-NEXT:    retq
;
; BDVER2-LABEL: v4f32_no_estimate:
; BDVER2:       # %bb.0:
; BDVER2-NEXT:    vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BDVER2-NEXT:    vdivps %xmm0, %xmm1, %xmm0
; BDVER2-NEXT:    retq
;
; BTVER2-LABEL: v4f32_no_estimate:
; BTVER2:       # %bb.0:
; BTVER2-NEXT:    vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT:    vdivps %xmm0, %xmm1, %xmm0
; BTVER2-NEXT:    retq
;
; SANDY-LABEL: v4f32_no_estimate:
; SANDY:       # %bb.0:
; SANDY-NEXT:    vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SANDY-NEXT:    vdivps %xmm0, %xmm1, %xmm0
; SANDY-NEXT:    retq
;
; HASWELL-LABEL: v4f32_no_estimate:
; HASWELL:       # %bb.0:
; HASWELL-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; HASWELL-NEXT:    vdivps %xmm0, %xmm1, %xmm0
; HASWELL-NEXT:    retq
;
; HASWELL-NO-FMA-LABEL: v4f32_no_estimate:
; HASWELL-NO-FMA:       # %bb.0:
; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; HASWELL-NO-FMA-NEXT:    vdivps %xmm0, %xmm1, %xmm0
; HASWELL-NO-FMA-NEXT:    retq
;
; AVX512-LABEL: v4f32_no_estimate:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX512-NEXT:    vdivps %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    retq
  %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
  ret <4 x float> %div
}
386
; Vector <4 x float> 1.0/x with one refinement step: rcpps + one
; Newton-Raphson iteration. Note the AVX512 prefix splits into KNL and SKX
; here — the two CPUs produce different instruction sequences for this case.
define <4 x float> @v4f32_one_step(<4 x float> %x) #1 {
; SSE-LABEL: v4f32_one_step:
; SSE:       # %bb.0:
; SSE-NEXT:    rcpps %xmm0, %xmm2
; SSE-NEXT:    mulps %xmm2, %xmm0
; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SSE-NEXT:    subps %xmm0, %xmm1
; SSE-NEXT:    mulps %xmm2, %xmm1
; SSE-NEXT:    addps %xmm2, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-RECIP-LABEL: v4f32_one_step:
; AVX-RECIP:       # %bb.0:
; AVX-RECIP-NEXT:    vrcpps %xmm0, %xmm1
; AVX-RECIP-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX-RECIP-NEXT:    vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX-RECIP-NEXT:    vsubps %xmm0, %xmm2, %xmm0
; AVX-RECIP-NEXT:    vmulps %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT:    retq
;
; FMA-RECIP-LABEL: v4f32_one_step:
; FMA-RECIP:       # %bb.0:
; FMA-RECIP-NEXT:    vrcpps %xmm0, %xmm1
; FMA-RECIP-NEXT:    vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - mem
; FMA-RECIP-NEXT:    vfnmadd132ps {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1
; FMA-RECIP-NEXT:    retq
;
; BDVER2-LABEL: v4f32_one_step:
; BDVER2:       # %bb.0:
; BDVER2-NEXT:    vrcpps %xmm0, %xmm1
; BDVER2-NEXT:    vfmsubps {{.*#+}} xmm0 = (xmm0 * xmm1) - mem
; BDVER2-NEXT:    vfnmaddps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm1
; BDVER2-NEXT:    retq
;
; BTVER2-LABEL: v4f32_one_step:
; BTVER2:       # %bb.0:
; BTVER2-NEXT:    vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT:    vrcpps %xmm0, %xmm1
; BTVER2-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; BTVER2-NEXT:    vsubps %xmm0, %xmm2, %xmm0
; BTVER2-NEXT:    vmulps %xmm0, %xmm1, %xmm0
; BTVER2-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; BTVER2-NEXT:    retq
;
; SANDY-LABEL: v4f32_one_step:
; SANDY:       # %bb.0:
; SANDY-NEXT:    vrcpps %xmm0, %xmm1
; SANDY-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; SANDY-NEXT:    vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SANDY-NEXT:    vsubps %xmm0, %xmm2, %xmm0
; SANDY-NEXT:    vmulps %xmm0, %xmm1, %xmm0
; SANDY-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; SANDY-NEXT:    retq
;
; HASWELL-LABEL: v4f32_one_step:
; HASWELL:       # %bb.0:
; HASWELL-NEXT:    vrcpps %xmm0, %xmm2
; HASWELL-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; HASWELL-NEXT:    vfmsub231ps {{.*#+}} xmm1 = (xmm2 * xmm0) - xmm1
; HASWELL-NEXT:    vfnmadd132ps {{.*#+}} xmm1 = -(xmm1 * xmm2) + xmm2
; HASWELL-NEXT:    vmovaps %xmm1, %xmm0
; HASWELL-NEXT:    retq
;
; HASWELL-NO-FMA-LABEL: v4f32_one_step:
; HASWELL-NO-FMA:       # %bb.0:
; HASWELL-NO-FMA-NEXT:    vrcpps %xmm0, %xmm1
; HASWELL-NO-FMA-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; HASWELL-NO-FMA-NEXT:    vsubps %xmm0, %xmm2, %xmm0
; HASWELL-NO-FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0
; HASWELL-NO-FMA-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; HASWELL-NO-FMA-NEXT:    retq
;
; KNL-LABEL: v4f32_one_step:
; KNL:       # %bb.0:
; KNL-NEXT:    vrcpps %xmm0, %xmm2
; KNL-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; KNL-NEXT:    vfmsub231ps {{.*#+}} xmm1 = (xmm2 * xmm0) - xmm1
; KNL-NEXT:    vfnmadd132ps {{.*#+}} xmm1 = -(xmm1 * xmm2) + xmm2
; KNL-NEXT:    vmovaps %xmm1, %xmm0
; KNL-NEXT:    retq
;
; SKX-LABEL: v4f32_one_step:
; SKX:       # %bb.0:
; SKX-NEXT:    vrcpps %xmm0, %xmm1
; SKX-NEXT:    vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - mem
; SKX-NEXT:    vfnmadd132ps {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1
; SKX-NEXT:    retq
  %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
  ret <4 x float> %div
}
480
; Vector x/y with variable numerator and one refinement step: the numerator
; is folded into the estimate/refinement sequence (x*e, then correct).
define <4 x float> @v4f32_one_step_variables(<4 x float> %x, <4 x float> %y) #1 {
; SSE-LABEL: v4f32_one_step_variables:
; SSE:       # %bb.0:
; SSE-NEXT:    rcpps %xmm1, %xmm2
; SSE-NEXT:    movaps %xmm0, %xmm3
; SSE-NEXT:    mulps %xmm2, %xmm3
; SSE-NEXT:    mulps %xmm3, %xmm1
; SSE-NEXT:    subps %xmm1, %xmm0
; SSE-NEXT:    mulps %xmm2, %xmm0
; SSE-NEXT:    addps %xmm3, %xmm0
; SSE-NEXT:    retq
;
; AVX-RECIP-LABEL: v4f32_one_step_variables:
; AVX-RECIP:       # %bb.0:
; AVX-RECIP-NEXT:    vrcpps %xmm1, %xmm2
; AVX-RECIP-NEXT:    vmulps %xmm2, %xmm0, %xmm3
; AVX-RECIP-NEXT:    vmulps %xmm3, %xmm1, %xmm1
; AVX-RECIP-NEXT:    vsubps %xmm1, %xmm0, %xmm0
; AVX-RECIP-NEXT:    vmulps %xmm0, %xmm2, %xmm0
; AVX-RECIP-NEXT:    vaddps %xmm0, %xmm3, %xmm0
; AVX-RECIP-NEXT:    retq
;
; FMA-RECIP-LABEL: v4f32_one_step_variables:
; FMA-RECIP:       # %bb.0:
; FMA-RECIP-NEXT:    vrcpps %xmm1, %xmm2
; FMA-RECIP-NEXT:    vmulps %xmm2, %xmm0, %xmm3
; FMA-RECIP-NEXT:    vfmsub231ps {{.*#+}} xmm0 = (xmm3 * xmm1) - xmm0
; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3
; FMA-RECIP-NEXT:    retq
;
; BDVER2-LABEL: v4f32_one_step_variables:
; BDVER2:       # %bb.0:
; BDVER2-NEXT:    vrcpps %xmm1, %xmm2
; BDVER2-NEXT:    vmulps %xmm2, %xmm0, %xmm3
; BDVER2-NEXT:    vfmsubps {{.*#+}} xmm0 = (xmm1 * xmm3) - xmm0
; BDVER2-NEXT:    vfnmaddps {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3
; BDVER2-NEXT:    retq
;
; BTVER2-LABEL: v4f32_one_step_variables:
; BTVER2:       # %bb.0:
; BTVER2-NEXT:    vrcpps %xmm1, %xmm2
; BTVER2-NEXT:    vmulps %xmm2, %xmm0, %xmm3
; BTVER2-NEXT:    vmulps %xmm3, %xmm1, %xmm1
; BTVER2-NEXT:    vsubps %xmm1, %xmm0, %xmm0
; BTVER2-NEXT:    vmulps %xmm0, %xmm2, %xmm0
; BTVER2-NEXT:    vaddps %xmm0, %xmm3, %xmm0
; BTVER2-NEXT:    retq
;
; SANDY-LABEL: v4f32_one_step_variables:
; SANDY:       # %bb.0:
; SANDY-NEXT:    vrcpps %xmm1, %xmm2
; SANDY-NEXT:    vmulps %xmm2, %xmm0, %xmm3
; SANDY-NEXT:    vmulps %xmm3, %xmm1, %xmm1
; SANDY-NEXT:    vsubps %xmm1, %xmm0, %xmm0
; SANDY-NEXT:    vmulps %xmm0, %xmm2, %xmm0
; SANDY-NEXT:    vaddps %xmm0, %xmm3, %xmm0
; SANDY-NEXT:    retq
;
; HASWELL-LABEL: v4f32_one_step_variables:
; HASWELL:       # %bb.0:
; HASWELL-NEXT:    vrcpps %xmm1, %xmm2
; HASWELL-NEXT:    vmulps %xmm2, %xmm0, %xmm3
; HASWELL-NEXT:    vfmsub231ps {{.*#+}} xmm0 = (xmm3 * xmm1) - xmm0
; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3
; HASWELL-NEXT:    retq
;
; HASWELL-NO-FMA-LABEL: v4f32_one_step_variables:
; HASWELL-NO-FMA:       # %bb.0:
; HASWELL-NO-FMA-NEXT:    vrcpps %xmm1, %xmm2
; HASWELL-NO-FMA-NEXT:    vmulps %xmm2, %xmm0, %xmm3
; HASWELL-NO-FMA-NEXT:    vmulps %xmm3, %xmm1, %xmm1
; HASWELL-NO-FMA-NEXT:    vsubps %xmm1, %xmm0, %xmm0
; HASWELL-NO-FMA-NEXT:    vmulps %xmm0, %xmm2, %xmm0
; HASWELL-NO-FMA-NEXT:    vaddps %xmm0, %xmm3, %xmm0
; HASWELL-NO-FMA-NEXT:    retq
;
; AVX512-LABEL: v4f32_one_step_variables:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrcpps %xmm1, %xmm2
; AVX512-NEXT:    vmulps %xmm2, %xmm0, %xmm3
; AVX512-NEXT:    vfmsub231ps {{.*#+}} xmm0 = (xmm3 * xmm1) - xmm0
; AVX512-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3
; AVX512-NEXT:    retq
  %div = fdiv fast <4 x float> %x, %y
  ret <4 x float> %div
}
567
; Vector <4 x float> 1.0/x with two refinement steps: rcpps followed by two
; Newton-Raphson iterations (attribute #2, presumably — defs not visible in
; this chunk).
define <4 x float> @v4f32_two_step(<4 x float> %x) #2 {
; SSE-LABEL: v4f32_two_step:
; SSE:       # %bb.0:
; SSE-NEXT:    rcpps %xmm0, %xmm2
; SSE-NEXT:    movaps %xmm0, %xmm3
; SSE-NEXT:    mulps %xmm2, %xmm3
; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SSE-NEXT:    movaps %xmm1, %xmm4
; SSE-NEXT:    subps %xmm3, %xmm4
; SSE-NEXT:    mulps %xmm2, %xmm4
; SSE-NEXT:    addps %xmm2, %xmm4
; SSE-NEXT:    mulps %xmm4, %xmm0
; SSE-NEXT:    subps %xmm0, %xmm1
; SSE-NEXT:    mulps %xmm4, %xmm1
; SSE-NEXT:    addps %xmm4, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-RECIP-LABEL: v4f32_two_step:
; AVX-RECIP:       # %bb.0:
; AVX-RECIP-NEXT:    vrcpps %xmm0, %xmm1
; AVX-RECIP-NEXT:    vmulps %xmm1, %xmm0, %xmm2
; AVX-RECIP-NEXT:    vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX-RECIP-NEXT:    vsubps %xmm2, %xmm3, %xmm2
; AVX-RECIP-NEXT:    vmulps %xmm2, %xmm1, %xmm2
; AVX-RECIP-NEXT:    vaddps %xmm2, %xmm1, %xmm1
; AVX-RECIP-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX-RECIP-NEXT:    vsubps %xmm0, %xmm3, %xmm0
; AVX-RECIP-NEXT:    vmulps %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT:    retq
;
; FMA-RECIP-LABEL: v4f32_two_step:
; FMA-RECIP:       # %bb.0:
; FMA-RECIP-NEXT:    vrcpps %xmm0, %xmm1
; FMA-RECIP-NEXT:    vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA-RECIP-NEXT:    vmovaps %xmm1, %xmm3
; FMA-RECIP-NEXT:    vfmsub213ps {{.*#+}} xmm3 = (xmm0 * xmm3) - xmm2
; FMA-RECIP-NEXT:    vfnmadd132ps {{.*#+}} xmm3 = -(xmm3 * xmm1) + xmm1
; FMA-RECIP-NEXT:    vfmsub213ps {{.*#+}} xmm0 = (xmm3 * xmm0) - xmm2
; FMA-RECIP-NEXT:    vfnmadd132ps {{.*#+}} xmm0 = -(xmm0 * xmm3) + xmm3
; FMA-RECIP-NEXT:    retq
;
; BDVER2-LABEL: v4f32_two_step:
; BDVER2:       # %bb.0:
; BDVER2-NEXT:    vrcpps %xmm0, %xmm1
; BDVER2-NEXT:    vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BDVER2-NEXT:    vfmsubps {{.*#+}} xmm3 = (xmm0 * xmm1) - xmm2
; BDVER2-NEXT:    vfnmaddps {{.*#+}} xmm1 = -(xmm1 * xmm3) + xmm1
; BDVER2-NEXT:    vfmsubps {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm2
; BDVER2-NEXT:    vfnmaddps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm1
; BDVER2-NEXT:    retq
;
; BTVER2-LABEL: v4f32_two_step:
; BTVER2:       # %bb.0:
; BTVER2-NEXT:    vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT:    vrcpps %xmm0, %xmm1
; BTVER2-NEXT:    vmulps %xmm1, %xmm0, %xmm2
; BTVER2-NEXT:    vsubps %xmm2, %xmm3, %xmm2
; BTVER2-NEXT:    vmulps %xmm2, %xmm1, %xmm2
; BTVER2-NEXT:    vaddps %xmm2, %xmm1, %xmm1
; BTVER2-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; BTVER2-NEXT:    vsubps %xmm0, %xmm3, %xmm0
; BTVER2-NEXT:    vmulps %xmm0, %xmm1, %xmm0
; BTVER2-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; BTVER2-NEXT:    retq
;
; SANDY-LABEL: v4f32_two_step:
; SANDY:       # %bb.0:
; SANDY-NEXT:    vrcpps %xmm0, %xmm1
; SANDY-NEXT:    vmulps %xmm1, %xmm0, %xmm2
; SANDY-NEXT:    vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SANDY-NEXT:    vsubps %xmm2, %xmm3, %xmm2
; SANDY-NEXT:    vmulps %xmm2, %xmm1, %xmm2
; SANDY-NEXT:    vaddps %xmm2, %xmm1, %xmm1
; SANDY-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; SANDY-NEXT:    vsubps %xmm0, %xmm3, %xmm0
; SANDY-NEXT:    vmulps %xmm0, %xmm1, %xmm0
; SANDY-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; SANDY-NEXT:    retq
;
; HASWELL-LABEL: v4f32_two_step:
; HASWELL:       # %bb.0:
; HASWELL-NEXT:    vrcpps %xmm0, %xmm1
; HASWELL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; HASWELL-NEXT:    vmovaps %xmm1, %xmm3
; HASWELL-NEXT:    vfmsub213ps {{.*#+}} xmm3 = (xmm0 * xmm3) - xmm2
; HASWELL-NEXT:    vfnmadd132ps {{.*#+}} xmm3 = -(xmm3 * xmm1) + xmm1
; HASWELL-NEXT:    vfmsub213ps {{.*#+}} xmm0 = (xmm3 * xmm0) - xmm2
; HASWELL-NEXT:    vfnmadd132ps {{.*#+}} xmm0 = -(xmm0 * xmm3) + xmm3
; HASWELL-NEXT:    retq
;
; HASWELL-NO-FMA-LABEL: v4f32_two_step:
; HASWELL-NO-FMA:       # %bb.0:
; HASWELL-NO-FMA-NEXT:    vrcpps %xmm0, %xmm1
; HASWELL-NO-FMA-NEXT:    vmulps %xmm1, %xmm0, %xmm2
; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; HASWELL-NO-FMA-NEXT:    vsubps %xmm2, %xmm3, %xmm2
; HASWELL-NO-FMA-NEXT:    vmulps %xmm2, %xmm1, %xmm2
; HASWELL-NO-FMA-NEXT:    vaddps %xmm2, %xmm1, %xmm1
; HASWELL-NO-FMA-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; HASWELL-NO-FMA-NEXT:    vsubps %xmm0, %xmm3, %xmm0
; HASWELL-NO-FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0
; HASWELL-NO-FMA-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; HASWELL-NO-FMA-NEXT:    retq
;
; AVX512-LABEL: v4f32_two_step:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrcpps %xmm0, %xmm1
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX512-NEXT:    vmovaps %xmm1, %xmm3
; AVX512-NEXT:    vfmsub213ps {{.*#+}} xmm3 = (xmm0 * xmm3) - xmm2
; AVX512-NEXT:    vfnmadd132ps {{.*#+}} xmm3 = -(xmm3 * xmm1) + xmm1
; AVX512-NEXT:    vfmsub213ps {{.*#+}} xmm0 = (xmm3 * xmm0) - xmm2
; AVX512-NEXT:    vfnmadd132ps {{.*#+}} xmm0 = -(xmm0 * xmm3) + xmm3
; AVX512-NEXT:    retq
  %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
  ret <4 x float> %div
}
687
; 256-bit <8 x float> 1.0/x with estimates disabled: SSE splits into two
; 128-bit divps; AVX targets use a single ymm vdivps.
define <8 x float> @v8f32_no_estimate(<8 x float> %x) #0 {
; SSE-LABEL: v8f32_no_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SSE-NEXT:    movaps %xmm2, %xmm3
; SSE-NEXT:    divps %xmm0, %xmm3
; SSE-NEXT:    divps %xmm1, %xmm2
; SSE-NEXT:    movaps %xmm3, %xmm0
; SSE-NEXT:    movaps %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX-RECIP-LABEL: v8f32_no_estimate:
; AVX-RECIP:       # %bb.0:
; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX-RECIP-NEXT:    vdivps %ymm0, %ymm1, %ymm0
; AVX-RECIP-NEXT:    retq
;
; FMA-RECIP-LABEL: v8f32_no_estimate:
; FMA-RECIP:       # %bb.0:
; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA-RECIP-NEXT:    vdivps %ymm0, %ymm1, %ymm0
; FMA-RECIP-NEXT:    retq
;
; BDVER2-LABEL: v8f32_no_estimate:
; BDVER2:       # %bb.0:
; BDVER2-NEXT:    vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BDVER2-NEXT:    vdivps %ymm0, %ymm1, %ymm0
; BDVER2-NEXT:    retq
;
; BTVER2-LABEL: v8f32_no_estimate:
; BTVER2:       # %bb.0:
; BTVER2-NEXT:    vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT:    vdivps %ymm0, %ymm1, %ymm0
; BTVER2-NEXT:    retq
;
; SANDY-LABEL: v8f32_no_estimate:
; SANDY:       # %bb.0:
; SANDY-NEXT:    vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SANDY-NEXT:    vdivps %ymm0, %ymm1, %ymm0
; SANDY-NEXT:    retq
;
; HASWELL-LABEL: v8f32_no_estimate:
; HASWELL:       # %bb.0:
; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; HASWELL-NEXT:    vdivps %ymm0, %ymm1, %ymm0
; HASWELL-NEXT:    retq
;
; HASWELL-NO-FMA-LABEL: v8f32_no_estimate:
; HASWELL-NO-FMA:       # %bb.0:
; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; HASWELL-NO-FMA-NEXT:    vdivps %ymm0, %ymm1, %ymm0
; HASWELL-NO-FMA-NEXT:    retq
;
; AVX512-LABEL: v8f32_no_estimate:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX512-NEXT:    vdivps %ymm0, %ymm1, %ymm0
; AVX512-NEXT:    retq
  %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
  ret <8 x float> %div
}
749
; Reciprocal with one Newton-Raphson refinement step (attribute #1):
; est = rcpps(x); result = est + est*(1.0 - x*est). FMA targets fuse the
; mul/sub and mul/add pairs into vfmsub/vfnmadd.
define <8 x float> @v8f32_one_step(<8 x float> %x) #1 {
; SSE-LABEL: v8f32_one_step:
; SSE:       # %bb.0:
; SSE-NEXT:    rcpps %xmm0, %xmm4
; SSE-NEXT:    mulps %xmm4, %xmm0
; SSE-NEXT:    movaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SSE-NEXT:    movaps %xmm2, %xmm3
; SSE-NEXT:    subps %xmm0, %xmm3
; SSE-NEXT:    mulps %xmm4, %xmm3
; SSE-NEXT:    addps %xmm4, %xmm3
; SSE-NEXT:    rcpps %xmm1, %xmm0
; SSE-NEXT:    mulps %xmm0, %xmm1
; SSE-NEXT:    subps %xmm1, %xmm2
; SSE-NEXT:    mulps %xmm0, %xmm2
; SSE-NEXT:    addps %xmm0, %xmm2
; SSE-NEXT:    movaps %xmm3, %xmm0
; SSE-NEXT:    movaps %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX-RECIP-LABEL: v8f32_one_step:
; AVX-RECIP:       # %bb.0:
; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm1
; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX-RECIP-NEXT:    vsubps %ymm0, %ymm2, %ymm0
; AVX-RECIP-NEXT:    vmulps %ymm0, %ymm1, %ymm0
; AVX-RECIP-NEXT:    vaddps %ymm0, %ymm1, %ymm0
; AVX-RECIP-NEXT:    retq
;
; FMA-RECIP-LABEL: v8f32_one_step:
; FMA-RECIP:       # %bb.0:
; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm1
; FMA-RECIP-NEXT:    vfmsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) - mem
; FMA-RECIP-NEXT:    vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm1) + ymm1
; FMA-RECIP-NEXT:    retq
;
; BDVER2-LABEL: v8f32_one_step:
; BDVER2:       # %bb.0:
; BDVER2-NEXT:    vrcpps %ymm0, %ymm1
; BDVER2-NEXT:    vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm1) - mem
; BDVER2-NEXT:    vfnmaddps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm1
; BDVER2-NEXT:    retq
;
; BTVER2-LABEL: v8f32_one_step:
; BTVER2:       # %bb.0:
; BTVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT:    vrcpps %ymm0, %ymm1
; BTVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; BTVER2-NEXT:    vsubps %ymm0, %ymm2, %ymm0
; BTVER2-NEXT:    vmulps %ymm0, %ymm1, %ymm0
; BTVER2-NEXT:    vaddps %ymm0, %ymm1, %ymm0
; BTVER2-NEXT:    retq
;
; SANDY-LABEL: v8f32_one_step:
; SANDY:       # %bb.0:
; SANDY-NEXT:    vrcpps %ymm0, %ymm1
; SANDY-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; SANDY-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SANDY-NEXT:    vsubps %ymm0, %ymm2, %ymm0
; SANDY-NEXT:    vmulps %ymm0, %ymm1, %ymm0
; SANDY-NEXT:    vaddps %ymm0, %ymm1, %ymm0
; SANDY-NEXT:    retq
;
; HASWELL-LABEL: v8f32_one_step:
; HASWELL:       # %bb.0:
; HASWELL-NEXT:    vrcpps %ymm0, %ymm2
; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; HASWELL-NEXT:    vfmsub231ps {{.*#+}} ymm1 = (ymm2 * ymm0) - ymm1
; HASWELL-NEXT:    vfnmadd132ps {{.*#+}} ymm1 = -(ymm1 * ymm2) + ymm2
; HASWELL-NEXT:    vmovaps %ymm1, %ymm0
; HASWELL-NEXT:    retq
;
; HASWELL-NO-FMA-LABEL: v8f32_one_step:
; HASWELL-NO-FMA:       # %bb.0:
; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm1
; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm2, %ymm0
; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm1, %ymm0
; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm1, %ymm0
; HASWELL-NO-FMA-NEXT:    retq
;
; KNL-LABEL: v8f32_one_step:
; KNL:       # %bb.0:
; KNL-NEXT:    vrcpps %ymm0, %ymm2
; KNL-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; KNL-NEXT:    vfmsub231ps {{.*#+}} ymm1 = (ymm2 * ymm0) - ymm1
; KNL-NEXT:    vfnmadd132ps {{.*#+}} ymm1 = -(ymm1 * ymm2) + ymm2
; KNL-NEXT:    vmovaps %ymm1, %ymm0
; KNL-NEXT:    retq
;
; SKX-LABEL: v8f32_one_step:
; SKX:       # %bb.0:
; SKX-NEXT:    vrcpps %ymm0, %ymm1
; SKX-NEXT:    vfmsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) - mem
; SKX-NEXT:    vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm1) + ymm1
; SKX-NEXT:    retq
  %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
  ret <8 x float> %div
}
850
; Reciprocal with two Newton-Raphson refinement steps (attribute #2):
; the one-step pattern is applied twice, reusing the 1.0 splat constant.
define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
; SSE-LABEL: v8f32_two_step:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm1, %xmm2
; SSE-NEXT:    rcpps %xmm0, %xmm3
; SSE-NEXT:    movaps %xmm0, %xmm4
; SSE-NEXT:    mulps %xmm3, %xmm4
; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SSE-NEXT:    movaps %xmm1, %xmm5
; SSE-NEXT:    subps %xmm4, %xmm5
; SSE-NEXT:    mulps %xmm3, %xmm5
; SSE-NEXT:    addps %xmm3, %xmm5
; SSE-NEXT:    mulps %xmm5, %xmm0
; SSE-NEXT:    movaps %xmm1, %xmm3
; SSE-NEXT:    subps %xmm0, %xmm3
; SSE-NEXT:    mulps %xmm5, %xmm3
; SSE-NEXT:    addps %xmm5, %xmm3
; SSE-NEXT:    rcpps %xmm2, %xmm0
; SSE-NEXT:    movaps %xmm2, %xmm4
; SSE-NEXT:    mulps %xmm0, %xmm4
; SSE-NEXT:    movaps %xmm1, %xmm5
; SSE-NEXT:    subps %xmm4, %xmm5
; SSE-NEXT:    mulps %xmm0, %xmm5
; SSE-NEXT:    addps %xmm0, %xmm5
; SSE-NEXT:    mulps %xmm5, %xmm2
; SSE-NEXT:    subps %xmm2, %xmm1
; SSE-NEXT:    mulps %xmm5, %xmm1
; SSE-NEXT:    addps %xmm5, %xmm1
; SSE-NEXT:    movaps %xmm3, %xmm0
; SSE-NEXT:    retq
;
; AVX-RECIP-LABEL: v8f32_two_step:
; AVX-RECIP:       # %bb.0:
; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm1
; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm0, %ymm2
; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX-RECIP-NEXT:    vsubps %ymm2, %ymm3, %ymm2
; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm1, %ymm2
; AVX-RECIP-NEXT:    vaddps %ymm2, %ymm1, %ymm1
; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; AVX-RECIP-NEXT:    vsubps %ymm0, %ymm3, %ymm0
; AVX-RECIP-NEXT:    vmulps %ymm0, %ymm1, %ymm0
; AVX-RECIP-NEXT:    vaddps %ymm0, %ymm1, %ymm0
; AVX-RECIP-NEXT:    retq
;
; FMA-RECIP-LABEL: v8f32_two_step:
; FMA-RECIP:       # %bb.0:
; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm1
; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA-RECIP-NEXT:    vmovaps %ymm1, %ymm3
; FMA-RECIP-NEXT:    vfmsub213ps {{.*#+}} ymm3 = (ymm0 * ymm3) - ymm2
; FMA-RECIP-NEXT:    vfnmadd132ps {{.*#+}} ymm3 = -(ymm3 * ymm1) + ymm1
; FMA-RECIP-NEXT:    vfmsub213ps {{.*#+}} ymm0 = (ymm3 * ymm0) - ymm2
; FMA-RECIP-NEXT:    vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm3) + ymm3
; FMA-RECIP-NEXT:    retq
;
; BDVER2-LABEL: v8f32_two_step:
; BDVER2:       # %bb.0:
; BDVER2-NEXT:    vrcpps %ymm0, %ymm1
; BDVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BDVER2-NEXT:    vfmsubps {{.*#+}} ymm3 = (ymm0 * ymm1) - ymm2
; BDVER2-NEXT:    vfnmaddps {{.*#+}} ymm1 = -(ymm1 * ymm3) + ymm1
; BDVER2-NEXT:    vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm1) - ymm2
; BDVER2-NEXT:    vfnmaddps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm1
; BDVER2-NEXT:    retq
;
; BTVER2-LABEL: v8f32_two_step:
; BTVER2:       # %bb.0:
; BTVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT:    vrcpps %ymm0, %ymm1
; BTVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm2
; BTVER2-NEXT:    vsubps %ymm2, %ymm3, %ymm2
; BTVER2-NEXT:    vmulps %ymm2, %ymm1, %ymm2
; BTVER2-NEXT:    vaddps %ymm2, %ymm1, %ymm1
; BTVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; BTVER2-NEXT:    vsubps %ymm0, %ymm3, %ymm0
; BTVER2-NEXT:    vmulps %ymm0, %ymm1, %ymm0
; BTVER2-NEXT:    vaddps %ymm0, %ymm1, %ymm0
; BTVER2-NEXT:    retq
;
; SANDY-LABEL: v8f32_two_step:
; SANDY:       # %bb.0:
; SANDY-NEXT:    vrcpps %ymm0, %ymm1
; SANDY-NEXT:    vmulps %ymm1, %ymm0, %ymm2
; SANDY-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SANDY-NEXT:    vsubps %ymm2, %ymm3, %ymm2
; SANDY-NEXT:    vmulps %ymm2, %ymm1, %ymm2
; SANDY-NEXT:    vaddps %ymm2, %ymm1, %ymm1
; SANDY-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; SANDY-NEXT:    vsubps %ymm0, %ymm3, %ymm0
; SANDY-NEXT:    vmulps %ymm0, %ymm1, %ymm0
; SANDY-NEXT:    vaddps %ymm0, %ymm1, %ymm0
; SANDY-NEXT:    retq
;
; HASWELL-LABEL: v8f32_two_step:
; HASWELL:       # %bb.0:
; HASWELL-NEXT:    vrcpps %ymm0, %ymm1
; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; HASWELL-NEXT:    vmovaps %ymm1, %ymm3
; HASWELL-NEXT:    vfmsub213ps {{.*#+}} ymm3 = (ymm0 * ymm3) - ymm2
; HASWELL-NEXT:    vfnmadd132ps {{.*#+}} ymm3 = -(ymm3 * ymm1) + ymm1
; HASWELL-NEXT:    vfmsub213ps {{.*#+}} ymm0 = (ymm3 * ymm0) - ymm2
; HASWELL-NEXT:    vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm3) + ymm3
; HASWELL-NEXT:    retq
;
; HASWELL-NO-FMA-LABEL: v8f32_two_step:
; HASWELL-NO-FMA:       # %bb.0:
; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm1
; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm2
; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; HASWELL-NO-FMA-NEXT:    vsubps %ymm2, %ymm3, %ymm2
; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm1, %ymm2
; HASWELL-NO-FMA-NEXT:    vaddps %ymm2, %ymm1, %ymm1
; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm3, %ymm0
; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm1, %ymm0
; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm1, %ymm0
; HASWELL-NO-FMA-NEXT:    retq
;
; AVX512-LABEL: v8f32_two_step:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrcpps %ymm0, %ymm1
; AVX512-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX512-NEXT:    vmovaps %ymm1, %ymm3
; AVX512-NEXT:    vfmsub213ps {{.*#+}} ymm3 = (ymm0 * ymm3) - ymm2
; AVX512-NEXT:    vfnmadd132ps {{.*#+}} ymm3 = -(ymm3 * ymm1) + ymm1
; AVX512-NEXT:    vfmsub213ps {{.*#+}} ymm0 = (ymm3 * ymm0) - ymm2
; AVX512-NEXT:    vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm3) + ymm3
; AVX512-NEXT:    retq
  %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
  ret <8 x float> %div
}
983
; Estimates disabled (attribute #0): codegen must use real division.
; 512-bit case splits into two ymm divides pre-AVX512; AVX512 uses one zmm.
define <16 x float> @v16f32_no_estimate(<16 x float> %x) #0 {
; SSE-LABEL: v16f32_no_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SSE-NEXT:    movaps %xmm4, %xmm5
; SSE-NEXT:    divps %xmm0, %xmm5
; SSE-NEXT:    movaps %xmm4, %xmm6
; SSE-NEXT:    divps %xmm1, %xmm6
; SSE-NEXT:    movaps %xmm4, %xmm7
; SSE-NEXT:    divps %xmm2, %xmm7
; SSE-NEXT:    divps %xmm3, %xmm4
; SSE-NEXT:    movaps %xmm5, %xmm0
; SSE-NEXT:    movaps %xmm6, %xmm1
; SSE-NEXT:    movaps %xmm7, %xmm2
; SSE-NEXT:    movaps %xmm4, %xmm3
; SSE-NEXT:    retq
;
; AVX-RECIP-LABEL: v16f32_no_estimate:
; AVX-RECIP:       # %bb.0:
; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX-RECIP-NEXT:    vdivps %ymm0, %ymm2, %ymm0
; AVX-RECIP-NEXT:    vdivps %ymm1, %ymm2, %ymm1
; AVX-RECIP-NEXT:    retq
;
; FMA-RECIP-LABEL: v16f32_no_estimate:
; FMA-RECIP:       # %bb.0:
; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA-RECIP-NEXT:    vdivps %ymm0, %ymm2, %ymm0
; FMA-RECIP-NEXT:    vdivps %ymm1, %ymm2, %ymm1
; FMA-RECIP-NEXT:    retq
;
; BDVER2-LABEL: v16f32_no_estimate:
; BDVER2:       # %bb.0:
; BDVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BDVER2-NEXT:    vdivps %ymm0, %ymm2, %ymm0
; BDVER2-NEXT:    vdivps %ymm1, %ymm2, %ymm1
; BDVER2-NEXT:    retq
;
; BTVER2-LABEL: v16f32_no_estimate:
; BTVER2:       # %bb.0:
; BTVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT:    vdivps %ymm0, %ymm2, %ymm0
; BTVER2-NEXT:    vdivps %ymm1, %ymm2, %ymm1
; BTVER2-NEXT:    retq
;
; SANDY-LABEL: v16f32_no_estimate:
; SANDY:       # %bb.0:
; SANDY-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SANDY-NEXT:    vdivps %ymm0, %ymm2, %ymm0
; SANDY-NEXT:    vdivps %ymm1, %ymm2, %ymm1
; SANDY-NEXT:    retq
;
; HASWELL-LABEL: v16f32_no_estimate:
; HASWELL:       # %bb.0:
; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; HASWELL-NEXT:    vdivps %ymm0, %ymm2, %ymm0
; HASWELL-NEXT:    vdivps %ymm1, %ymm2, %ymm1
; HASWELL-NEXT:    retq
;
; HASWELL-NO-FMA-LABEL: v16f32_no_estimate:
; HASWELL-NO-FMA:       # %bb.0:
; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; HASWELL-NO-FMA-NEXT:    vdivps %ymm0, %ymm2, %ymm0
; HASWELL-NO-FMA-NEXT:    vdivps %ymm1, %ymm2, %ymm1
; HASWELL-NO-FMA-NEXT:    retq
;
; AVX512-LABEL: v16f32_no_estimate:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vbroadcastss {{.*#+}} zmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX512-NEXT:    vdivps %zmm0, %zmm1, %zmm0
; AVX512-NEXT:    retq
  %div = fdiv fast <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
  ret <16 x float> %div
}
1058
; 512-bit reciprocal with one refinement step (attribute #1): pre-AVX512
; targets run the one-step sequence per 128/256-bit half; AVX512 uses
; vrcp14ps on the full zmm register.
define <16 x float> @v16f32_one_step(<16 x float> %x) #1 {
; SSE-LABEL: v16f32_one_step:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm3, %xmm4
; SSE-NEXT:    movaps %xmm0, %xmm5
; SSE-NEXT:    rcpps %xmm0, %xmm6
; SSE-NEXT:    mulps %xmm6, %xmm5
; SSE-NEXT:    movaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SSE-NEXT:    movaps %xmm3, %xmm0
; SSE-NEXT:    subps %xmm5, %xmm0
; SSE-NEXT:    mulps %xmm6, %xmm0
; SSE-NEXT:    addps %xmm6, %xmm0
; SSE-NEXT:    rcpps %xmm1, %xmm6
; SSE-NEXT:    mulps %xmm6, %xmm1
; SSE-NEXT:    movaps %xmm3, %xmm5
; SSE-NEXT:    subps %xmm1, %xmm5
; SSE-NEXT:    mulps %xmm6, %xmm5
; SSE-NEXT:    addps %xmm6, %xmm5
; SSE-NEXT:    rcpps %xmm2, %xmm1
; SSE-NEXT:    mulps %xmm1, %xmm2
; SSE-NEXT:    movaps %xmm3, %xmm6
; SSE-NEXT:    subps %xmm2, %xmm6
; SSE-NEXT:    mulps %xmm1, %xmm6
; SSE-NEXT:    addps %xmm1, %xmm6
; SSE-NEXT:    rcpps %xmm4, %xmm1
; SSE-NEXT:    mulps %xmm1, %xmm4
; SSE-NEXT:    subps %xmm4, %xmm3
; SSE-NEXT:    mulps %xmm1, %xmm3
; SSE-NEXT:    addps %xmm1, %xmm3
; SSE-NEXT:    movaps %xmm5, %xmm1
; SSE-NEXT:    movaps %xmm6, %xmm2
; SSE-NEXT:    retq
;
; AVX-RECIP-LABEL: v16f32_one_step:
; AVX-RECIP:       # %bb.0:
; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm2
; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm0, %ymm0
; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX-RECIP-NEXT:    vsubps %ymm0, %ymm3, %ymm0
; AVX-RECIP-NEXT:    vmulps %ymm0, %ymm2, %ymm0
; AVX-RECIP-NEXT:    vaddps %ymm0, %ymm2, %ymm0
; AVX-RECIP-NEXT:    vrcpps %ymm1, %ymm2
; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm1, %ymm1
; AVX-RECIP-NEXT:    vsubps %ymm1, %ymm3, %ymm1
; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm2, %ymm1
; AVX-RECIP-NEXT:    vaddps %ymm1, %ymm2, %ymm1
; AVX-RECIP-NEXT:    retq
;
; FMA-RECIP-LABEL: v16f32_one_step:
; FMA-RECIP:       # %bb.0:
; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm2
; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA-RECIP-NEXT:    vfmsub213ps {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm3
; FMA-RECIP-NEXT:    vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm2) + ymm2
; FMA-RECIP-NEXT:    vrcpps %ymm1, %ymm2
; FMA-RECIP-NEXT:    vfmsub213ps {{.*#+}} ymm1 = (ymm2 * ymm1) - ymm3
; FMA-RECIP-NEXT:    vfnmadd132ps {{.*#+}} ymm1 = -(ymm1 * ymm2) + ymm2
; FMA-RECIP-NEXT:    retq
;
; BDVER2-LABEL: v16f32_one_step:
; BDVER2:       # %bb.0:
; BDVER2-NEXT:    vrcpps %ymm0, %ymm2
; BDVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BDVER2-NEXT:    vrcpps %ymm1, %ymm4
; BDVER2-NEXT:    vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm2) - ymm3
; BDVER2-NEXT:    vfmsubps {{.*#+}} ymm1 = (ymm1 * ymm4) - ymm3
; BDVER2-NEXT:    vfnmaddps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm2
; BDVER2-NEXT:    vfnmaddps {{.*#+}} ymm1 = -(ymm4 * ymm1) + ymm4
; BDVER2-NEXT:    retq
;
; BTVER2-LABEL: v16f32_one_step:
; BTVER2:       # %bb.0:
; BTVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT:    vrcpps %ymm0, %ymm2
; BTVER2-NEXT:    vrcpps %ymm1, %ymm4
; BTVER2-NEXT:    vmulps %ymm2, %ymm0, %ymm0
; BTVER2-NEXT:    vmulps %ymm4, %ymm1, %ymm1
; BTVER2-NEXT:    vsubps %ymm0, %ymm3, %ymm0
; BTVER2-NEXT:    vsubps %ymm1, %ymm3, %ymm1
; BTVER2-NEXT:    vmulps %ymm0, %ymm2, %ymm0
; BTVER2-NEXT:    vmulps %ymm1, %ymm4, %ymm1
; BTVER2-NEXT:    vaddps %ymm0, %ymm2, %ymm0
; BTVER2-NEXT:    vaddps %ymm1, %ymm4, %ymm1
; BTVER2-NEXT:    retq
;
; SANDY-LABEL: v16f32_one_step:
; SANDY:       # %bb.0:
; SANDY-NEXT:    vrcpps %ymm0, %ymm2
; SANDY-NEXT:    vmulps %ymm2, %ymm0, %ymm0
; SANDY-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SANDY-NEXT:    vsubps %ymm0, %ymm3, %ymm0
; SANDY-NEXT:    vmulps %ymm0, %ymm2, %ymm0
; SANDY-NEXT:    vaddps %ymm0, %ymm2, %ymm0
; SANDY-NEXT:    vrcpps %ymm1, %ymm2
; SANDY-NEXT:    vmulps %ymm2, %ymm1, %ymm1
; SANDY-NEXT:    vsubps %ymm1, %ymm3, %ymm1
; SANDY-NEXT:    vmulps %ymm1, %ymm2, %ymm1
; SANDY-NEXT:    vaddps %ymm1, %ymm2, %ymm1
; SANDY-NEXT:    retq
;
; HASWELL-LABEL: v16f32_one_step:
; HASWELL:       # %bb.0:
; HASWELL-NEXT:    vrcpps %ymm0, %ymm2
; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; HASWELL-NEXT:    vrcpps %ymm1, %ymm4
; HASWELL-NEXT:    vfmsub213ps {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm3
; HASWELL-NEXT:    vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm2) + ymm2
; HASWELL-NEXT:    vfmsub213ps {{.*#+}} ymm1 = (ymm4 * ymm1) - ymm3
; HASWELL-NEXT:    vfnmadd132ps {{.*#+}} ymm1 = -(ymm1 * ymm4) + ymm4
; HASWELL-NEXT:    retq
;
; HASWELL-NO-FMA-LABEL: v16f32_one_step:
; HASWELL-NO-FMA:       # %bb.0:
; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm2
; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm0, %ymm0
; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm3, %ymm0
; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm2, %ymm0
; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm2, %ymm0
; HASWELL-NO-FMA-NEXT:    vrcpps %ymm1, %ymm2
; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm1, %ymm1
; HASWELL-NO-FMA-NEXT:    vsubps %ymm1, %ymm3, %ymm1
; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm2, %ymm1
; HASWELL-NO-FMA-NEXT:    vaddps %ymm1, %ymm2, %ymm1
; HASWELL-NO-FMA-NEXT:    retq
;
; AVX512-LABEL: v16f32_one_step:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrcp14ps %zmm0, %zmm1
; AVX512-NEXT:    vfmsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) - mem
; AVX512-NEXT:    vfnmadd132ps {{.*#+}} zmm0 = -(zmm0 * zmm1) + zmm1
; AVX512-NEXT:    retq
  %div = fdiv fast <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
  ret <16 x float> %div
}
1194
; 512-bit reciprocal with two refinement steps (attribute #2): the
; one-step pattern is applied twice per register half; AVX512 keeps the
; whole computation in zmm registers with vrcp14ps.
define <16 x float> @v16f32_two_step(<16 x float> %x) #2 {
; SSE-LABEL: v16f32_two_step:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm3, %xmm4
; SSE-NEXT:    movaps %xmm1, %xmm5
; SSE-NEXT:    movaps %xmm0, %xmm1
; SSE-NEXT:    rcpps %xmm0, %xmm0
; SSE-NEXT:    movaps %xmm1, %xmm6
; SSE-NEXT:    mulps %xmm0, %xmm6
; SSE-NEXT:    movaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SSE-NEXT:    movaps %xmm3, %xmm7
; SSE-NEXT:    subps %xmm6, %xmm7
; SSE-NEXT:    mulps %xmm0, %xmm7
; SSE-NEXT:    addps %xmm0, %xmm7
; SSE-NEXT:    mulps %xmm7, %xmm1
; SSE-NEXT:    movaps %xmm3, %xmm0
; SSE-NEXT:    subps %xmm1, %xmm0
; SSE-NEXT:    mulps %xmm7, %xmm0
; SSE-NEXT:    addps %xmm7, %xmm0
; SSE-NEXT:    rcpps %xmm5, %xmm1
; SSE-NEXT:    movaps %xmm5, %xmm6
; SSE-NEXT:    mulps %xmm1, %xmm6
; SSE-NEXT:    movaps %xmm3, %xmm7
; SSE-NEXT:    subps %xmm6, %xmm7
; SSE-NEXT:    mulps %xmm1, %xmm7
; SSE-NEXT:    addps %xmm1, %xmm7
; SSE-NEXT:    mulps %xmm7, %xmm5
; SSE-NEXT:    movaps %xmm3, %xmm1
; SSE-NEXT:    subps %xmm5, %xmm1
; SSE-NEXT:    mulps %xmm7, %xmm1
; SSE-NEXT:    addps %xmm7, %xmm1
; SSE-NEXT:    rcpps %xmm2, %xmm5
; SSE-NEXT:    movaps %xmm2, %xmm6
; SSE-NEXT:    mulps %xmm5, %xmm6
; SSE-NEXT:    movaps %xmm3, %xmm7
; SSE-NEXT:    subps %xmm6, %xmm7
; SSE-NEXT:    mulps %xmm5, %xmm7
; SSE-NEXT:    addps %xmm5, %xmm7
; SSE-NEXT:    mulps %xmm7, %xmm2
; SSE-NEXT:    movaps %xmm3, %xmm5
; SSE-NEXT:    subps %xmm2, %xmm5
; SSE-NEXT:    mulps %xmm7, %xmm5
; SSE-NEXT:    addps %xmm7, %xmm5
; SSE-NEXT:    rcpps %xmm4, %xmm2
; SSE-NEXT:    movaps %xmm4, %xmm6
; SSE-NEXT:    mulps %xmm2, %xmm6
; SSE-NEXT:    movaps %xmm3, %xmm7
; SSE-NEXT:    subps %xmm6, %xmm7
; SSE-NEXT:    mulps %xmm2, %xmm7
; SSE-NEXT:    addps %xmm2, %xmm7
; SSE-NEXT:    mulps %xmm7, %xmm4
; SSE-NEXT:    subps %xmm4, %xmm3
; SSE-NEXT:    mulps %xmm7, %xmm3
; SSE-NEXT:    addps %xmm7, %xmm3
; SSE-NEXT:    movaps %xmm5, %xmm2
; SSE-NEXT:    retq
;
; AVX-RECIP-LABEL: v16f32_two_step:
; AVX-RECIP:       # %bb.0:
; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm2
; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm0, %ymm3
; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX-RECIP-NEXT:    vsubps %ymm3, %ymm4, %ymm3
; AVX-RECIP-NEXT:    vmulps %ymm3, %ymm2, %ymm3
; AVX-RECIP-NEXT:    vaddps %ymm3, %ymm2, %ymm2
; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm0, %ymm0
; AVX-RECIP-NEXT:    vsubps %ymm0, %ymm4, %ymm0
; AVX-RECIP-NEXT:    vmulps %ymm0, %ymm2, %ymm0
; AVX-RECIP-NEXT:    vaddps %ymm0, %ymm2, %ymm0
; AVX-RECIP-NEXT:    vrcpps %ymm1, %ymm2
; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm1, %ymm3
; AVX-RECIP-NEXT:    vsubps %ymm3, %ymm4, %ymm3
; AVX-RECIP-NEXT:    vmulps %ymm3, %ymm2, %ymm3
; AVX-RECIP-NEXT:    vaddps %ymm3, %ymm2, %ymm2
; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm1, %ymm1
; AVX-RECIP-NEXT:    vsubps %ymm1, %ymm4, %ymm1
; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm2, %ymm1
; AVX-RECIP-NEXT:    vaddps %ymm1, %ymm2, %ymm1
; AVX-RECIP-NEXT:    retq
;
; FMA-RECIP-LABEL: v16f32_two_step:
; FMA-RECIP:       # %bb.0:
; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm2
; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA-RECIP-NEXT:    vmovaps %ymm2, %ymm4
; FMA-RECIP-NEXT:    vfmsub213ps {{.*#+}} ymm4 = (ymm0 * ymm4) - ymm3
; FMA-RECIP-NEXT:    vfnmadd132ps {{.*#+}} ymm4 = -(ymm4 * ymm2) + ymm2
; FMA-RECIP-NEXT:    vfmsub213ps {{.*#+}} ymm0 = (ymm4 * ymm0) - ymm3
; FMA-RECIP-NEXT:    vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm4) + ymm4
; FMA-RECIP-NEXT:    vrcpps %ymm1, %ymm2
; FMA-RECIP-NEXT:    vmovaps %ymm2, %ymm4
; FMA-RECIP-NEXT:    vfmsub213ps {{.*#+}} ymm4 = (ymm1 * ymm4) - ymm3
; FMA-RECIP-NEXT:    vfnmadd132ps {{.*#+}} ymm4 = -(ymm4 * ymm2) + ymm2
; FMA-RECIP-NEXT:    vfmsub213ps {{.*#+}} ymm1 = (ymm4 * ymm1) - ymm3
; FMA-RECIP-NEXT:    vfnmadd132ps {{.*#+}} ymm1 = -(ymm1 * ymm4) + ymm4
; FMA-RECIP-NEXT:    retq
;
; BDVER2-LABEL: v16f32_two_step:
; BDVER2:       # %bb.0:
; BDVER2-NEXT:    vrcpps %ymm0, %ymm2
; BDVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BDVER2-NEXT:    vfmsubps {{.*#+}} ymm4 = (ymm0 * ymm2) - ymm3
; BDVER2-NEXT:    vfnmaddps {{.*#+}} ymm2 = -(ymm2 * ymm4) + ymm2
; BDVER2-NEXT:    vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm2) - ymm3
; BDVER2-NEXT:    vfnmaddps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm2
; BDVER2-NEXT:    vrcpps %ymm1, %ymm2
; BDVER2-NEXT:    vfmsubps {{.*#+}} ymm4 = (ymm1 * ymm2) - ymm3
; BDVER2-NEXT:    vfnmaddps {{.*#+}} ymm2 = -(ymm2 * ymm4) + ymm2
; BDVER2-NEXT:    vfmsubps {{.*#+}} ymm1 = (ymm1 * ymm2) - ymm3
; BDVER2-NEXT:    vfnmaddps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm2
; BDVER2-NEXT:    retq
;
; BTVER2-LABEL: v16f32_two_step:
; BTVER2:       # %bb.0:
; BTVER2-NEXT:    vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT:    vrcpps %ymm0, %ymm2
; BTVER2-NEXT:    vmulps %ymm2, %ymm0, %ymm3
; BTVER2-NEXT:    vsubps %ymm3, %ymm4, %ymm3
; BTVER2-NEXT:    vmulps %ymm3, %ymm2, %ymm3
; BTVER2-NEXT:    vaddps %ymm3, %ymm2, %ymm2
; BTVER2-NEXT:    vmulps %ymm2, %ymm0, %ymm0
; BTVER2-NEXT:    vsubps %ymm0, %ymm4, %ymm0
; BTVER2-NEXT:    vmulps %ymm0, %ymm2, %ymm0
; BTVER2-NEXT:    vaddps %ymm0, %ymm2, %ymm0
; BTVER2-NEXT:    vrcpps %ymm1, %ymm2
; BTVER2-NEXT:    vmulps %ymm2, %ymm1, %ymm3
; BTVER2-NEXT:    vsubps %ymm3, %ymm4, %ymm3
; BTVER2-NEXT:    vmulps %ymm3, %ymm2, %ymm3
; BTVER2-NEXT:    vaddps %ymm3, %ymm2, %ymm2
; BTVER2-NEXT:    vmulps %ymm2, %ymm1, %ymm1
; BTVER2-NEXT:    vsubps %ymm1, %ymm4, %ymm1
; BTVER2-NEXT:    vmulps %ymm1, %ymm2, %ymm1
; BTVER2-NEXT:    vaddps %ymm1, %ymm2, %ymm1
; BTVER2-NEXT:    retq
;
; SANDY-LABEL: v16f32_two_step:
; SANDY:       # %bb.0:
; SANDY-NEXT:    vrcpps %ymm0, %ymm2
; SANDY-NEXT:    vmulps %ymm2, %ymm0, %ymm3
; SANDY-NEXT:    vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SANDY-NEXT:    vsubps %ymm3, %ymm4, %ymm3
; SANDY-NEXT:    vmulps %ymm3, %ymm2, %ymm3
; SANDY-NEXT:    vaddps %ymm3, %ymm2, %ymm2
; SANDY-NEXT:    vmulps %ymm2, %ymm0, %ymm0
; SANDY-NEXT:    vsubps %ymm0, %ymm4, %ymm0
; SANDY-NEXT:    vmulps %ymm0, %ymm2, %ymm0
; SANDY-NEXT:    vaddps %ymm0, %ymm2, %ymm0
; SANDY-NEXT:    vrcpps %ymm1, %ymm2
; SANDY-NEXT:    vmulps %ymm2, %ymm1, %ymm3
; SANDY-NEXT:    vsubps %ymm3, %ymm4, %ymm3
; SANDY-NEXT:    vmulps %ymm3, %ymm2, %ymm3
; SANDY-NEXT:    vaddps %ymm3, %ymm2, %ymm2
; SANDY-NEXT:    vmulps %ymm2, %ymm1, %ymm1
; SANDY-NEXT:    vsubps %ymm1, %ymm4, %ymm1
; SANDY-NEXT:    vmulps %ymm1, %ymm2, %ymm1
; SANDY-NEXT:    vaddps %ymm1, %ymm2, %ymm1
; SANDY-NEXT:    retq
;
; HASWELL-LABEL: v16f32_two_step:
; HASWELL:       # %bb.0:
; HASWELL-NEXT:    vrcpps %ymm0, %ymm2
; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; HASWELL-NEXT:    vmovaps %ymm2, %ymm4
; HASWELL-NEXT:    vfmsub213ps {{.*#+}} ymm4 = (ymm0 * ymm4) - ymm3
; HASWELL-NEXT:    vfnmadd132ps {{.*#+}} ymm4 = -(ymm4 * ymm2) + ymm2
; HASWELL-NEXT:    vfmsub213ps {{.*#+}} ymm0 = (ymm4 * ymm0) - ymm3
; HASWELL-NEXT:    vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm4) + ymm4
; HASWELL-NEXT:    vrcpps %ymm1, %ymm2
; HASWELL-NEXT:    vmovaps %ymm2, %ymm4
; HASWELL-NEXT:    vfmsub213ps {{.*#+}} ymm4 = (ymm1 * ymm4) - ymm3
; HASWELL-NEXT:    vfnmadd132ps {{.*#+}} ymm4 = -(ymm4 * ymm2) + ymm2
; HASWELL-NEXT:    vfmsub213ps {{.*#+}} ymm1 = (ymm4 * ymm1) - ymm3
; HASWELL-NEXT:    vfnmadd132ps {{.*#+}} ymm1 = -(ymm1 * ymm4) + ymm4
; HASWELL-NEXT:    retq
;
; HASWELL-NO-FMA-LABEL: v16f32_two_step:
; HASWELL-NO-FMA:       # %bb.0:
; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm2
; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm0, %ymm3
; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; HASWELL-NO-FMA-NEXT:    vsubps %ymm3, %ymm4, %ymm3
; HASWELL-NO-FMA-NEXT:    vmulps %ymm3, %ymm2, %ymm3
; HASWELL-NO-FMA-NEXT:    vaddps %ymm3, %ymm2, %ymm2
; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm0, %ymm0
; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm4, %ymm0
; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm2, %ymm0
; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm2, %ymm0
; HASWELL-NO-FMA-NEXT:    vrcpps %ymm1, %ymm2
; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm1, %ymm3
; HASWELL-NO-FMA-NEXT:    vsubps %ymm3, %ymm4, %ymm3
; HASWELL-NO-FMA-NEXT:    vmulps %ymm3, %ymm2, %ymm3
; HASWELL-NO-FMA-NEXT:    vaddps %ymm3, %ymm2, %ymm2
; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm1, %ymm1
; HASWELL-NO-FMA-NEXT:    vsubps %ymm1, %ymm4, %ymm1
; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm2, %ymm1
; HASWELL-NO-FMA-NEXT:    vaddps %ymm1, %ymm2, %ymm1
; HASWELL-NO-FMA-NEXT:    retq
;
; AVX512-LABEL: v16f32_two_step:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrcp14ps %zmm0, %zmm1
; AVX512-NEXT:    vbroadcastss {{.*#+}} zmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX512-NEXT:    vmovaps %zmm1, %zmm3
; AVX512-NEXT:    vfmsub213ps {{.*#+}} zmm3 = (zmm0 * zmm3) - zmm2
; AVX512-NEXT:    vfnmadd132ps {{.*#+}} zmm3 = -(zmm3 * zmm1) + zmm1
; AVX512-NEXT:    vfmsub213ps {{.*#+}} zmm0 = (zmm3 * zmm0) - zmm2
; AVX512-NEXT:    vfnmadd132ps {{.*#+}} zmm0 = -(zmm0 * zmm3) + zmm3
; AVX512-NEXT:    retq
  %div = fdiv fast <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
  ret <16 x float> %div
}
1406
; #0: estimates explicitly disabled ("!divf,!vec-divf") -> real division.
; #1: estimates enabled with the default refinement-step count.
; #2: estimates enabled with two refinement steps ("divf:2,vec-divf:2").
attributes #0 = { "unsafe-fp-math"="true" "reciprocal-estimates"="!divf,!vec-divf" }
attributes #1 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf,vec-divf" }
attributes #2 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf:2,vec-divf:2" }
1410
1411