1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-- -mattr=avx2,fma | FileCheck %s --check-prefixes=CHECK,X64
3; RUN: llc < %s -mtriple=i686-- -mattr=avx2,fma | FileCheck %s --check-prefixes=CHECK,X86
4
; fneg of v4f32 + extract lane 0 scalarizes to an XOR with the sign-bit mask;
; i686 returns the float via the x87 stack (store to stack slot, flds).
define float @fneg_v4f32(<4 x float> %x) nounwind {
; X64-LABEL: fneg_v4f32:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; X64-NEXT:    vxorps %xmm1, %xmm0, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: fneg_v4f32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %eax
; X86-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; X86-NEXT:    vxorps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    retl
  %v = fneg <4 x float> %x
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}
25
; fneg of v4f64 + extract lane 0: only the low xmm half of the ymm input is
; negated; i686 needs an 8-byte-aligned slot (frame setup) to return via x87.
define double @fneg_v4f64(<4 x double> %x) nounwind {
; X64-LABEL: fneg_v4f64:
; X64:       # %bb.0:
; X64-NEXT:    vmovddup {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0]
; X64-NEXT:    # xmm1 = mem[0,0]
; X64-NEXT:    vxorps %xmm1, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
;
; X86-LABEL: fneg_v4f64:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    vmovddup {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0]
; X86-NEXT:    # xmm1 = mem[0,0]
; X86-NEXT:    vxorps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovlps %xmm0, (%esp)
; X86-NEXT:    fldl (%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
  %v = fneg <4 x double> %x
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}
54
; Vector fadd + extract lane 0 narrows to a single scalar vaddss.
define float @fadd_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
; X64-LABEL: fadd_v4f32:
; X64:       # %bb.0:
; X64-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: fadd_v4f32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %eax
; X86-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    retl
  %v = fadd <4 x float> %x, %y
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}
73
; Vector fadd of v4f64 + extract lane 0 narrows to scalar vaddsd.
define double @fadd_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
; X64-LABEL: fadd_v4f64:
; X64:       # %bb.0:
; X64-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
;
; X86-LABEL: fadd_v4f64:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovsd %xmm0, (%esp)
; X86-NEXT:    fldl (%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
  %v = fadd <4 x double> %x, %y
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}
98
; Vector fsub + extract lane 0 narrows to scalar vsubss.
define float @fsub_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
; X64-LABEL: fsub_v4f32:
; X64:       # %bb.0:
; X64-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: fsub_v4f32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %eax
; X86-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    retl
  %v = fsub <4 x float> %x, %y
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}
117
; Vector fsub of v4f64 + extract lane 0 narrows to scalar vsubsd.
define double @fsub_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
; X64-LABEL: fsub_v4f64:
; X64:       # %bb.0:
; X64-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
;
; X86-LABEL: fsub_v4f64:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovsd %xmm0, (%esp)
; X86-NEXT:    fldl (%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
  %v = fsub <4 x double> %x, %y
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}
142
; Vector fmul + extract lane 0 narrows to scalar vmulss.
define float @fmul_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
; X64-LABEL: fmul_v4f32:
; X64:       # %bb.0:
; X64-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: fmul_v4f32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %eax
; X86-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    retl
  %v = fmul <4 x float> %x, %y
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}
161
; Vector fmul of v4f64 + extract lane 0 narrows to scalar vmulsd.
define double @fmul_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
; X64-LABEL: fmul_v4f64:
; X64:       # %bb.0:
; X64-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
;
; X86-LABEL: fmul_v4f64:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovsd %xmm0, (%esp)
; X86-NEXT:    fldl (%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
  %v = fmul <4 x double> %x, %y
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}
186
; Vector fdiv + extract lane 0 narrows to scalar vdivss (avoids 3 wasted divides).
define float @fdiv_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
; X64-LABEL: fdiv_v4f32:
; X64:       # %bb.0:
; X64-NEXT:    vdivss %xmm1, %xmm0, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: fdiv_v4f32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %eax
; X86-NEXT:    vdivss %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    retl
  %v = fdiv <4 x float> %x, %y
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}
205
; Vector fdiv of v4f64 + extract lane 0 narrows to scalar vdivsd.
define double @fdiv_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
; X64-LABEL: fdiv_v4f64:
; X64:       # %bb.0:
; X64-NEXT:    vdivsd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
;
; X86-LABEL: fdiv_v4f64:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    vdivsd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovsd %xmm0, (%esp)
; X86-NEXT:    fldl (%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
  %v = fdiv <4 x double> %x, %y
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}
230
; frem has no instruction: scalarized extract becomes a fmodf libcall
; (tail-called on x86-64, regular call with stack args on i686).
define float @frem_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
; X64-LABEL: frem_v4f32:
; X64:       # %bb.0:
; X64-NEXT:    jmp fmodf@PLT # TAILCALL
;
; X86-LABEL: frem_v4f32:
; X86:       # %bb.0:
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    vmovss %xmm1, {{[0-9]+}}(%esp)
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    calll fmodf
; X86-NEXT:    addl $8, %esp
; X86-NEXT:    retl
  %v = frem <4 x float> %x, %y
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}
248
; frem of v4f64 + extract lane 0 becomes a fmod libcall on the low elements only.
define double @frem_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
; X64-LABEL: frem_v4f64:
; X64:       # %bb.0:
; X64-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; X64-NEXT:    # kill: def $xmm1 killed $xmm1 killed $ymm1
; X64-NEXT:    vzeroupper
; X64-NEXT:    jmp fmod@PLT # TAILCALL
;
; X86-LABEL: frem_v4f64:
; X86:       # %bb.0:
; X86-NEXT:    subl $16, %esp
; X86-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X86-NEXT:    vmovups %xmm0, (%esp)
; X86-NEXT:    vzeroupper
; X86-NEXT:    calll fmod
; X86-NEXT:    addl $16, %esp
; X86-NEXT:    retl
  %v = frem <4 x double> %x, %y
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}
270
; Vector fcmp + extract lane 0 narrows to a scalar vucomiss + seta;
; codegen is identical on both targets, so a shared CHECK prefix is used.
define i1 @fcmp_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
; CHECK-LABEL: fcmp_v4f32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vucomiss %xmm1, %xmm0
; CHECK-NEXT:    seta %al
; CHECK-NEXT:    ret{{[l|q]}}
  %v = fcmp ogt <4 x float> %x, %y
  %r = extractelement <4 x i1> %v, i32 0
  ret i1 %r
}
281
; Unordered-greater-than compare + extract lane 0: operands are swapped so the
; predicate can be tested with setb on the commuted vucomisd.
define i1 @fcmp_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
; CHECK-LABEL: fcmp_v4f64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vucomisd %xmm0, %xmm1
; CHECK-NEXT:    setb %al
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %v = fcmp ugt <4 x double> %x, %y
  %r = extractelement <4 x i1> %v, i32 0
  ret i1 %r
}
293
294; If we do the fcmp transform late, make sure we have the right types.
295; https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=13700
296
; Regression test (oss-fuzz 13700): the late fcmp-narrowing transform must
; produce correctly-typed setcc results even when the i1 is stored.
define void @extsetcc(<4 x float> %x) {
; X64-LABEL: extsetcc:
; X64:       # %bb.0:
; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT:    vucomiss %xmm1, %xmm0
; X64-NEXT:    setb (%rax)
; X64-NEXT:    retq
;
; X86-LABEL: extsetcc:
; X86:       # %bb.0:
; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X86-NEXT:    vucomiss %xmm1, %xmm0
; X86-NEXT:    setb (%eax)
; X86-NEXT:    retl
  %cmp = fcmp ult <4 x float> %x, zeroinitializer
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %e = extractelement <4 x i1> %cmp, i1 0
  store i1 %e, i1* undef
  ret void
}
317
318; This used to crash by creating a setcc with an i64 condition on a 32-bit target.
; Regression test: must not create a setcc with an i64 condition type on a
; 32-bit target when extracting through a vselect of a vector compare.
define <3 x double> @extvselectsetcc_crash(<2 x double> %x) {
; X64-LABEL: extvselectsetcc_crash:
; X64:       # %bb.0:
; X64-NEXT:    vcmpeqpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; X64-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; X64-NEXT:    vandpd %xmm2, %xmm1, %xmm1
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X64-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3]
; X64-NEXT:    retq
;
; X86-LABEL: extvselectsetcc_crash:
; X86:       # %bb.0:
; X86-NEXT:    vcmpeqpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1
; X86-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; X86-NEXT:    vandpd %xmm2, %xmm1, %xmm1
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3]
; X86-NEXT:    retl
  %cmp = fcmp oeq <2 x double> %x, <double 5.0, double 5.0>
  %s = select <2 x i1> %cmp, <2 x double> <double 1.0, double undef>, <2 x double> <double 0.0, double undef>
  %r = shufflevector <2 x double> %s, <2 x double> %x, <3 x i32> <i32 0, i32 2, i32 3>
  ret <3 x double> %r
}
342
; fcmp one + select + extract lane 0 narrows to scalar cmpss/blendvps; on i686
; the 4th vector argument is passed on the (16-byte aligned) stack.
define float @select_fcmp_v4f32(<4 x float> %x, <4 x float> %y, <4 x float> %z, <4 x float> %w) nounwind {
; X64-LABEL: select_fcmp_v4f32:
; X64:       # %bb.0:
; X64-NEXT:    vcmpneq_oqss %xmm1, %xmm0, %xmm0
; X64-NEXT:    vblendvps %xmm0, %xmm2, %xmm3, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: select_fcmp_v4f32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-16, %esp
; X86-NEXT:    subl $16, %esp
; X86-NEXT:    vmovaps 8(%ebp), %xmm3
; X86-NEXT:    vcmpneq_oqss %xmm1, %xmm0, %xmm0
; X86-NEXT:    vblendvps %xmm0, %xmm2, %xmm3, %xmm0
; X86-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    flds {{[0-9]+}}(%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
  %c = fcmp one <4 x float> %x, %y
  %s = select <4 x i1> %c, <4 x float> %z, <4 x float> %w
  %r = extractelement <4 x float> %s, i32 0
  ret float %r
}
369
; fcmp ule + select + extract lane 0: the predicate is commuted to vcmpnltsd
; with swapped operands, then blended with the scalarized select arms.
define double @select_fcmp_v4f64(<4 x double> %x, <4 x double> %y, <4 x double> %z, <4 x double> %w) nounwind {
; X64-LABEL: select_fcmp_v4f64:
; X64:       # %bb.0:
; X64-NEXT:    vcmpnltsd %xmm0, %xmm1, %xmm0
; X64-NEXT:    vblendvpd %xmm0, %xmm2, %xmm3, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
;
; X86-LABEL: select_fcmp_v4f64:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-32, %esp
; X86-NEXT:    subl $32, %esp
; X86-NEXT:    vcmpnltsd %xmm0, %xmm1, %xmm0
; X86-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; X86-NEXT:    vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
; X86-NEXT:    vmovlpd %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    fldl {{[0-9]+}}(%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
  %c = fcmp ule <4 x double> %x, %y
  %s = select <4 x i1> %c, <4 x double> %z, <4 x double> %w
  %r = extractelement <4 x double> %s, i32 0
  ret double %r
}
398
; llvm.sqrt.v4f32 + extract lane 0 narrows to scalar vsqrtss.
define float @fsqrt_v4f32(<4 x float> %x) nounwind {
; X64-LABEL: fsqrt_v4f32:
; X64:       # %bb.0:
; X64-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: fsqrt_v4f32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %eax
; X86-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    retl
  %v = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x)
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}
417
; llvm.sqrt.v4f64 + extract lane 0 narrows to scalar vsqrtsd.
define double @fsqrt_v4f64(<4 x double> %x) nounwind {
; X64-LABEL: fsqrt_v4f64:
; X64:       # %bb.0:
; X64-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
;
; X86-LABEL: fsqrt_v4f64:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovsd %xmm0, (%esp)
; X86-NEXT:    fldl (%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
  %v = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %x)
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}
442
; llvm.sin has no instruction: the scalarized extract becomes a sinf libcall.
define float @fsin_v4f32(<4 x float> %x) nounwind {
; X64-LABEL: fsin_v4f32:
; X64:       # %bb.0:
; X64-NEXT:    jmp sinf@PLT # TAILCALL
;
; X86-LABEL: fsin_v4f32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %eax
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    calll sinf
; X86-NEXT:    popl %eax
; X86-NEXT:    retl
  %v = call <4 x float> @llvm.sin.v4f32(<4 x float> %x)
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}
459
; llvm.sin.v4f64 + extract lane 0 becomes a sin libcall on the low element.
define double @fsin_v4f64(<4 x double> %x) nounwind {
; X64-LABEL: fsin_v4f64:
; X64:       # %bb.0:
; X64-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    jmp sin@PLT # TAILCALL
;
; X86-LABEL: fsin_v4f64:
; X86:       # %bb.0:
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    vmovlps %xmm0, (%esp)
; X86-NEXT:    vzeroupper
; X86-NEXT:    calll sin
; X86-NEXT:    addl $8, %esp
; X86-NEXT:    retl
  %v = call <4 x double> @llvm.sin.v4f64(<4 x double> %x)
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}
479
; llvm.fma.v4f32 + extract lane 0 narrows to scalar vfmadd213ss (FMA attr set).
define float @fma_v4f32(<4 x float> %x, <4 x float> %y, <4 x float> %z) nounwind {
; X64-LABEL: fma_v4f32:
; X64:       # %bb.0:
; X64-NEXT:    vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; X64-NEXT:    retq
;
; X86-LABEL: fma_v4f32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %eax
; X86-NEXT:    vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    retl
  %v = call <4 x float> @llvm.fma.v4f32(<4 x float> %x, <4 x float> %y, <4 x float> %z)
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}
498
; llvm.fma.v4f64 + extract lane 0 narrows to scalar vfmadd213sd.
define double @fma_v4f64(<4 x double> %x, <4 x double> %y, <4 x double> %z) nounwind {
; X64-LABEL: fma_v4f64:
; X64:       # %bb.0:
; X64-NEXT:    vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; X64-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
;
; X86-LABEL: fma_v4f64:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    vfmadd213sd {{.*#+}} xmm1 = (xmm0 * xmm1) + xmm2
; X86-NEXT:    vmovsd %xmm1, (%esp)
; X86-NEXT:    fldl (%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
  %v = call <4 x double> @llvm.fma.v4f64(<4 x double> %x, <4 x double> %y, <4 x double> %z)
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}
524
; llvm.fabs.v4f32 + extract lane 0 becomes an AND with the clear-sign-bit mask
; (the mask prints as NaN since it is all-ones except the sign bit).
define float @fabs_v4f32(<4 x float> %x) nounwind {
; X64-LABEL: fabs_v4f32:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastss {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN]
; X64-NEXT:    vandps %xmm1, %xmm0, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: fabs_v4f32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %eax
; X86-NEXT:    vbroadcastss {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN]
; X86-NEXT:    vandps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    retl
  %v = call <4 x float> @llvm.fabs.v4f32(<4 x float> %x)
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}
545
; llvm.fabs.v4f64 + extract lane 0 becomes an AND with a constant-pool sign mask.
define double @fabs_v4f64(<4 x double> %x) nounwind {
; X64-LABEL: fabs_v4f64:
; X64:       # %bb.0:
; X64-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
;
; X86-LABEL: fabs_v4f64:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-NEXT:    vmovlps %xmm0, (%esp)
; X86-NEXT:    fldl (%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
  %v = call <4 x double> @llvm.fabs.v4f64(<4 x double> %x)
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}
570
; llvm.maxnum semantics (return the non-NaN operand) need the vmaxss +
; unord-compare + blend sequence after narrowing to lane 0.
define float @fmaxnum_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
; X64-LABEL: fmaxnum_v4f32:
; X64:       # %bb.0:
; X64-NEXT:    vmaxss %xmm0, %xmm1, %xmm2
; X64-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm0
; X64-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: fmaxnum_v4f32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %eax
; X86-NEXT:    vmaxss %xmm0, %xmm1, %xmm2
; X86-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm0
; X86-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    retl
  %v = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %x, <4 x float> %y)
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}
593
; llvm.maxnum.v4f64 + extract lane 0: scalar vmaxsd plus NaN fixup blend.
define double @fmaxnum_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
; X64-LABEL: fmaxnum_v4f64:
; X64:       # %bb.0:
; X64-NEXT:    vmaxsd %xmm0, %xmm1, %xmm2
; X64-NEXT:    vcmpunordsd %xmm0, %xmm0, %xmm0
; X64-NEXT:    vblendvpd %xmm0, %xmm1, %xmm2, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
;
; X86-LABEL: fmaxnum_v4f64:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    vmaxsd %xmm0, %xmm1, %xmm2
; X86-NEXT:    vcmpunordsd %xmm0, %xmm0, %xmm0
; X86-NEXT:    vblendvpd %xmm0, %xmm1, %xmm2, %xmm0
; X86-NEXT:    vmovlpd %xmm0, (%esp)
; X86-NEXT:    fldl (%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
  %v = call <4 x double> @llvm.maxnum.v4f64(<4 x double> %x, <4 x double> %y)
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}
622
; llvm.minnum.v4f32 + extract lane 0: scalar vminss plus NaN fixup blend.
define float @fminnum_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
; X64-LABEL: fminnum_v4f32:
; X64:       # %bb.0:
; X64-NEXT:    vminss %xmm0, %xmm1, %xmm2
; X64-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm0
; X64-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: fminnum_v4f32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %eax
; X86-NEXT:    vminss %xmm0, %xmm1, %xmm2
; X86-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm0
; X86-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    retl
  %v = call <4 x float> @llvm.minnum.v4f32(<4 x float> %x, <4 x float> %y)
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}
645
; llvm.minnum.v4f64 + extract lane 0: scalar vminsd plus NaN fixup blend.
define double @fminnum_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
; X64-LABEL: fminnum_v4f64:
; X64:       # %bb.0:
; X64-NEXT:    vminsd %xmm0, %xmm1, %xmm2
; X64-NEXT:    vcmpunordsd %xmm0, %xmm0, %xmm0
; X64-NEXT:    vblendvpd %xmm0, %xmm1, %xmm2, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
;
; X86-LABEL: fminnum_v4f64:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    vminsd %xmm0, %xmm1, %xmm2
; X86-NEXT:    vcmpunordsd %xmm0, %xmm0, %xmm0
; X86-NEXT:    vblendvpd %xmm0, %xmm1, %xmm2, %xmm0
; X86-NEXT:    vmovlpd %xmm0, (%esp)
; X86-NEXT:    fldl (%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
  %v = call <4 x double> @llvm.minnum.v4f64(<4 x double> %x, <4 x double> %y)
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}
674
675;define float @fmaximum_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
676;  %v = call <4 x float> @llvm.maximum.v4f32(<4 x float> %x, <4 x float> %y)
677;  %r = extractelement <4 x float> %v, i32 0
678;  ret float %r
679;}
680
681;define double @fmaximum_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
682;  %v = call <4 x double> @llvm.maximum.v4f64(<4 x double> %x, <4 x double> %y)
683;  %r = extractelement <4 x double> %v, i32 0
684;  ret double %r
685;}
686
687;define float @fminimum_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
688;  %v = call <4 x float> @llvm.minimum.v4f32(<4 x float> %x, <4 x float> %y)
689;  %r = extractelement <4 x float> %v, i32 0
690;  ret float %r
691;}
692
693;define double @fminimum_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
694;  %v = call <4 x double> @llvm.minimum.v4f64(<4 x double> %x, <4 x double> %y)
695;  %r = extractelement <4 x double> %v, i32 0
696;  ret double %r
697;}
698
; fcmp ogt + select (the maxps idiom) + extract lane 0 narrows to one vmaxss.
define float @maxps_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
; X64-LABEL: maxps_v4f32:
; X64:       # %bb.0:
; X64-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: maxps_v4f32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %eax
; X86-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    retl
  %cmp = fcmp ogt <4 x float> %x, %y
  %v = select <4 x i1> %cmp, <4 x float> %x, <4 x float> %y
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}
718
; fcmp ogt + select (maxpd idiom) + extract lane 0 narrows to one vmaxsd.
define double @maxpd_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
; X64-LABEL: maxpd_v4f64:
; X64:       # %bb.0:
; X64-NEXT:    vmaxsd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
;
; X86-LABEL: maxpd_v4f64:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    vmaxsd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovsd %xmm0, (%esp)
; X86-NEXT:    fldl (%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
  %cmp = fcmp ogt <4 x double> %x, %y
  %v = select <4 x i1> %cmp, <4 x double> %x, <4 x double> %y
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}
744
; fcmp olt + select (minps idiom) + extract lane 0 narrows to one vminss.
define float @minps_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
; X64-LABEL: minps_v4f32:
; X64:       # %bb.0:
; X64-NEXT:    vminss %xmm1, %xmm0, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: minps_v4f32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %eax
; X86-NEXT:    vminss %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    retl
  %cmp = fcmp olt <4 x float> %x, %y
  %v = select <4 x i1> %cmp, <4 x float> %x, <4 x float> %y
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}
764
; fcmp olt + select (minpd idiom) + extract lane 0 narrows to one vminsd.
define double @minpd_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
; X64-LABEL: minpd_v4f64:
; X64:       # %bb.0:
; X64-NEXT:    vminsd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
;
; X86-LABEL: minpd_v4f64:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    vminsd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovsd %xmm0, (%esp)
; X86-NEXT:    fldl (%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
  %cmp = fcmp olt <4 x double> %x, %y
  %v = select <4 x i1> %cmp, <4 x double> %x, <4 x double> %y
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}
790
; llvm.copysign.v4f32 + extract lane 0: sign bit of %y ANDed in, magnitude of
; %x masked with the NaN (all-bits-except-sign) constant, then ORed together.
define float @copysign_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
; X64-LABEL: copysign_v4f32:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastss {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; X64-NEXT:    vandps %xmm2, %xmm1, %xmm1
; X64-NEXT:    vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN]
; X64-NEXT:    vandps %xmm2, %xmm0, %xmm0
; X64-NEXT:    vorps %xmm1, %xmm0, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: copysign_v4f32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %eax
; X86-NEXT:    vbroadcastss {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; X86-NEXT:    vandps %xmm2, %xmm1, %xmm1
; X86-NEXT:    vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN]
; X86-NEXT:    vandps %xmm2, %xmm0, %xmm0
; X86-NEXT:    vorps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    retl
  %v = call <4 x float> @llvm.copysign.v4f32(<4 x float> %x, <4 x float> %y)
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}
817
; llvm.copysign.v4f64 + extract lane 0: same AND/AND/OR bit-logic sequence,
; with the masks coming from the constant pool instead of broadcasts.
define double @copysign_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
; X64-LABEL: copysign_v4f64:
; X64:       # %bb.0:
; X64-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; X64-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-NEXT:    vorps %xmm1, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
;
; X86-LABEL: copysign_v4f64:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
; X86-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-NEXT:    vorps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovlps %xmm0, (%esp)
; X86-NEXT:    fldl (%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
  %v = call <4 x double> @llvm.copysign.v4f64(<4 x double> %x, <4 x double> %y)
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}
846
; llvm.floor.v4f32 + extract lane 0 narrows to vroundss imm 9 (round down).
define float @floor_v4f32(<4 x float> %x) nounwind {
; X64-LABEL: floor_v4f32:
; X64:       # %bb.0:
; X64-NEXT:    vroundss $9, %xmm0, %xmm0, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: floor_v4f32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %eax
; X86-NEXT:    vroundss $9, %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    retl
  %v = call <4 x float> @llvm.floor.v4f32(<4 x float> %x)
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}
865
; llvm.floor.v4f64 + extract lane 0 narrows to vroundsd imm 9 (round down).
define double @floor_v4f64(<4 x double> %x) nounwind {
; X64-LABEL: floor_v4f64:
; X64:       # %bb.0:
; X64-NEXT:    vroundsd $9, %xmm0, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
;
; X86-LABEL: floor_v4f64:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    vroundsd $9, %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovsd %xmm0, (%esp)
; X86-NEXT:    fldl (%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
  %v = call <4 x double> @llvm.floor.v4f64(<4 x double> %x)
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}
890
; llvm.ceil.v4f32 + extract lane 0 narrows to vroundss imm 10 (round up).
define float @ceil_v4f32(<4 x float> %x) nounwind {
; X64-LABEL: ceil_v4f32:
; X64:       # %bb.0:
; X64-NEXT:    vroundss $10, %xmm0, %xmm0, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: ceil_v4f32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %eax
; X86-NEXT:    vroundss $10, %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    retl
  %v = call <4 x float> @llvm.ceil.v4f32(<4 x float> %x)
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}
909
; llvm.ceil.v4f64 + extract lane 0 narrows to vroundsd imm 10 (round up).
define double @ceil_v4f64(<4 x double> %x) nounwind {
; X64-LABEL: ceil_v4f64:
; X64:       # %bb.0:
; X64-NEXT:    vroundsd $10, %xmm0, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
;
; X86-LABEL: ceil_v4f64:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    vroundsd $10, %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovsd %xmm0, (%esp)
; X86-NEXT:    fldl (%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
  %v = call <4 x double> @llvm.ceil.v4f64(<4 x double> %x)
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}
934
; llvm.trunc.v4f32 + extract lane 0 narrows to vroundss imm 11 (toward zero).
define float @trunc_v4f32(<4 x float> %x) nounwind {
; X64-LABEL: trunc_v4f32:
; X64:       # %bb.0:
; X64-NEXT:    vroundss $11, %xmm0, %xmm0, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: trunc_v4f32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %eax
; X86-NEXT:    vroundss $11, %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    retl
  %v = call <4 x float> @llvm.trunc.v4f32(<4 x float> %x)
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}
953
; llvm.trunc.v4f64 + extract lane 0 narrows to vroundsd imm 11 (toward zero).
define double @trunc_v4f64(<4 x double> %x) nounwind {
; X64-LABEL: trunc_v4f64:
; X64:       # %bb.0:
; X64-NEXT:    vroundsd $11, %xmm0, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
;
; X86-LABEL: trunc_v4f64:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    vroundsd $11, %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovsd %xmm0, (%esp)
; X86-NEXT:    fldl (%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
  %v = call <4 x double> @llvm.trunc.v4f64(<4 x double> %x)
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}
978
; llvm.rint scalarized to vroundss (imm 4 = use MXCSR rounding mode; precision
; exceptions NOT masked, matching rint's may-raise-inexact semantics).
define float @rint_v4f32(<4 x float> %x) nounwind {
; X64-LABEL: rint_v4f32:
; X64:       # %bb.0:
; X64-NEXT:    vroundss $4, %xmm0, %xmm0, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: rint_v4f32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %eax
; X86-NEXT:    vroundss $4, %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    retl
  %v = call <4 x float> @llvm.rint.v4f32(<4 x float> %x)
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}
997
; Double-precision rint: vector op shrinks to vroundsd $4 (current rounding
; mode) on lane 0 only; ymm upper half discarded via vzeroupper.
define double @rint_v4f64(<4 x double> %x) nounwind {
; X64-LABEL: rint_v4f64:
; X64:       # %bb.0:
; X64-NEXT:    vroundsd $4, %xmm0, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
;
; X86-LABEL: rint_v4f64:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    vroundsd $4, %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovsd %xmm0, (%esp)
; X86-NEXT:    fldl (%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
  %v = call <4 x double> @llvm.rint.v4f64(<4 x double> %x)
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}
1022
; llvm.nearbyint scalarized to vroundss (imm 12 = current rounding mode with
; the precision exception suppressed — the difference from rint's imm 4).
define float @nearbyint_v4f32(<4 x float> %x) nounwind {
; X64-LABEL: nearbyint_v4f32:
; X64:       # %bb.0:
; X64-NEXT:    vroundss $12, %xmm0, %xmm0, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: nearbyint_v4f32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %eax
; X86-NEXT:    vroundss $12, %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    retl
  %v = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %x)
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}
1041
; Double-precision nearbyint: shrinks to vroundsd $12 (current rounding mode,
; precision exception suppressed) on the single extracted lane.
define double @nearbyint_v4f64(<4 x double> %x) nounwind {
; X64-LABEL: nearbyint_v4f64:
; X64:       # %bb.0:
; X64-NEXT:    vroundsd $12, %xmm0, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
;
; X86-LABEL: nearbyint_v4f64:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    vroundsd $12, %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovsd %xmm0, (%esp)
; X86-NEXT:    fldl (%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
  %v = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %x)
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}
1066
; llvm.round (halfway cases away from zero) has no single SSE4.1 instruction,
; so it expands: OR x's sign bit onto ~0.5 (0x3EFFFFFF, the float just below
; 0.5, avoiding double rounding), add it to x, then truncate with vroundss $11.
; Only the scalar lane 0 sequence survives the extractelement.
define float @round_v4f32(<4 x float> %x) nounwind {
; X64-LABEL: round_v4f32:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; X64-NEXT:    vandps %xmm1, %xmm0, %xmm1
; X64-NEXT:    vbroadcastss {{.*#+}} xmm2 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
; X64-NEXT:    vorps %xmm1, %xmm2, %xmm1
; X64-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; X64-NEXT:    vroundss $11, %xmm0, %xmm0, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: round_v4f32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %eax
; X86-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; X86-NEXT:    vandps %xmm1, %xmm0, %xmm1
; X86-NEXT:    vbroadcastss {{.*#+}} xmm2 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
; X86-NEXT:    vorps %xmm1, %xmm2, %xmm1
; X86-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; X86-NEXT:    vroundss $11, %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    retl
  %v = call <4 x float> @llvm.round.v4f32(<4 x float> %x)
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}
1095
; Double-precision llvm.round expansion: copysign-style OR of x's sign onto
; 0.49999999999999994 (the double just below 0.5, to keep halfway cases exact),
; add, then truncate via vroundsd $11. The sign mask is loaded from a constant
; pool (vandpd) rather than broadcast; only lane 0 is computed.
define double @round_v4f64(<4 x double> %x) nounwind {
; X64-LABEL: round_v4f64:
; X64:       # %bb.0:
; X64-NEXT:    vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; X64-NEXT:    vmovddup {{.*#+}} xmm2 = [4.9999999999999994E-1,4.9999999999999994E-1]
; X64-NEXT:    # xmm2 = mem[0,0]
; X64-NEXT:    vorpd %xmm1, %xmm2, %xmm1
; X64-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vroundsd $11, %xmm0, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
;
; X86-LABEL: round_v4f64:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    vandpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1
; X86-NEXT:    vmovddup {{.*#+}} xmm2 = [4.9999999999999994E-1,4.9999999999999994E-1]
; X86-NEXT:    # xmm2 = mem[0,0]
; X86-NEXT:    vorpd %xmm1, %xmm2, %xmm1
; X86-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vroundsd $11, %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovsd %xmm0, (%esp)
; X86-NEXT:    fldl (%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
  %v = call <4 x double> @llvm.round.v4f64(<4 x double> %x)
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}
1130
; The 128-bit x86 rcp.ps target intrinsic with only lane 0 used is narrowed to
; the scalar approximation instruction vrcpss.
define float @rcp_v4f32(<4 x float> %x) nounwind {
; X64-LABEL: rcp_v4f32:
; X64:       # %bb.0:
; X64-NEXT:    vrcpss %xmm0, %xmm0, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: rcp_v4f32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %eax
; X86-NEXT:    vrcpss %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    retl
  %v = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %x)
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}
1149
; The 256-bit avx.rcp.ps.256 intrinsic is NOT shrunk to a scalar op here — the
; full-width vrcpps %ymm0 is kept and lane 0 is taken from the implicit xmm0
; subregister (the "kill" note below records that subreg extraction).
define float @rcp_v8f32(<8 x float> %x) nounwind {
; X64-LABEL: rcp_v8f32:
; X64:       # %bb.0:
; X64-NEXT:    vrcpps %ymm0, %ymm0
; X64-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
;
; X86-LABEL: rcp_v8f32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %eax
; X86-NEXT:    vrcpps %ymm0, %ymm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
  %v = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %x)
  %r = extractelement <8 x float> %v, i32 0
  ret float %r
}
1171
; The 128-bit x86 rsqrt.ps target intrinsic with only lane 0 used is narrowed
; to the scalar approximation instruction vrsqrtss.
define float @rsqrt_v4f32(<4 x float> %x) nounwind {
; X64-LABEL: rsqrt_v4f32:
; X64:       # %bb.0:
; X64-NEXT:    vrsqrtss %xmm0, %xmm0, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: rsqrt_v4f32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %eax
; X86-NEXT:    vrsqrtss %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    retl
  %v = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %x)
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}
1190
; As with rcp_v8f32 above within this file's output: the 256-bit
; avx.rsqrt.ps.256 intrinsic stays full-width (vrsqrtps %ymm0) and lane 0 is
; read from the xmm0 subregister instead of narrowing to a scalar instruction.
define float @rsqrt_v8f32(<8 x float> %x) nounwind {
; X64-LABEL: rsqrt_v8f32:
; X64:       # %bb.0:
; X64-NEXT:    vrsqrtps %ymm0, %ymm0
; X64-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
;
; X86-LABEL: rsqrt_v8f32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %eax
; X86-NEXT:    vrsqrtps %ymm0, %ymm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
  %v = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %x)
  %r = extractelement <8 x float> %v, i32 0
  ret float %r
}
1212
; Declarations of the generic math intrinsics exercised by the tests above,
; followed by the x86-specific approximation intrinsics.
declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)
declare <4 x double> @llvm.sqrt.v4f64(<4 x double>)
declare <4 x float> @llvm.sin.v4f32(<4 x float>)
declare <4 x double> @llvm.sin.v4f64(<4 x double>)
declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>)
declare <4 x float> @llvm.fabs.v4f32(<4 x float>)
declare <4 x double> @llvm.fabs.v4f64(<4 x double>)
declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>)
declare <4 x double> @llvm.maxnum.v4f64(<4 x double>, <4 x double>)
declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>)
declare <4 x double> @llvm.minnum.v4f64(<4 x double>, <4 x double>)
declare <4 x float> @llvm.maximum.v4f32(<4 x float>, <4 x float>)
declare <4 x double> @llvm.maximum.v4f64(<4 x double>, <4 x double>)
declare <4 x float> @llvm.minimum.v4f32(<4 x float>, <4 x float>)
declare <4 x double> @llvm.minimum.v4f64(<4 x double>, <4 x double>)
declare <4 x float> @llvm.copysign.v4f32(<4 x float>, <4 x float>)
declare <4 x double> @llvm.copysign.v4f64(<4 x double>, <4 x double>)
declare <4 x float> @llvm.floor.v4f32(<4 x float>)
declare <4 x double> @llvm.floor.v4f64(<4 x double>)
declare <4 x float> @llvm.ceil.v4f32(<4 x float>)
declare <4 x double> @llvm.ceil.v4f64(<4 x double>)
declare <4 x float> @llvm.trunc.v4f32(<4 x float>)
declare <4 x double> @llvm.trunc.v4f64(<4 x double>)
declare <4 x float> @llvm.rint.v4f32(<4 x float>)
declare <4 x double> @llvm.rint.v4f64(<4 x double>)
declare <4 x float> @llvm.nearbyint.v4f32(<4 x float>)
declare <4 x double> @llvm.nearbyint.v4f64(<4 x double>)
declare <4 x float> @llvm.round.v4f32(<4 x float>)
declare <4 x double> @llvm.round.v4f64(<4 x double>)

declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>)
declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>)
declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>)
declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>)
1248