; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE41
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512

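; These tests cover lowering of the llvm.round.* intrinsics (round half away
; from zero), for which SSE/AVX have no single instruction. As the checks
; below show, targets with SSE4.1 expand it by copying the sign of the input
; onto the largest value below 0.5 (4.9999997E-1 for float,
; 4.9999999999999994E-1 for double), adding that to the input, and truncating
; the sum toward zero with round{ss,sd,ps,pd} immediate 11 (truncate, inexact
; exception suppressed). Plain SSE2 instead falls back to the libm
; roundf/round calls.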
define float @round_f32(float %x) {
; SSE2-LABEL: round_f32:
; SSE2:       ## %bb.0:
; SSE2-NEXT:    jmp _roundf ## TAILCALL
;
; SSE41-LABEL: round_f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    movaps {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; SSE41-NEXT:    andps %xmm0, %xmm1
; SSE41-NEXT:    orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    addss %xmm0, %xmm1
; SSE41-NEXT:    xorps %xmm0, %xmm0
; SSE41-NEXT:    roundss $11, %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: round_f32:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
; AVX1-NEXT:    vorps %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vroundss $11, %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: round_f32:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; AVX512-NEXT:    vandps %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm2 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
; AVX512-NEXT:    vorps %xmm1, %xmm2, %xmm1
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vroundss $11, %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %a = call float @llvm.round.f32(float %x)
  ret float %a
}

define double @round_f64(double %x) {
; SSE2-LABEL: round_f64:
; SSE2:       ## %bb.0:
; SSE2-NEXT:    jmp _round ## TAILCALL
;
; SSE41-LABEL: round_f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    movapd {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0]
; SSE41-NEXT:    andpd %xmm0, %xmm1
; SSE41-NEXT:    orpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    addsd %xmm0, %xmm1
; SSE41-NEXT:    xorps %xmm0, %xmm0
; SSE41-NEXT:    roundsd $11, %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: round_f64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX-NEXT:    vmovddup {{.*#+}} xmm2 = [4.9999999999999994E-1,4.9999999999999994E-1]
; AVX-NEXT:    ## xmm2 = mem[0,0]
; AVX-NEXT:    vorpd %xmm1, %xmm2, %xmm1
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vroundsd $11, %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = call double @llvm.round.f64(double %x)
  ret double %a
}

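; The vector cases follow. Without SSE4.1 there is no packed rounding
; instruction, so the SSE2 code below scalarizes each vector: elements are
; spilled, rounded one at a time with calls to roundf/round, and the results
; are rebuilt with unpcklps/unpcklpd/movlhps shuffles.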
define <4 x float> @round_v4f32(<4 x float> %x) {
; SSE2-LABEL: round_v4f32:
; SSE2:       ## %bb.0:
; SSE2-NEXT:    subq $56, %rsp
; SSE2-NEXT:    .cfi_def_cfa_offset 64
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    callq _roundf
; SSE2-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq _roundf
; SSE2-NEXT:    unpcklps (%rsp), %xmm0 ## 16-byte Folded Reload
; SSE2-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE2-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    callq _roundf
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT:    callq _roundf
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    unpcklpd (%rsp), %xmm1 ## 16-byte Folded Reload
; SSE2-NEXT:    ## xmm1 = xmm1[0],mem[0]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    addq $56, %rsp
; SSE2-NEXT:    retq
;
; SSE41-LABEL: round_v4f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    movaps {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; SSE41-NEXT:    andps %xmm0, %xmm1
; SSE41-NEXT:    orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    addps %xmm0, %xmm1
; SSE41-NEXT:    roundps $11, %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: round_v4f32:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX1-NEXT:    vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vroundps $11, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: round_v4f32:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; AVX512-NEXT:    vandps %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm2 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
; AVX512-NEXT:    vorps %xmm1, %xmm2, %xmm1
; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vroundps $11, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %a = call <4 x float> @llvm.round.v4f32(<4 x float> %x)
  ret <4 x float> %a
}

define <2 x double> @round_v2f64(<2 x double> %x) {
; SSE2-LABEL: round_v2f64:
; SSE2:       ## %bb.0:
; SSE2-NEXT:    subq $40, %rsp
; SSE2-NEXT:    .cfi_def_cfa_offset 48
; SSE2-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    callq _round
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq _round
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    addq $40, %rsp
; SSE2-NEXT:    retq
;
; SSE41-LABEL: round_v2f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    movapd {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0]
; SSE41-NEXT:    andpd %xmm0, %xmm1
; SSE41-NEXT:    orpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    addpd %xmm0, %xmm1
; SSE41-NEXT:    roundpd $11, %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: round_v2f64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX-NEXT:    vorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vroundpd $11, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = call <2 x double> @llvm.round.v2f64(<2 x double> %x)
  ret <2 x double> %a
}

define <8 x float> @round_v8f32(<8 x float> %x) {
; SSE2-LABEL: round_v8f32:
; SSE2:       ## %bb.0:
; SSE2-NEXT:    subq $72, %rsp
; SSE2-NEXT:    .cfi_def_cfa_offset 80
; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    callq _roundf
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq _roundf
; SSE2-NEXT:    unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
; SSE2-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    callq _roundf
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT:    callq _roundf
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload
; SSE2-NEXT:    ## xmm1 = xmm1[0],mem[0]
; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    callq _roundf
; SSE2-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq _roundf
; SSE2-NEXT:    unpcklps (%rsp), %xmm0 ## 16-byte Folded Reload
; SSE2-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE2-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    callq _roundf
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT:    callq _roundf
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    unpcklpd (%rsp), %xmm1 ## 16-byte Folded Reload
; SSE2-NEXT:    ## xmm1 = xmm1[0],mem[0]
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    addq $72, %rsp
; SSE2-NEXT:    retq
;
; SSE41-LABEL: round_v8f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    movaps {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; SSE41-NEXT:    movaps %xmm0, %xmm3
; SSE41-NEXT:    andps %xmm2, %xmm3
; SSE41-NEXT:    movaps {{.*#+}} xmm4 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
; SSE41-NEXT:    orps %xmm4, %xmm3
; SSE41-NEXT:    addps %xmm0, %xmm3
; SSE41-NEXT:    roundps $11, %xmm3, %xmm0
; SSE41-NEXT:    andps %xmm1, %xmm2
; SSE41-NEXT:    orps %xmm4, %xmm2
; SSE41-NEXT:    addps %xmm1, %xmm2
; SSE41-NEXT:    roundps $11, %xmm2, %xmm1
; SSE41-NEXT:    retq
;
; AVX1-LABEL: round_v8f32:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
; AVX1-NEXT:    vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX1-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vroundps $11, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: round_v8f32:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vbroadcastss {{.*#+}} ymm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; AVX512-NEXT:    vandps %ymm1, %ymm0, %ymm1
; AVX512-NEXT:    vbroadcastss {{.*#+}} ymm2 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
; AVX512-NEXT:    vorps %ymm1, %ymm2, %ymm1
; AVX512-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vroundps $11, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %a = call <8 x float> @llvm.round.v8f32(<8 x float> %x)
  ret <8 x float> %a
}

define <4 x double> @round_v4f64(<4 x double> %x) {
; SSE2-LABEL: round_v4f64:
; SSE2:       ## %bb.0:
; SSE2-NEXT:    subq $56, %rsp
; SSE2-NEXT:    .cfi_def_cfa_offset 64
; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    callq _round
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq _round
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    callq _round
; SSE2-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq _round
; SSE2-NEXT:    movaps (%rsp), %xmm1 ## 16-byte Reload
; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    addq $56, %rsp
; SSE2-NEXT:    retq
;
; SSE41-LABEL: round_v4f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    movapd {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0]
; SSE41-NEXT:    movapd %xmm0, %xmm3
; SSE41-NEXT:    andpd %xmm2, %xmm3
; SSE41-NEXT:    movapd {{.*#+}} xmm4 = [4.9999999999999994E-1,4.9999999999999994E-1]
; SSE41-NEXT:    orpd %xmm4, %xmm3
; SSE41-NEXT:    addpd %xmm0, %xmm3
; SSE41-NEXT:    roundpd $11, %xmm3, %xmm0
; SSE41-NEXT:    andpd %xmm1, %xmm2
; SSE41-NEXT:    orpd %xmm4, %xmm2
; SSE41-NEXT:    addpd %xmm1, %xmm2
; SSE41-NEXT:    roundpd $11, %xmm2, %xmm1
; SSE41-NEXT:    retq
;
; AVX1-LABEL: round_v4f64:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
; AVX1-NEXT:    vorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX1-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vroundpd $11, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: round_v4f64:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; AVX512-NEXT:    vandpd %ymm1, %ymm0, %ymm1
; AVX512-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1]
; AVX512-NEXT:    vorpd %ymm1, %ymm2, %ymm1
; AVX512-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vroundpd $11, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %a = call <4 x double> @llvm.round.v4f64(<4 x double> %x)
  ret <4 x double> %a
}

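; For the 512-bit cases, AVX512 folds the sign-copy (the and with -0.0 plus
; the or with the just-below-0.5 constant) into a single vpternlog, then adds
; and truncates with vrndscale.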
define <16 x float> @round_v16f32(<16 x float> %x) {
; SSE2-LABEL: round_v16f32:
; SSE2:       ## %bb.0:
; SSE2-NEXT:    subq $104, %rsp
; SSE2-NEXT:    .cfi_def_cfa_offset 112
; SSE2-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps %xmm1, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    callq _roundf
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq _roundf
; SSE2-NEXT:    unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
; SSE2-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    callq _roundf
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT:    callq _roundf
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload
; SSE2-NEXT:    ## xmm1 = xmm1[0],mem[0]
; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    callq _roundf
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq _roundf
; SSE2-NEXT:    unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
; SSE2-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    callq _roundf
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT:    callq _roundf
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload
; SSE2-NEXT:    ## xmm1 = xmm1[0],mem[0]
; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    callq _roundf
; SSE2-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq _roundf
; SSE2-NEXT:    unpcklps (%rsp), %xmm0 ## 16-byte Folded Reload
; SSE2-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    callq _roundf
; SSE2-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT:    callq _roundf
; SSE2-NEXT:    movaps (%rsp), %xmm1 ## 16-byte Reload
; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload
; SSE2-NEXT:    ## xmm1 = xmm1[0],mem[0]
; SSE2-NEXT:    movaps %xmm1, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    callq _roundf
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq _roundf
; SSE2-NEXT:    unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
; SSE2-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    callq _roundf
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT:    callq _roundf
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 ## 16-byte Reload
; SSE2-NEXT:    unpcklps {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
; SSE2-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 ## 16-byte Folded Reload
; SSE2-NEXT:    ## xmm3 = xmm3[0],mem[0]
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
; SSE2-NEXT:    movaps (%rsp), %xmm2 ## 16-byte Reload
; SSE2-NEXT:    addq $104, %rsp
; SSE2-NEXT:    retq
;
; SSE41-LABEL: round_v16f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    movaps {{.*#+}} xmm4 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; SSE41-NEXT:    movaps %xmm0, %xmm5
; SSE41-NEXT:    andps %xmm4, %xmm5
; SSE41-NEXT:    movaps {{.*#+}} xmm6 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
; SSE41-NEXT:    orps %xmm6, %xmm5
; SSE41-NEXT:    addps %xmm0, %xmm5
; SSE41-NEXT:    roundps $11, %xmm5, %xmm0
; SSE41-NEXT:    movaps %xmm1, %xmm5
; SSE41-NEXT:    andps %xmm4, %xmm5
; SSE41-NEXT:    orps %xmm6, %xmm5
; SSE41-NEXT:    addps %xmm1, %xmm5
; SSE41-NEXT:    roundps $11, %xmm5, %xmm1
; SSE41-NEXT:    movaps %xmm2, %xmm5
; SSE41-NEXT:    andps %xmm4, %xmm5
; SSE41-NEXT:    orps %xmm6, %xmm5
; SSE41-NEXT:    addps %xmm2, %xmm5
; SSE41-NEXT:    roundps $11, %xmm5, %xmm2
; SSE41-NEXT:    andps %xmm3, %xmm4
; SSE41-NEXT:    orps %xmm6, %xmm4
; SSE41-NEXT:    addps %xmm3, %xmm4
; SSE41-NEXT:    roundps $11, %xmm4, %xmm3
; SSE41-NEXT:    retq
;
; AVX1-LABEL: round_v16f32:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm3
; AVX1-NEXT:    vmovaps {{.*#+}} ymm4 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
; AVX1-NEXT:    vorps %ymm3, %ymm4, %ymm3
; AVX1-NEXT:    vaddps %ymm3, %ymm0, %ymm0
; AVX1-NEXT:    vroundps $11, %ymm0, %ymm0
; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm2
; AVX1-NEXT:    vorps %ymm2, %ymm4, %ymm2
; AVX1-NEXT:    vaddps %ymm2, %ymm1, %ymm1
; AVX1-NEXT:    vroundps $11, %ymm1, %ymm1
; AVX1-NEXT:    retq
;
; AVX512-LABEL: round_v16f32:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
; AVX512-NEXT:    vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm1
; AVX512-NEXT:    vaddps %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vrndscaleps $11, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %a = call <16 x float> @llvm.round.v16f32(<16 x float> %x)
  ret <16 x float> %a
}

define <8 x double> @round_v8f64(<8 x double> %x) {
; SSE2-LABEL: round_v8f64:
; SSE2:       ## %bb.0:
; SSE2-NEXT:    subq $88, %rsp
; SSE2-NEXT:    .cfi_def_cfa_offset 96
; SSE2-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps %xmm1, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    callq _round
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq _round
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    callq _round
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq _round
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    callq _round
; SSE2-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq _round
; SSE2-NEXT:    movaps (%rsp), %xmm1 ## 16-byte Reload
; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    movaps %xmm1, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    callq _round
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq _round
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 ## 16-byte Reload
; SSE2-NEXT:    movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0]
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
; SSE2-NEXT:    movaps (%rsp), %xmm2 ## 16-byte Reload
; SSE2-NEXT:    addq $88, %rsp
; SSE2-NEXT:    retq
;
; SSE41-LABEL: round_v8f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    movapd {{.*#+}} xmm4 = [-0.0E+0,-0.0E+0]
; SSE41-NEXT:    movapd %xmm0, %xmm5
; SSE41-NEXT:    andpd %xmm4, %xmm5
; SSE41-NEXT:    movapd {{.*#+}} xmm6 = [4.9999999999999994E-1,4.9999999999999994E-1]
; SSE41-NEXT:    orpd %xmm6, %xmm5
; SSE41-NEXT:    addpd %xmm0, %xmm5
; SSE41-NEXT:    roundpd $11, %xmm5, %xmm0
; SSE41-NEXT:    movapd %xmm1, %xmm5
; SSE41-NEXT:    andpd %xmm4, %xmm5
; SSE41-NEXT:    orpd %xmm6, %xmm5
; SSE41-NEXT:    addpd %xmm1, %xmm5
; SSE41-NEXT:    roundpd $11, %xmm5, %xmm1
; SSE41-NEXT:    movapd %xmm2, %xmm5
; SSE41-NEXT:    andpd %xmm4, %xmm5
; SSE41-NEXT:    orpd %xmm6, %xmm5
; SSE41-NEXT:    addpd %xmm2, %xmm5
; SSE41-NEXT:    roundpd $11, %xmm5, %xmm2
; SSE41-NEXT:    andpd %xmm3, %xmm4
; SSE41-NEXT:    orpd %xmm6, %xmm4
; SSE41-NEXT:    addpd %xmm3, %xmm4
; SSE41-NEXT:    roundpd $11, %xmm4, %xmm3
; SSE41-NEXT:    retq
;
; AVX1-LABEL: round_v8f64:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vmovapd {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; AVX1-NEXT:    vandpd %ymm2, %ymm0, %ymm3
; AVX1-NEXT:    vmovapd {{.*#+}} ymm4 = [4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1]
; AVX1-NEXT:    vorpd %ymm3, %ymm4, %ymm3
; AVX1-NEXT:    vaddpd %ymm3, %ymm0, %ymm0
; AVX1-NEXT:    vroundpd $11, %ymm0, %ymm0
; AVX1-NEXT:    vandpd %ymm2, %ymm1, %ymm2
; AVX1-NEXT:    vorpd %ymm2, %ymm4, %ymm2
; AVX1-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
; AVX1-NEXT:    vroundpd $11, %ymm1, %ymm1
; AVX1-NEXT:    retq
;
; AVX512-LABEL: round_v8f64:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1]
; AVX512-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm1
; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vrndscalepd $11, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %a = call <8 x double> @llvm.round.v8f64(<8 x double> %x)
  ret <8 x double> %a
}

declare float @llvm.round.f32(float)
declare double @llvm.round.f64(double)
declare <4 x float> @llvm.round.v4f32(<4 x float>)
declare <2 x double> @llvm.round.v2f64(<2 x double>)
declare <8 x float> @llvm.round.v8f32(<8 x float>)
declare <4 x double> @llvm.round.v4f64(<4 x double>)
declare <16 x float> @llvm.round.v16f32(<16 x float>)
declare <8 x double> @llvm.round.v8f64(<8 x double>)