; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=AVX1-SLOW
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx,+fast-hops | FileCheck %s --check-prefix=AVX1-FAST
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=AVX512
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512

;
; vXf32 (accum)
;

define float @test_v2f32(float %a0, <2 x float> %a1) {
; SSE2-LABEL: test_v2f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm1, %xmm2
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1]
; SSE2-NEXT:    addss %xmm1, %xmm2
; SSE2-NEXT:    addss %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v2f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE41-NEXT:    addss %xmm1, %xmm2
; SSE41-NEXT:    addss %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v2f32:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX1-SLOW-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v2f32:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vhaddps %xmm1, %xmm1, %xmm1
; AVX1-FAST-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v2f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX2-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v2f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.vector.reduce.fadd.f32.v2f32(float %a0, <2 x float> %a1)
  ret float %1
}

define float @test_v4f32(float %a0, <4 x float> %a1) {
; SSE2-LABEL: test_v4f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm1, %xmm2
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT:    addps %xmm1, %xmm2
; SSE2-NEXT:    movaps %xmm2, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1]
; SSE2-NEXT:    addss %xmm2, %xmm1
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movaps %xmm1, %xmm2
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE41-NEXT:    addps %xmm1, %xmm2
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE41-NEXT:    addss %xmm2, %xmm1
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v4f32:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX1-SLOW-NEXT:    vaddps %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX1-SLOW-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v4f32:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vhaddps %xmm1, %xmm1, %xmm1
; AVX1-FAST-NEXT:    vhaddps %xmm1, %xmm1, %xmm1
; AVX1-FAST-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v4f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX2-NEXT:    vaddps %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX2-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v4f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vaddps %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float %a0, <4 x float> %a1)
  ret float %1
}

define float @test_v8f32(float %a0, <8 x float> %a1) {
; SSE2-LABEL: test_v8f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    addps %xmm2, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm2
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT:    addps %xmm1, %xmm2
; SSE2-NEXT:    movaps %xmm2, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1]
; SSE2-NEXT:    addss %xmm2, %xmm1
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    addps %xmm2, %xmm1
; SSE41-NEXT:    movaps %xmm1, %xmm2
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE41-NEXT:    addps %xmm1, %xmm2
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE41-NEXT:    addss %xmm2, %xmm1
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v8f32:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-SLOW-NEXT:    vaddps %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX1-SLOW-NEXT:    vaddps %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX1-SLOW-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v8f32:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-FAST-NEXT:    vhaddps %xmm1, %xmm2, %xmm1
; AVX1-FAST-NEXT:    vhaddps %xmm1, %xmm1, %xmm1
; AVX1-FAST-NEXT:    vhaddps %xmm1, %xmm1, %xmm1
; AVX1-FAST-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v8f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vaddps %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX2-NEXT:    vaddps %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX2-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v8f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT:    vaddps %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vaddps %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float %a0, <8 x float> %a1)
  ret float %1
}

define float @test_v16f32(float %a0, <16 x float> %a1) {
; SSE2-LABEL: test_v16f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    addps %xmm4, %xmm2
; SSE2-NEXT:    addps %xmm3, %xmm1
; SSE2-NEXT:    addps %xmm2, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm2
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT:    addps %xmm1, %xmm2
; SSE2-NEXT:    movaps %xmm2, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1]
; SSE2-NEXT:    addss %xmm2, %xmm1
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    addps %xmm4, %xmm2
; SSE41-NEXT:    addps %xmm3, %xmm1
; SSE41-NEXT:    addps %xmm2, %xmm1
; SSE41-NEXT:    movaps %xmm1, %xmm2
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE41-NEXT:    addps %xmm1, %xmm2
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE41-NEXT:    addss %xmm2, %xmm1
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v16f32:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vaddps %ymm2, %ymm1, %ymm1
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-SLOW-NEXT:    vaddps %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX1-SLOW-NEXT:    vaddps %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX1-SLOW-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v16f32:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vaddps %ymm2, %ymm1, %ymm1
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-FAST-NEXT:    vaddps %xmm2, %xmm1, %xmm1
; AVX1-FAST-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX1-FAST-NEXT:    vaddps %xmm2, %xmm1, %xmm1
; AVX1-FAST-NEXT:    vhaddps %xmm1, %xmm1, %xmm1
; AVX1-FAST-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v16f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vaddps %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vaddps %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX2-NEXT:    vaddps %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX2-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v16f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf64x4 $1, %zmm1, %ymm2
; AVX512-NEXT:    vaddps %zmm2, %zmm1, %zmm1
; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT:    vaddps %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vaddps %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.vector.reduce.fadd.f32.v16f32(float %a0, <16 x float> %a1)
  ret float %1
}

;
; vXf32 (zero)
;

define float @test_v2f32_zero(<2 x float> %a0) {
; SSE2-LABEL: test_v2f32_zero:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT:    addss %xmm0, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v2f32_zero:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v2f32_zero:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v2f32_zero:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v2f32_zero:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v2f32_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.vector.reduce.fadd.f32.v2f32(float 0.0, <2 x float> %a0)
  ret float %1
}

define float @test_v4f32_zero(<4 x float> %a0) {
; SSE2-LABEL: test_v4f32_zero:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT:    addps %xmm0, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4f32_zero:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movaps %xmm0, %xmm1
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT:    addps %xmm0, %xmm1
; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT:    addss %xmm0, %xmm1
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v4f32_zero:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v4f32_zero:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v4f32_zero:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v4f32_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float 0.0, <4 x float> %a0)
  ret float %1
}

define float @test_v8f32_zero(<8 x float> %a0) {
; SSE2-LABEL: test_v8f32_zero:
; SSE2:       # %bb.0:
; SSE2-NEXT:    addps %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT:    addps %xmm0, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8f32_zero:
; SSE41:       # %bb.0:
; SSE41-NEXT:    addps %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm0, %xmm1
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT:    addps %xmm0, %xmm1
; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT:    addss %xmm0, %xmm1
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v8f32_zero:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v8f32_zero:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm1, %xmm0
; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v8f32_zero:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v8f32_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float 0.0, <8 x float> %a0)
  ret float %1
}

define float @test_v16f32_zero(<16 x float> %a0) {
; SSE2-LABEL: test_v16f32_zero:
; SSE2:       # %bb.0:
; SSE2-NEXT:    addps %xmm3, %xmm1
; SSE2-NEXT:    addps %xmm2, %xmm0
; SSE2-NEXT:    addps %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT:    addps %xmm0, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16f32_zero:
; SSE41:       # %bb.0:
; SSE41-NEXT:    addps %xmm3, %xmm1
; SSE41-NEXT:    addps %xmm2, %xmm0
; SSE41-NEXT:    addps %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm0, %xmm1
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT:    addps %xmm0, %xmm1
; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT:    addss %xmm0, %xmm1
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v16f32_zero:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v16f32_zero:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-FAST-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v16f32_zero:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v16f32_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vaddps %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.vector.reduce.fadd.f32.v16f32(float 0.0, <16 x float> %a0)
  ret float %1
}

;
; vXf32 (undef)
;

define float @test_v2f32_undef(<2 x float> %a0) {
; SSE2-LABEL: test_v2f32_undef:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT:    addss %xmm0, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v2f32_undef:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v2f32_undef:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v2f32_undef:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v2f32_undef:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v2f32_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.vector.reduce.fadd.f32.v2f32(float 0.0, <2 x float> %a0)
  ret float %1
}

define float @test_v4f32_undef(<4 x float> %a0) {
; SSE2-LABEL: test_v4f32_undef:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT:    addps %xmm0, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4f32_undef:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movaps %xmm0, %xmm1
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT:    addps %xmm0, %xmm1
; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT:    addss %xmm0, %xmm1
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v4f32_undef:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v4f32_undef:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v4f32_undef:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v4f32_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float 0.0, <4 x float> %a0)
  ret float %1
}

define float @test_v8f32_undef(<8 x float> %a0) {
; SSE2-LABEL: test_v8f32_undef:
; SSE2:       # %bb.0:
; SSE2-NEXT:    addps %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT:    addps %xmm0, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8f32_undef:
; SSE41:       # %bb.0:
; SSE41-NEXT:    addps %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm0, %xmm1
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT:    addps %xmm0, %xmm1
; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT:    addss %xmm0, %xmm1
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v8f32_undef:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v8f32_undef:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm1, %xmm0
; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v8f32_undef:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v8f32_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float 0.0, <8 x float> %a0)
  ret float %1
}

define float @test_v16f32_undef(<16 x float> %a0) {
; SSE2-LABEL: test_v16f32_undef:
; SSE2:       # %bb.0:
; SSE2-NEXT:    addps %xmm3, %xmm1
; SSE2-NEXT:    addps %xmm2, %xmm0
; SSE2-NEXT:    addps %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT:    addps %xmm0, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16f32_undef:
; SSE41:       # %bb.0:
; SSE41-NEXT:    addps %xmm3, %xmm1
; SSE41-NEXT:    addps %xmm2, %xmm0
; SSE41-NEXT:    addps %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm0, %xmm1
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT:    addps %xmm0, %xmm1
; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT:    addss %xmm0, %xmm1
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v16f32_undef:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v16f32_undef:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-FAST-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v16f32_undef:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v16f32_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vaddps %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.vector.reduce.fadd.f32.v16f32(float 0.0, <16 x float> %a0)
  ret float %1
}

;
; vXf64 (accum)
;

define double @test_v2f64(double %a0, <2 x double> %a1) {
; SSE-LABEL: test_v2f64:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd %xmm1, %xmm2
; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE-NEXT:    addsd %xmm1, %xmm2
; SSE-NEXT:    addsd %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v2f64:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX1-SLOW-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v2f64:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vhaddpd %xmm1, %xmm1, %xmm1
; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v2f64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX2-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v2f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.vector.reduce.fadd.f64.v2f64(double %a0, <2 x double> %a1)
  ret double %1
}

define double @test_v4f64(double %a0, <4 x double> %a1) {
; SSE-LABEL: test_v4f64:
; SSE:       # %bb.0:
; SSE-NEXT:    addpd %xmm2, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm2
; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE-NEXT:    addsd %xmm1, %xmm2
; SSE-NEXT:    addsd %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v4f64:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-SLOW-NEXT:    vaddpd %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX1-SLOW-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v4f64:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-FAST-NEXT:    vhaddpd %xmm1, %xmm2, %xmm1
; AVX1-FAST-NEXT:    vhaddpd %xmm1, %xmm1, %xmm1
; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v4f64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vaddpd %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX2-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v4f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT:    vaddpd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.vector.reduce.fadd.f64.v4f64(double %a0, <4 x double> %a1)
  ret double %1
}

define double @test_v8f64(double %a0, <8 x double> %a1) {
; SSE-LABEL: test_v8f64:
; SSE:       # %bb.0:
; SSE-NEXT:    addpd %xmm4, %xmm2
; SSE-NEXT:    addpd %xmm3, %xmm1
; SSE-NEXT:    addpd %xmm2, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm2
; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE-NEXT:    addsd %xmm1, %xmm2
; SSE-NEXT:    addsd %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v8f64:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-SLOW-NEXT:    vaddpd %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX1-SLOW-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v8f64:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-FAST-NEXT:    vaddpd %xmm2, %xmm1, %xmm1
; AVX1-FAST-NEXT:    vhaddpd %xmm1, %xmm1, %xmm1
; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v8f64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vaddpd %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX2-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v8f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf64x4 $1, %zmm1, %ymm2
; AVX512-NEXT:    vaddpd %zmm2, %zmm1, %zmm1
; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT:    vaddpd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.vector.reduce.fadd.f64.v8f64(double %a0, <8 x double> %a1)
  ret double %1
}

define double @test_v16f64(double %a0, <16 x double> %a1) {
; SSE-LABEL: test_v16f64:
; SSE:       # %bb.0:
; SSE-NEXT:    addpd %xmm6, %xmm2
; SSE-NEXT:    addpd %xmm7, %xmm3
; SSE-NEXT:    addpd %xmm5, %xmm1
; SSE-NEXT:    addpd %xmm3, %xmm1
; SSE-NEXT:    addpd {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT:    addpd %xmm2, %xmm4
; SSE-NEXT:    addpd %xmm1, %xmm4
; SSE-NEXT:    movapd %xmm4, %xmm1
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1]
; SSE-NEXT:    addsd %xmm4, %xmm1
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v16f64:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vaddpd %ymm4, %ymm2, %ymm2
; AVX1-SLOW-NEXT:    vaddpd %ymm3, %ymm1, %ymm1
; AVX1-SLOW-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-SLOW-NEXT:    vaddpd %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX1-SLOW-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v16f64:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vaddpd %ymm4, %ymm2, %ymm2
; AVX1-FAST-NEXT:    vaddpd %ymm3, %ymm1, %ymm1
; AVX1-FAST-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-FAST-NEXT:    vaddpd %xmm2, %xmm1, %xmm1
; AVX1-FAST-NEXT:    vhaddpd %xmm1, %xmm1, %xmm1
; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v16f64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vaddpd %ymm4, %ymm2, %ymm2
; AVX2-NEXT:    vaddpd %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vaddpd %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX2-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v16f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vaddpd %zmm2, %zmm1, %zmm1
; AVX512-NEXT:    vextractf64x4 $1, %zmm1, %ymm2
; AVX512-NEXT:    vaddpd %zmm2, %zmm1, %zmm1
; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT:    vaddpd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.vector.reduce.fadd.f64.v16f64(double %a0, <16 x double> %a1)
  ret double %1
}

;
; vXf64 (zero)
;

define double @test_v2f64_zero(<2 x double> %a0) {
; SSE-LABEL: test_v2f64_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd %xmm0, %xmm1
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    addsd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v2f64_zero:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v2f64_zero:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v2f64_zero:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v2f64_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.vector.reduce.fadd.f64.v2f64(double 0.0, <2 x double> %a0)
  ret double %1
}

define double @test_v4f64_zero(<4 x double> %a0) {
; SSE-LABEL: test_v4f64_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    addpd %xmm1, %xmm0
; SSE-NEXT:    movapd %xmm0, %xmm1
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    addsd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v4f64_zero:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v4f64_zero:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT:    vhaddpd %xmm0, %xmm1, %xmm0
; AVX1-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v4f64_zero:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v4f64_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.vector.reduce.fadd.f64.v4f64(double 0.0, <4 x double> %a0)
  ret double %1
}

define double @test_v8f64_zero(<8 x double> %a0) {
; SSE-LABEL: test_v8f64_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    addpd %xmm3, %xmm1
; SSE-NEXT:    addpd %xmm2, %xmm0
; SSE-NEXT:    addpd %xmm1, %xmm0
; SSE-NEXT:    movapd %xmm0, %xmm1
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    addsd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v8f64_zero:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v8f64_zero:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v8f64_zero:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v8f64_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.vector.reduce.fadd.f64.v8f64(double 0.0, <8 x double> %a0)
  ret double %1
}

define double @test_v16f64_zero(<16 x double> %a0) {
; SSE-LABEL: test_v16f64_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    addpd %xmm6, %xmm2
; SSE-NEXT:    addpd %xmm4, %xmm0
; SSE-NEXT:    addpd %xmm2, %xmm0
; SSE-NEXT:    addpd %xmm7, %xmm3
; SSE-NEXT:    addpd %xmm5, %xmm1
; SSE-NEXT:    addpd %xmm3, %xmm1
; SSE-NEXT:    addpd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v16f64_zero:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vaddpd %ymm3, %ymm1, %ymm1
; AVX1-SLOW-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; AVX1-SLOW-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v16f64_zero:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vaddpd %ymm3, %ymm1, %ymm1
; AVX1-FAST-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; AVX1-FAST-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v16f64_zero:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vaddpd %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v16f64_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.vector.reduce.fadd.f64.v16f64(double 0.0, <16 x double> %a0)
  ret double %1
}

;
; vXf64 (undef)
;

define double @test_v2f64_undef(<2 x double> %a0) {
; SSE-LABEL: test_v2f64_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd %xmm0, %xmm1
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    addsd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v2f64_undef:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v2f64_undef:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v2f64_undef:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v2f64_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.vector.reduce.fadd.f64.v2f64(double 0.0, <2 x double> %a0)
  ret double %1
}

define double @test_v4f64_undef(<4 x double> %a0) {
; SSE-LABEL: test_v4f64_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    addpd %xmm1, %xmm0
; SSE-NEXT:    movapd %xmm0, %xmm1
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    addsd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v4f64_undef:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v4f64_undef:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT:    vhaddpd %xmm0, %xmm1, %xmm0
; AVX1-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v4f64_undef:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v4f64_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.vector.reduce.fadd.f64.v4f64(double 0.0, <4 x double> %a0)
  ret double %1
}

define double @test_v8f64_undef(<8 x double> %a0) {
; SSE-LABEL: test_v8f64_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    addpd %xmm3, %xmm1
; SSE-NEXT:    addpd %xmm2, %xmm0
; SSE-NEXT:    addpd %xmm1, %xmm0
; SSE-NEXT:    movapd %xmm0, %xmm1
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    addsd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v8f64_undef:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v8f64_undef:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v8f64_undef:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v8f64_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.vector.reduce.fadd.f64.v8f64(double 0.0, <8 x double> %a0)
  ret double %1
}

define double @test_v16f64_undef(<16 x double> %a0) {
; SSE-LABEL: test_v16f64_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    addpd %xmm6, %xmm2
; SSE-NEXT:    addpd %xmm4, %xmm0
; SSE-NEXT:    addpd %xmm2, %xmm0
; SSE-NEXT:    addpd %xmm7, %xmm3
; SSE-NEXT:    addpd %xmm5, %xmm1
; SSE-NEXT:    addpd %xmm3, %xmm1
; SSE-NEXT:    addpd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v16f64_undef:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vaddpd %ymm3, %ymm1, %ymm1
; AVX1-SLOW-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; AVX1-SLOW-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v16f64_undef:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vaddpd %ymm3, %ymm1, %ymm1
; AVX1-FAST-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; AVX1-FAST-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v16f64_undef:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vaddpd %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v16f64_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.vector.reduce.fadd.f64.v16f64(double 0.0, <16 x double> %a0)
  ret double %1
}

declare float @llvm.vector.reduce.fadd.f32.v2f32(float, <2 x float>)
declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
declare float @llvm.vector.reduce.fadd.f32.v8f32(float, <8 x float>)
declare float @llvm.vector.reduce.fadd.f32.v16f32(float, <16 x float>)

declare double @llvm.vector.reduce.fadd.f64.v2f64(double, <2 x double>)
declare double @llvm.vector.reduce.fadd.f64.v4f64(double, <4 x double>)
declare double @llvm.vector.reduce.fadd.f64.v8f64(double, <8 x double>)
declare double @llvm.vector.reduce.fadd.f64.v16f64(double, <16 x double>)