; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=AVX
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=AVX
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=AVX512
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512
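; With the 'fast' flag, these reductions are expected to lower to a log2
; shuffle-and-multiply tree: split the vector into high/low halves
; (vextractf*/unpckhpd/shufps/movshdup), multiply the halves, and repeat until a
; single lane remains, with the scalar accumulator folded in by a trailing
; mulss/mulsd (elided when the accumulator is the identity). The checks below
; exercise that pattern for SSE2, SSE4.1, AVX/AVX2 and AVX512.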

;
; vXf32 (accum)
;

define float @test_v2f32(float %a0, <2 x float> %a1) {
; SSE2-LABEL: test_v2f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm1, %xmm2
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1]
; SSE2-NEXT:    mulss %xmm1, %xmm2
; SSE2-NEXT:    mulss %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v2f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE41-NEXT:    mulss %xmm1, %xmm2
; SSE41-NEXT:    mulss %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v2f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.vector.reduce.fmul.f32.v2f32(float %a0, <2 x float> %a1)
  ret float %1
}

define float @test_v4f32(float %a0, <4 x float> %a1) {
; SSE2-LABEL: test_v4f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm1, %xmm2
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT:    mulps %xmm1, %xmm2
; SSE2-NEXT:    movaps %xmm2, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1]
; SSE2-NEXT:    mulss %xmm2, %xmm1
; SSE2-NEXT:    mulss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movaps %xmm1, %xmm2
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE41-NEXT:    mulps %xmm1, %xmm2
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE41-NEXT:    mulss %xmm2, %xmm1
; SSE41-NEXT:    mulss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float %a0, <4 x float> %a1)
  ret float %1
}

define float @test_v8f32(float %a0, <8 x float> %a1) {
; SSE2-LABEL: test_v8f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    mulps %xmm2, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm2
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT:    mulps %xmm1, %xmm2
; SSE2-NEXT:    movaps %xmm2, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1]
; SSE2-NEXT:    mulss %xmm2, %xmm1
; SSE2-NEXT:    mulss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    mulps %xmm2, %xmm1
; SSE41-NEXT:    movaps %xmm1, %xmm2
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE41-NEXT:    mulps %xmm1, %xmm2
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE41-NEXT:    mulss %xmm2, %xmm1
; SSE41-NEXT:    mulss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v8f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float %a0, <8 x float> %a1)
  ret float %1
}

define float @test_v16f32(float %a0, <16 x float> %a1) {
; SSE2-LABEL: test_v16f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    mulps %xmm4, %xmm2
; SSE2-NEXT:    mulps %xmm3, %xmm1
; SSE2-NEXT:    mulps %xmm2, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm2
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT:    mulps %xmm1, %xmm2
; SSE2-NEXT:    movaps %xmm2, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1]
; SSE2-NEXT:    mulss %xmm2, %xmm1
; SSE2-NEXT:    mulss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    mulps %xmm4, %xmm2
; SSE41-NEXT:    mulps %xmm3, %xmm1
; SSE41-NEXT:    mulps %xmm2, %xmm1
; SSE41-NEXT:    movaps %xmm1, %xmm2
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE41-NEXT:    mulps %xmm1, %xmm2
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE41-NEXT:    mulss %xmm2, %xmm1
; SSE41-NEXT:    mulss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v16f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulps %ymm2, %ymm1, %ymm1
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v16f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf64x4 $1, %zmm1, %ymm2
; AVX512-NEXT:    vmulps %zmm2, %zmm1, %zmm1
; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.vector.reduce.fmul.f32.v16f32(float %a0, <16 x float> %a1)
  ret float %1
}

;
; vXf32 (one)
;

define float @test_v2f32_zero(<2 x float> %a0) {
; SSE2-LABEL: test_v2f32_zero:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT:    mulss %xmm0, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v2f32_zero:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT:    mulss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v2f32_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2f32_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.vector.reduce.fmul.f32.v2f32(float 1.0, <2 x float> %a0)
  ret float %1
}

define float @test_v4f32_zero(<4 x float> %a0) {
; SSE2-LABEL: test_v4f32_zero:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT:    mulps %xmm0, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
; SSE2-NEXT:    mulss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4f32_zero:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movaps %xmm0, %xmm1
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT:    mulps %xmm0, %xmm1
; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT:    mulss %xmm0, %xmm1
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v4f32_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4f32_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a0)
  ret float %1
}

define float @test_v8f32_zero(<8 x float> %a0) {
; SSE2-LABEL: test_v8f32_zero:
; SSE2:       # %bb.0:
; SSE2-NEXT:    mulps %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT:    mulps %xmm0, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
; SSE2-NEXT:    mulss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8f32_zero:
; SSE41:       # %bb.0:
; SSE41-NEXT:    mulps %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm0, %xmm1
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT:    mulps %xmm0, %xmm1
; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT:    mulss %xmm0, %xmm1
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v8f32_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8f32_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %a0)
  ret float %1
}

define float @test_v16f32_zero(<16 x float> %a0) {
; SSE2-LABEL: test_v16f32_zero:
; SSE2:       # %bb.0:
; SSE2-NEXT:    mulps %xmm3, %xmm1
; SSE2-NEXT:    mulps %xmm2, %xmm0
; SSE2-NEXT:    mulps %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT:    mulps %xmm0, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
; SSE2-NEXT:    mulss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16f32_zero:
; SSE41:       # %bb.0:
; SSE41-NEXT:    mulps %xmm3, %xmm1
; SSE41-NEXT:    mulps %xmm2, %xmm0
; SSE41-NEXT:    mulps %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm0, %xmm1
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT:    mulps %xmm0, %xmm1
; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT:    mulss %xmm0, %xmm1
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v16f32_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v16f32_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vmulps %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.vector.reduce.fmul.f32.v16f32(float 1.0, <16 x float> %a0)
  ret float %1
}

;
; vXf32 (undef)
;

define float @test_v2f32_undef(<2 x float> %a0) {
; SSE2-LABEL: test_v2f32_undef:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT:    mulss %xmm0, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v2f32_undef:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT:    mulss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v2f32_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2f32_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.vector.reduce.fmul.f32.v2f32(float undef, <2 x float> %a0)
  ret float %1
}

define float @test_v4f32_undef(<4 x float> %a0) {
; SSE2-LABEL: test_v4f32_undef:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT:    mulps %xmm0, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
; SSE2-NEXT:    mulss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4f32_undef:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movaps %xmm0, %xmm1
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT:    mulps %xmm0, %xmm1
; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT:    mulss %xmm0, %xmm1
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v4f32_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4f32_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %a0)
  ret float %1
}

define float @test_v8f32_undef(<8 x float> %a0) {
; SSE2-LABEL: test_v8f32_undef:
; SSE2:       # %bb.0:
; SSE2-NEXT:    mulps %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT:    mulps %xmm0, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
; SSE2-NEXT:    mulss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8f32_undef:
; SSE41:       # %bb.0:
; SSE41-NEXT:    mulps %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm0, %xmm1
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT:    mulps %xmm0, %xmm1
; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT:    mulss %xmm0, %xmm1
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v8f32_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8f32_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float undef, <8 x float> %a0)
  ret float %1
}

define float @test_v16f32_undef(<16 x float> %a0) {
; SSE2-LABEL: test_v16f32_undef:
; SSE2:       # %bb.0:
; SSE2-NEXT:    mulps %xmm3, %xmm1
; SSE2-NEXT:    mulps %xmm2, %xmm0
; SSE2-NEXT:    mulps %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT:    mulps %xmm0, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
; SSE2-NEXT:    mulss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16f32_undef:
; SSE41:       # %bb.0:
; SSE41-NEXT:    mulps %xmm3, %xmm1
; SSE41-NEXT:    mulps %xmm2, %xmm0
; SSE41-NEXT:    mulps %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm0, %xmm1
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT:    mulps %xmm0, %xmm1
; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT:    mulss %xmm0, %xmm1
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v16f32_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v16f32_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vmulps %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.vector.reduce.fmul.f32.v16f32(float undef, <16 x float> %a0)
  ret float %1
}

;
; vXf64 (accum)
;

define double @test_v2f64(double %a0, <2 x double> %a1) {
; SSE-LABEL: test_v2f64:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd %xmm1, %xmm2
; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE-NEXT:    mulsd %xmm1, %xmm2
; SSE-NEXT:    mulsd %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v2f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vmulsd %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vmulsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.vector.reduce.fmul.f64.v2f64(double %a0, <2 x double> %a1)
  ret double %1
}

define double @test_v4f64(double %a0, <4 x double> %a1) {
; SSE-LABEL: test_v4f64:
; SSE:       # %bb.0:
; SSE-NEXT:    mulpd %xmm2, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm2
; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE-NEXT:    mulsd %xmm1, %xmm2
; SSE-NEXT:    mulsd %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v4f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT:    vmulpd %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vmulsd %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT:    vmulpd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vmulsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.vector.reduce.fmul.f64.v4f64(double %a0, <4 x double> %a1)
  ret double %1
}

define double @test_v8f64(double %a0, <8 x double> %a1) {
; SSE-LABEL: test_v8f64:
; SSE:       # %bb.0:
; SSE-NEXT:    mulpd %xmm4, %xmm2
; SSE-NEXT:    mulpd %xmm3, %xmm1
; SSE-NEXT:    mulpd %xmm2, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm2
; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE-NEXT:    mulsd %xmm1, %xmm2
; SSE-NEXT:    mulsd %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v8f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulpd %ymm2, %ymm1, %ymm1
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT:    vmulpd %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vmulsd %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf64x4 $1, %zmm1, %ymm2
; AVX512-NEXT:    vmulpd %zmm2, %zmm1, %zmm1
; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT:    vmulpd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vmulsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.vector.reduce.fmul.f64.v8f64(double %a0, <8 x double> %a1)
  ret double %1
}

define double @test_v16f64(double %a0, <16 x double> %a1) {
; SSE-LABEL: test_v16f64:
; SSE:       # %bb.0:
; SSE-NEXT:    mulpd %xmm6, %xmm2
; SSE-NEXT:    mulpd %xmm7, %xmm3
; SSE-NEXT:    mulpd %xmm5, %xmm1
; SSE-NEXT:    mulpd %xmm3, %xmm1
; SSE-NEXT:    mulpd {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT:    mulpd %xmm2, %xmm4
; SSE-NEXT:    mulpd %xmm1, %xmm4
; SSE-NEXT:    movapd %xmm4, %xmm1
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1]
; SSE-NEXT:    mulsd %xmm4, %xmm1
; SSE-NEXT:    mulsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v16f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulpd %ymm4, %ymm2, %ymm2
; AVX-NEXT:    vmulpd %ymm3, %ymm1, %ymm1
; AVX-NEXT:    vmulpd %ymm2, %ymm1, %ymm1
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT:    vmulpd %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vmulsd %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v16f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmulpd %zmm2, %zmm1, %zmm1
; AVX512-NEXT:    vextractf64x4 $1, %zmm1, %ymm2
; AVX512-NEXT:    vmulpd %zmm2, %zmm1, %zmm1
; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT:    vmulpd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vmulsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.vector.reduce.fmul.f64.v16f64(double %a0, <16 x double> %a1)
  ret double %1
}

;
; vXf64 (one)
;

define double @test_v2f64_zero(<2 x double> %a0) {
; SSE-LABEL: test_v2f64_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd %xmm0, %xmm1
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    mulsd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v2f64_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2f64_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.vector.reduce.fmul.f64.v2f64(double 1.0, <2 x double> %a0)
  ret double %1
}

define double @test_v4f64_zero(<4 x double> %a0) {
; SSE-LABEL: test_v4f64_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    mulpd %xmm1, %xmm0
; SSE-NEXT:    movapd %xmm0, %xmm1
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    mulsd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v4f64_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4f64_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.vector.reduce.fmul.f64.v4f64(double 1.0, <4 x double> %a0)
  ret double %1
}

define double @test_v8f64_zero(<8 x double> %a0) {
; SSE-LABEL: test_v8f64_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    mulpd %xmm3, %xmm1
; SSE-NEXT:    mulpd %xmm2, %xmm0
; SSE-NEXT:    mulpd %xmm1, %xmm0
; SSE-NEXT:    movapd %xmm0, %xmm1
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    mulsd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v8f64_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8f64_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.vector.reduce.fmul.f64.v8f64(double 1.0, <8 x double> %a0)
  ret double %1
}

define double @test_v16f64_zero(<16 x double> %a0) {
; SSE-LABEL: test_v16f64_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    mulpd %xmm6, %xmm2
; SSE-NEXT:    mulpd %xmm4, %xmm0
; SSE-NEXT:    mulpd %xmm2, %xmm0
; SSE-NEXT:    mulpd %xmm7, %xmm3
; SSE-NEXT:    mulpd %xmm5, %xmm1
; SSE-NEXT:    mulpd %xmm3, %xmm1
; SSE-NEXT:    mulpd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE-NEXT:    mulsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v16f64_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulpd %ymm3, %ymm1, %ymm1
; AVX-NEXT:    vmulpd %ymm2, %ymm0, %ymm0
; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v16f64_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.vector.reduce.fmul.f64.v16f64(double 1.0, <16 x double> %a0)
  ret double %1
}

;
; vXf64 (undef)
;

define double @test_v2f64_undef(<2 x double> %a0) {
; SSE-LABEL: test_v2f64_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd %xmm0, %xmm1
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    mulsd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v2f64_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2f64_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.vector.reduce.fmul.f64.v2f64(double undef, <2 x double> %a0)
  ret double %1
}

define double @test_v4f64_undef(<4 x double> %a0) {
; SSE-LABEL: test_v4f64_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    mulpd %xmm1, %xmm0
; SSE-NEXT:    movapd %xmm0, %xmm1
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    mulsd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v4f64_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4f64_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.vector.reduce.fmul.f64.v4f64(double undef, <4 x double> %a0)
  ret double %1
}

define double @test_v8f64_undef(<8 x double> %a0) {
; SSE-LABEL: test_v8f64_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    mulpd %xmm3, %xmm1
; SSE-NEXT:    mulpd %xmm2, %xmm0
; SSE-NEXT:    mulpd %xmm1, %xmm0
; SSE-NEXT:    movapd %xmm0, %xmm1
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    mulsd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v8f64_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8f64_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.vector.reduce.fmul.f64.v8f64(double undef, <8 x double> %a0)
  ret double %1
}

define double @test_v16f64_undef(<16 x double> %a0) {
; SSE-LABEL: test_v16f64_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    mulpd %xmm6, %xmm2
; SSE-NEXT:    mulpd %xmm4, %xmm0
; SSE-NEXT:    mulpd %xmm2, %xmm0
; SSE-NEXT:    mulpd %xmm7, %xmm3
; SSE-NEXT:    mulpd %xmm5, %xmm1
; SSE-NEXT:    mulpd %xmm3, %xmm1
; SSE-NEXT:    mulpd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE-NEXT:    mulsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v16f64_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulpd %ymm3, %ymm1, %ymm1
; AVX-NEXT:    vmulpd %ymm2, %ymm0, %ymm0
; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v16f64_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.vector.reduce.fmul.f64.v16f64(double undef, <16 x double> %a0)
  ret double %1
}

declare float @llvm.vector.reduce.fmul.f32.v2f32(float, <2 x float>)
declare float @llvm.vector.reduce.fmul.f32.v4f32(float, <4 x float>)
declare float @llvm.vector.reduce.fmul.f32.v8f32(float, <8 x float>)
declare float @llvm.vector.reduce.fmul.f32.v16f32(float, <16 x float>)

declare double @llvm.vector.reduce.fmul.f64.v2f64(double, <2 x double>)
declare double @llvm.vector.reduce.fmul.f64.v4f64(double, <4 x double>)
declare double @llvm.vector.reduce.fmul.f64.v8f64(double, <8 x double>)
declare double @llvm.vector.reduce.fmul.f64.v16f64(double, <16 x double>)