; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512

;
; vXf32 (accum)
;

define float @test_v2f32(float %a0, <2 x float> %a1) {
; SSE2-LABEL: test_v2f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    mulss %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
; SSE2-NEXT:    mulss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v2f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    mulss %xmm1, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE41-NEXT:    mulss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v2f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
; No fast-math flags on the call, so the reduction is in-order:
; %a0 * %a1[0] * %a1[1] — hence the serialized scalar-multiply chain above.
  %1 = call float @llvm.vector.reduce.fmul.f32.v2f32(float %a0, <2 x float> %a1)
  ret float %1
}

define float @test_v4f32(float %a0, <4 x float> %a1) {
; SSE2-LABEL: test_v4f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    mulss %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm1, %xmm2
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1]
; SSE2-NEXT:    mulss %xmm2, %xmm0
; SSE2-NEXT:    movaps %xmm1, %xmm2
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT:    mulss %xmm2, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE2-NEXT:    mulss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    mulss %xmm1, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE41-NEXT:    mulss %xmm2, %xmm0
; SSE41-NEXT:    movaps %xmm1, %xmm2
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE41-NEXT:    mulss %xmm2, %xmm0
; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE41-NEXT:    mulss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT:    vmulss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vmulss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT:    vmulss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vmulss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
; In-order (no fast-math) reduction: %a0 multiplied by each of the 4 lanes in
; sequence — 4 scalar multiplies, one per lane, as checked above.
  %1 = call float @llvm.vector.reduce.fmul.f32.v4f32(float %a0, <4 x float> %a1)
  ret float %1
}

define float @test_v8f32(float %a0, <8 x float> %a1) {
; SSE2-LABEL: test_v8f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    mulss %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm1, %xmm3
; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[1,1]
; SSE2-NEXT:    mulss %xmm3, %xmm0
; SSE2-NEXT:    movaps %xmm1, %xmm3
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
; SSE2-NEXT:    mulss %xmm3, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE2-NEXT:    mulss %xmm1, %xmm0
; SSE2-NEXT:    mulss %xmm2, %xmm0
; SSE2-NEXT:    movaps %xmm2, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1]
; SSE2-NEXT:    mulss %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm2, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; SSE2-NEXT:    mulss %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; SSE2-NEXT:    mulss %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    mulss %xmm1, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSE41-NEXT:    mulss %xmm3, %xmm0
; SSE41-NEXT:    movaps %xmm1, %xmm3
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
; SSE41-NEXT:    mulss %xmm3, %xmm0
; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE41-NEXT:    mulss %xmm1, %xmm0
; SSE41-NEXT:    mulss %xmm2, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE41-NEXT:    mulss %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm2, %xmm1
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; SSE41-NEXT:    mulss %xmm1, %xmm0
; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; SSE41-NEXT:    mulss %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v8f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT:    vmulss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vmulss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3]
; AVX-NEXT:    vmulss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT:    vmulss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vmulss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT:    vmulss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vmulss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3]
; AVX512-NEXT:    vmulss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT:    vmulss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vmulss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
; In-order (no fast-math) reduction over 8 lanes: on SSE the v8f32 arrives as
; two xmm halves (%xmm1/%xmm2); on AVX the upper half is taken via vextractf128.
  %1 = call float @llvm.vector.reduce.fmul.f32.v8f32(float %a0, <8 x float> %a1)
  ret float %1
}

define float @test_v16f32(float %a0, <16 x float> %a1) {
; SSE2-LABEL: test_v16f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    mulss %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm1, %xmm5
; SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1]
; SSE2-NEXT:    mulss %xmm5, %xmm0
; SSE2-NEXT:    movaps %xmm1, %xmm5
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
; SSE2-NEXT:    mulss %xmm5, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE2-NEXT:    mulss %xmm1, %xmm0
; SSE2-NEXT:    mulss %xmm2, %xmm0
; SSE2-NEXT:    movaps %xmm2, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1]
; SSE2-NEXT:    mulss %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm2, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; SSE2-NEXT:    mulss %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; SSE2-NEXT:    mulss %xmm2, %xmm0
; SSE2-NEXT:    mulss %xmm3, %xmm0
; SSE2-NEXT:    movaps %xmm3, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[1,1]
; SSE2-NEXT:    mulss %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm3, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
; SSE2-NEXT:    mulss %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
; SSE2-NEXT:    mulss %xmm3, %xmm0
; SSE2-NEXT:    mulss %xmm4, %xmm0
; SSE2-NEXT:    movaps %xmm4, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm4[1,1]
; SSE2-NEXT:    mulss %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm4, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1]
; SSE2-NEXT:    mulss %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,3,3,3]
; SSE2-NEXT:    mulss %xmm4, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    mulss %xmm1, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm5 = xmm1[1,1,3,3]
; SSE41-NEXT:    mulss %xmm5, %xmm0
; SSE41-NEXT:    movaps %xmm1, %xmm5
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
; SSE41-NEXT:    mulss %xmm5, %xmm0
; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE41-NEXT:    mulss %xmm1, %xmm0
; SSE41-NEXT:    mulss %xmm2, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE41-NEXT:    mulss %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm2, %xmm1
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; SSE41-NEXT:    mulss %xmm1, %xmm0
; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; SSE41-NEXT:    mulss %xmm2, %xmm0
; SSE41-NEXT:    mulss %xmm3, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
; SSE41-NEXT:    mulss %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm3, %xmm1
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
; SSE41-NEXT:    mulss %xmm1, %xmm0
; SSE41-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
; SSE41-NEXT:    mulss %xmm3, %xmm0
; SSE41-NEXT:    mulss %xmm4, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm4[1,1,3,3]
; SSE41-NEXT:    mulss %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm4, %xmm1
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1]
; SSE41-NEXT:    mulss %xmm1, %xmm0
; SSE41-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,3,3,3]
; SSE41-NEXT:    mulss %xmm4, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v16f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; AVX-NEXT:    vmulss %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
; AVX-NEXT:    vmulss %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vpermilps {{.*#+}} xmm3 = xmm1[3,3,3,3]
; AVX-NEXT:    vmulss %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; AVX-NEXT:    vmulss %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
; AVX-NEXT:    vmulss %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmulss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm2[3,3,3,3]
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm1
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT:    vmulss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vmulss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v16f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT:    vmulss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vmulss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3]
; AVX512-NEXT:    vmulss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT:    vmulss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
; AVX512-NEXT:    vmulss %xmm3, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
; AVX512-NEXT:    vmulss %xmm3, %xmm0, %xmm0
; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX512-NEXT:    vmulss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf32x4 $2, %zmm1, %xmm2
; AVX512-NEXT:    vmulss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
; AVX512-NEXT:    vmulss %xmm3, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
; AVX512-NEXT:    vmulss %xmm3, %xmm0, %xmm0
; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX512-NEXT:    vmulss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf32x4 $3, %zmm1, %xmm1
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT:    vmulss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vmulss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
; In-order (no fast-math) reduction over 16 lanes: SSE sees four xmm halves,
; AVX two ymm halves, AVX512 one zmm split via vextractf32x4 $2/$3.
  %1 = call float @llvm.vector.reduce.fmul.f32.v16f32(float %a0, <16 x float> %a1)
  ret float %1
}

;
; vXf32 (one)
;

define float @test_v2f32_one(<2 x float> %a0) {
; SSE2-LABEL: test_v2f32_one:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT:    mulss %xmm0, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v2f32_one:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT:    mulss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v2f32_one:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2f32_one:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
; Start value 1.0: the leading multiply folds away, leaving a single scalar
; multiply of the two lanes in the checked asm.
  %1 = call float @llvm.vector.reduce.fmul.f32.v2f32(float 1.0, <2 x float> %a0)
  ret float %1
}

define float @test_v4f32_one(<4 x float> %a0) {
; SSE2-LABEL: test_v4f32_one:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT:    mulss %xmm0, %xmm1
; SSE2-NEXT:    movaps %xmm0, %xmm2
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE2-NEXT:    mulss %xmm1, %xmm2
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    mulss %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4f32_one:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT:    mulss %xmm0, %xmm1
; SSE41-NEXT:    movaps %xmm0, %xmm2
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE41-NEXT:    mulss %xmm1, %xmm2
; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE41-NEXT:    mulss %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v4f32_one:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4f32_one:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    retq
; Start value 1.0: the leading multiply folds away, so only 3 scalar
; multiplies remain for the 4 lanes.
  %1 = call float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a0)
  ret float %1
}

define float @test_v8f32_one(<8 x float> %a0) {
; SSE2-LABEL: test_v8f32_one:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm2
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1]
; SSE2-NEXT:    mulss %xmm0, %xmm2
; SSE2-NEXT:    movaps %xmm0, %xmm3
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1]
; SSE2-NEXT:    mulss %xmm2, %xmm3
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    mulss %xmm3, %xmm0
; SSE2-NEXT:    mulss %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm1, %xmm2
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1]
; SSE2-NEXT:    mulss %xmm2, %xmm0
; SSE2-NEXT:    movaps %xmm1, %xmm2
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT:    mulss %xmm2, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE2-NEXT:    mulss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8f32_one:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE41-NEXT:    mulss %xmm0, %xmm2
; SSE41-NEXT:    movaps %xmm0, %xmm3
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1]
; SSE41-NEXT:    mulss %xmm2, %xmm3
; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE41-NEXT:    mulss %xmm3, %xmm0
; SSE41-NEXT:    mulss %xmm1, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE41-NEXT:    mulss %xmm2, %xmm0
; SSE41-NEXT:    movaps %xmm1, %xmm2
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE41-NEXT:    mulss %xmm2, %xmm0
; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE41-NEXT:    mulss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v8f32_one:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3]
; AVX-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT:    vmulss %xmm0, %xmm1, %xmm1
; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8f32_one:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3]
; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT:    vmulss %xmm0, %xmm1, %xmm1
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
; Start value 1.0 folds away: 7 in-order scalar multiplies for the 8 lanes.
  %1 = call float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %a0)
  ret float %1
}

define float @test_v16f32_one(<16 x float> %a0) {
; SSE2-LABEL: test_v16f32_one:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm4
; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm0[1,1]
; SSE2-NEXT:    mulss %xmm0, %xmm4
; SSE2-NEXT:    movaps %xmm0, %xmm5
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
; SSE2-NEXT:    mulss %xmm4, %xmm5
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    mulss %xmm5, %xmm0
; SSE2-NEXT:    mulss %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm1, %xmm4
; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[1,1]
; SSE2-NEXT:    mulss %xmm4, %xmm0
; SSE2-NEXT:    movaps %xmm1, %xmm4
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1]
; SSE2-NEXT:    mulss %xmm4, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE2-NEXT:    mulss %xmm1, %xmm0
; SSE2-NEXT:    mulss %xmm2, %xmm0
; SSE2-NEXT:    movaps %xmm2, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1]
; SSE2-NEXT:    mulss %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm2, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; SSE2-NEXT:    mulss %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; SSE2-NEXT:    mulss %xmm2, %xmm0
; SSE2-NEXT:    mulss %xmm3, %xmm0
; SSE2-NEXT:    movaps %xmm3, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[1,1]
; SSE2-NEXT:    mulss %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm3, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
; SSE2-NEXT:    mulss %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
; SSE2-NEXT:    mulss %xmm3, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16f32_one:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
; SSE41-NEXT:    mulss %xmm0, %xmm4
; SSE41-NEXT:    movaps %xmm0, %xmm5
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
; SSE41-NEXT:    mulss %xmm4, %xmm5
; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE41-NEXT:    mulss %xmm5, %xmm0
; SSE41-NEXT:    mulss %xmm1, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSE41-NEXT:    mulss %xmm4, %xmm0
; SSE41-NEXT:    movaps %xmm1, %xmm4
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1]
; SSE41-NEXT:    mulss %xmm4, %xmm0
; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE41-NEXT:    mulss %xmm1, %xmm0
; SSE41-NEXT:    mulss %xmm2, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE41-NEXT:    mulss %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm2, %xmm1
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; SSE41-NEXT:    mulss %xmm1, %xmm0
; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; SSE41-NEXT:    mulss %xmm2, %xmm0
; SSE41-NEXT:    mulss %xmm3, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
; SSE41-NEXT:    mulss %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm3, %xmm1
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
; SSE41-NEXT:    mulss %xmm1, %xmm0
; SSE41-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
; SSE41-NEXT:    mulss %xmm3, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v16f32_one:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX-NEXT:    vmulss %xmm2, %xmm0, %xmm2
; AVX-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
; AVX-NEXT:    vmulss %xmm3, %xmm2, %xmm2
; AVX-NEXT:    vpermilps {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX-NEXT:    vmulss %xmm3, %xmm2, %xmm2
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT:    vmulss %xmm0, %xmm2, %xmm2
; AVX-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX-NEXT:    vmulss %xmm3, %xmm2, %xmm2
; AVX-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
; AVX-NEXT:    vmulss %xmm3, %xmm2, %xmm2
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-NEXT:    vmulss %xmm0, %xmm2, %xmm0
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT:    vmulss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vmulss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3]
; AVX-NEXT:    vmulss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT:    vmulss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vmulss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v16f32_one:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3]
; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
; AVX512-NEXT:    vmulss %xmm3, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
; AVX512-NEXT:    vmulss %xmm3, %xmm1, %xmm1
; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm2
; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
; AVX512-NEXT:    vmulss %xmm3, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
; AVX512-NEXT:    vmulss %xmm3, %xmm1, %xmm1
; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT:    vmulss %xmm0, %xmm1, %xmm1
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
; Start value 1.0 folds away: 15 in-order scalar multiplies for the 16 lanes.
  %1 = call float @llvm.vector.reduce.fmul.f32.v16f32(float 1.0, <16 x float> %a0)
  ret float %1
}

;
; vXf32 (undef)
;

define float @test_v2f32_undef(<2 x float> %a0) {
; SSE2-LABEL: test_v2f32_undef:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT:    mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v2f32_undef:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE41-NEXT:    mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v2f32_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2f32_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    retq
; Undef start value: the checks show the first multiply taken against a
; constant-pool scalar (LCPI) rather than an incoming register.
  %1 = call float @llvm.vector.reduce.fmul.f32.v2f32(float undef, <2 x float> %a0)
  ret float %1
}

define float @test_v4f32_undef(<4 x float> %a0) {
; SSE2-LABEL: test_v4f32_undef:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT:    mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    movaps %xmm0, %xmm2
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE2-NEXT:    mulss %xmm1, %xmm2
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    mulss %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4f32_undef:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT:    mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    movaps %xmm0, %xmm2
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE41-NEXT:    mulss %xmm1, %xmm2
; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE41-NEXT:    mulss %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v4f32_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4f32_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    retq
; Undef start value: first multiply is checked against a constant-pool scalar
; (LCPI); the remaining lanes are reduced in order as usual.
  %1 = call float @llvm.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %a0)
  ret float %1
}

define float @test_v8f32_undef(<8 x float> %a0) {
; SSE2-LABEL: test_v8f32_undef:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm2
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1]
; SSE2-NEXT:    mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE2-NEXT:    movaps %xmm0, %xmm3
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1]
; SSE2-NEXT:    mulss %xmm2, %xmm3
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    mulss %xmm3, %xmm0
; SSE2-NEXT:    mulss %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm1, %xmm2
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1]
; SSE2-NEXT:    mulss %xmm2, %xmm0
; SSE2-NEXT:    movaps %xmm1, %xmm2
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT:    mulss %xmm2, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE2-NEXT:    mulss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8f32_undef:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE41-NEXT:    mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE41-NEXT:    movaps %xmm0, %xmm3
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1]
; SSE41-NEXT:    mulss %xmm2, %xmm3
; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE41-NEXT:    mulss %xmm3, %xmm0
; SSE41-NEXT:    mulss %xmm1, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE41-NEXT:    mulss %xmm2, %xmm0
; SSE41-NEXT:    movaps %xmm1, %xmm2
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE41-NEXT:    mulss %xmm2, %xmm0
; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE41-NEXT:    mulss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v8f32_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3]
; AVX-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT:    vmulss %xmm0, %xmm1, %xmm1
; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8f32_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3]
; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT:    vmulss %xmm0, %xmm1, %xmm1
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
; Undef start value: first multiply is checked against a constant-pool scalar
; (LCPI); the remaining 8 lanes are reduced in order.
  %1 = call float @llvm.vector.reduce.fmul.f32.v8f32(float undef, <8 x float> %a0)
  ret float %1
}

define float @test_v16f32_undef(<16 x float> %a0) {
; Ordered fmul reduction of <16 x float> with an undef start value: 16
; sequential scalar multiplies, the first folded into a constant-pool load.
; SSE2-LABEL: test_v16f32_undef:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm4
; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm0[1,1]
; SSE2-NEXT:    mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
; SSE2-NEXT:    movaps %xmm0, %xmm5
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
; SSE2-NEXT:    mulss %xmm4, %xmm5
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    mulss %xmm5, %xmm0
; SSE2-NEXT:    mulss %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm1, %xmm4
; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[1,1]
; SSE2-NEXT:    mulss %xmm4, %xmm0
; SSE2-NEXT:    movaps %xmm1, %xmm4
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1]
; SSE2-NEXT:    mulss %xmm4, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE2-NEXT:    mulss %xmm1, %xmm0
; SSE2-NEXT:    mulss %xmm2, %xmm0
; SSE2-NEXT:    movaps %xmm2, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1]
; SSE2-NEXT:    mulss %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm2, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; SSE2-NEXT:    mulss %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; SSE2-NEXT:    mulss %xmm2, %xmm0
; SSE2-NEXT:    mulss %xmm3, %xmm0
; SSE2-NEXT:    movaps %xmm3, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[1,1]
; SSE2-NEXT:    mulss %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm3, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
; SSE2-NEXT:    mulss %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
; SSE2-NEXT:    mulss %xmm3, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16f32_undef:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
; SSE41-NEXT:    mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
; SSE41-NEXT:    movaps %xmm0, %xmm5
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
; SSE41-NEXT:    mulss %xmm4, %xmm5
; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE41-NEXT:    mulss %xmm5, %xmm0
; SSE41-NEXT:    mulss %xmm1, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSE41-NEXT:    mulss %xmm4, %xmm0
; SSE41-NEXT:    movaps %xmm1, %xmm4
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1]
; SSE41-NEXT:    mulss %xmm4, %xmm0
; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE41-NEXT:    mulss %xmm1, %xmm0
; SSE41-NEXT:    mulss %xmm2, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE41-NEXT:    mulss %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm2, %xmm1
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; SSE41-NEXT:    mulss %xmm1, %xmm0
; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; SSE41-NEXT:    mulss %xmm2, %xmm0
; SSE41-NEXT:    mulss %xmm3, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
; SSE41-NEXT:    mulss %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm3, %xmm1
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
; SSE41-NEXT:    mulss %xmm1, %xmm0
; SSE41-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
; SSE41-NEXT:    mulss %xmm3, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v16f32_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
; AVX-NEXT:    vmulss %xmm3, %xmm2, %xmm2
; AVX-NEXT:    vpermilps {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX-NEXT:    vmulss %xmm3, %xmm2, %xmm2
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT:    vmulss %xmm0, %xmm2, %xmm2
; AVX-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX-NEXT:    vmulss %xmm3, %xmm2, %xmm2
; AVX-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
; AVX-NEXT:    vmulss %xmm3, %xmm2, %xmm2
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-NEXT:    vmulss %xmm0, %xmm2, %xmm0
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT:    vmulss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vmulss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3]
; AVX-NEXT:    vmulss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT:    vmulss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vmulss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v16f32_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3]
; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
; AVX512-NEXT:    vmulss %xmm3, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
; AVX512-NEXT:    vmulss %xmm3, %xmm1, %xmm1
; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm2
; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
; AVX512-NEXT:    vmulss %xmm3, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
; AVX512-NEXT:    vmulss %xmm3, %xmm1, %xmm1
; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT:    vmulss %xmm0, %xmm1, %xmm1
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call float @llvm.vector.reduce.fmul.f32.v16f32(float undef, <16 x float> %a0)
  ret float %1
}
950
951;
952; vXf64 (accum)
953;
954
define double @test_v2f64(double %a0, <2 x double> %a1) {
; Ordered fmul reduction of <2 x double> seeded from the scalar accumulator %a0.
; SSE-LABEL: test_v2f64:
; SSE:       # %bb.0:
; SSE-NEXT:    mulsd %xmm1, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    mulsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v2f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call double @llvm.vector.reduce.fmul.f64.v2f64(double %a0, <2 x double> %a1)
  ret double %1
}
979
define double @test_v4f64(double %a0, <4 x double> %a1) {
; Ordered fmul reduction of <4 x double> seeded from the scalar accumulator %a0.
; SSE-LABEL: test_v4f64:
; SSE:       # %bb.0:
; SSE-NEXT:    mulsd %xmm1, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    mulsd %xmm1, %xmm0
; SSE-NEXT:    mulsd %xmm2, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT:    mulsd %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v4f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call double @llvm.vector.reduce.fmul.f64.v4f64(double %a0, <4 x double> %a1)
  ret double %1
}
1017
define double @test_v8f64(double %a0, <8 x double> %a1) {
; Ordered fmul reduction of <8 x double> seeded from the scalar accumulator %a0.
; SSE-LABEL: test_v8f64:
; SSE:       # %bb.0:
; SSE-NEXT:    mulsd %xmm1, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    mulsd %xmm1, %xmm0
; SSE-NEXT:    mulsd %xmm2, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT:    mulsd %xmm2, %xmm0
; SSE-NEXT:    mulsd %xmm3, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
; SSE-NEXT:    mulsd %xmm3, %xmm0
; SSE-NEXT:    mulsd %xmm4, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1,1]
; SSE-NEXT:    mulsd %xmm4, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v8f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
; AVX-NEXT:    vmulsd %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm1
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf32x4 $2, %zmm1, %xmm2
; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf32x4 $3, %zmm1, %xmm1
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call double @llvm.vector.reduce.fmul.f64.v8f64(double %a0, <8 x double> %a1)
  ret double %1
}
1076
define double @test_v16f64(double %a0, <16 x double> %a1) {
; Ordered fmul reduction of <16 x double> seeded from %a0; with SSE the final
; 128-bit chunk of %a1 arrives on the stack rather than in a register.
; SSE2-LABEL: test_v16f64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm8
; SSE2-NEXT:    mulsd %xmm1, %xmm0
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE2-NEXT:    mulsd %xmm1, %xmm0
; SSE2-NEXT:    mulsd %xmm2, %xmm0
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
; SSE2-NEXT:    mulsd %xmm2, %xmm0
; SSE2-NEXT:    mulsd %xmm3, %xmm0
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
; SSE2-NEXT:    mulsd %xmm3, %xmm0
; SSE2-NEXT:    mulsd %xmm4, %xmm0
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1,1]
; SSE2-NEXT:    mulsd %xmm4, %xmm0
; SSE2-NEXT:    mulsd %xmm5, %xmm0
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1,1]
; SSE2-NEXT:    mulsd %xmm5, %xmm0
; SSE2-NEXT:    mulsd %xmm6, %xmm0
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm6 = xmm6[1,1]
; SSE2-NEXT:    mulsd %xmm6, %xmm0
; SSE2-NEXT:    mulsd %xmm7, %xmm0
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm7 = xmm7[1,1]
; SSE2-NEXT:    mulsd %xmm7, %xmm0
; SSE2-NEXT:    mulsd %xmm8, %xmm0
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm8 = xmm8[1,1]
; SSE2-NEXT:    mulsd %xmm8, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16f64:
; SSE41:       # %bb.0:
; SSE41-NEXT:    mulsd %xmm1, %xmm0
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE41-NEXT:    mulsd %xmm1, %xmm0
; SSE41-NEXT:    mulsd %xmm2, %xmm0
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
; SSE41-NEXT:    mulsd %xmm2, %xmm0
; SSE41-NEXT:    mulsd %xmm3, %xmm0
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
; SSE41-NEXT:    mulsd %xmm3, %xmm0
; SSE41-NEXT:    mulsd %xmm4, %xmm0
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1,1]
; SSE41-NEXT:    mulsd %xmm4, %xmm0
; SSE41-NEXT:    mulsd %xmm5, %xmm0
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1,1]
; SSE41-NEXT:    mulsd %xmm5, %xmm0
; SSE41-NEXT:    mulsd %xmm6, %xmm0
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm6 = xmm6[1,1]
; SSE41-NEXT:    mulsd %xmm6, %xmm0
; SSE41-NEXT:    mulsd %xmm7, %xmm0
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm7 = xmm7[1,1]
; SSE41-NEXT:    mulsd %xmm7, %xmm0
; SSE41-NEXT:    mulsd {{[0-9]+}}(%rsp), %xmm0
; SSE41-NEXT:    mulsd {{[0-9]+}}(%rsp), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v16f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm5 = xmm1[1,0]
; AVX-NEXT:    vmulsd %xmm5, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm1
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmulsd %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm3[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm3, %xmm1
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmulsd %xmm4, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm4[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm4, %xmm1
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v16f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
; AVX512-NEXT:    vmulsd %xmm3, %xmm0, %xmm0
; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX512-NEXT:    vmulsd %xmm3, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
; AVX512-NEXT:    vmulsd %xmm3, %xmm0, %xmm0
; AVX512-NEXT:    vextractf32x4 $2, %zmm1, %xmm3
; AVX512-NEXT:    vmulsd %xmm3, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
; AVX512-NEXT:    vmulsd %xmm3, %xmm0, %xmm0
; AVX512-NEXT:    vextractf32x4 $3, %zmm1, %xmm1
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vextractf128 $1, %ymm2, %xmm1
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vextractf32x4 $2, %zmm2, %xmm1
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vextractf32x4 $3, %zmm2, %xmm1
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call double @llvm.vector.reduce.fmul.f64.v16f64(double %a0, <16 x double> %a1)
  ret double %1
}
1204
1205;
1206; vXf64 (one)
1207;
1208
define double @test_v2f64_one(<2 x double> %a0) {
; Ordered fmul reduction of <2 x double> with a 1.0 start value; the identity
; multiply is folded away, leaving a single element-by-element multiply.
; SSE-LABEL: test_v2f64_one:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd %xmm0, %xmm1
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    mulsd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v2f64_one:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2f64_one:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call double @llvm.vector.reduce.fmul.f64.v2f64(double 1.0, <2 x double> %a0)
  ret double %1
}
1232
define double @test_v4f64_one(<4 x double> %a0) {
; Ordered fmul reduction of <4 x double> with a 1.0 start value (folded away).
; SSE-LABEL: test_v4f64_one:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd %xmm0, %xmm2
; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT:    mulsd %xmm0, %xmm2
; SSE-NEXT:    mulsd %xmm1, %xmm2
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    mulsd %xmm1, %xmm2
; SSE-NEXT:    movapd %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v4f64_one:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT:    vmulsd %xmm0, %xmm1, %xmm1
; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT:    vmulsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4f64_one:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT:    vmulsd %xmm0, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT:    vmulsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call double @llvm.vector.reduce.fmul.f64.v4f64(double 1.0, <4 x double> %a0)
  ret double %1
}
1269
define double @test_v8f64_one(<8 x double> %a0) {
; Ordered fmul reduction of <8 x double> with a 1.0 start value (folded away).
; SSE-LABEL: test_v8f64_one:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd %xmm0, %xmm4
; SSE-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
; SSE-NEXT:    mulsd %xmm0, %xmm4
; SSE-NEXT:    mulsd %xmm1, %xmm4
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    mulsd %xmm1, %xmm4
; SSE-NEXT:    mulsd %xmm2, %xmm4
; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT:    mulsd %xmm2, %xmm4
; SSE-NEXT:    mulsd %xmm3, %xmm4
; SSE-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
; SSE-NEXT:    mulsd %xmm3, %xmm4
; SSE-NEXT:    movapd %xmm4, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v8f64_one:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT:    vmulsd %xmm2, %xmm0, %xmm2
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT:    vmulsd %xmm0, %xmm2, %xmm2
; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT:    vmulsd %xmm0, %xmm2, %xmm0
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8f64_one:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX512-NEXT:    vmulsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT:    vmulsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm2
; AVX512-NEXT:    vmulsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT:    vmulsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT:    vmulsd %xmm0, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT:    vmulsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call double @llvm.vector.reduce.fmul.f64.v8f64(double 1.0, <8 x double> %a0)
  ret double %1
}
1327
define double @test_v16f64_one(<16 x double> %a0) {
; Ordered fmul reduction of <16 x double> with a 1.0 start value (folded away).
; SSE-LABEL: test_v16f64_one:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd %xmm0, %xmm8
; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT:    mulsd %xmm8, %xmm0
; SSE-NEXT:    mulsd %xmm1, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    mulsd %xmm1, %xmm0
; SSE-NEXT:    mulsd %xmm2, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT:    mulsd %xmm2, %xmm0
; SSE-NEXT:    mulsd %xmm3, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
; SSE-NEXT:    mulsd %xmm3, %xmm0
; SSE-NEXT:    mulsd %xmm4, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1,1]
; SSE-NEXT:    mulsd %xmm4, %xmm0
; SSE-NEXT:    mulsd %xmm5, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1,1]
; SSE-NEXT:    mulsd %xmm5, %xmm0
; SSE-NEXT:    mulsd %xmm6, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm6 = xmm6[1,1]
; SSE-NEXT:    mulsd %xmm6, %xmm0
; SSE-NEXT:    mulsd %xmm7, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm7 = xmm7[1,1]
; SSE-NEXT:    mulsd %xmm7, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v16f64_one:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm0[1,0]
; AVX-NEXT:    vmulsd %xmm4, %xmm0, %xmm4
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT:    vmulsd %xmm0, %xmm4, %xmm4
; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT:    vmulsd %xmm0, %xmm4, %xmm0
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm1[1,0]
; AVX-NEXT:    vmulsd %xmm4, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm1
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmulsd %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm3[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm3, %xmm1
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v16f64_one:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm2
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX512-NEXT:    vmulsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
; AVX512-NEXT:    vmulsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm3
; AVX512-NEXT:    vmulsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
; AVX512-NEXT:    vmulsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT:    vmulsd %xmm0, %xmm2, %xmm2
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT:    vmulsd %xmm0, %xmm2, %xmm0
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf32x4 $2, %zmm1, %xmm2
; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf32x4 $3, %zmm1, %xmm1
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call double @llvm.vector.reduce.fmul.f64.v16f64(double 1.0, <16 x double> %a0)
  ret double %1
}
1425
1426;
1427; vXf64 (undef)
1428;
1429
define double @test_v2f64_undef(<2 x double> %a0) {
; Ordered fmul reduction of <2 x double> with an undef start value; the
; undef*elt0 multiply is folded into a constant-pool load.
; SSE-LABEL: test_v2f64_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT:    mulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v2f64_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT:    vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2f64_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT:    vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call double @llvm.vector.reduce.fmul.f64.v2f64(double undef, <2 x double> %a0)
  ret double %1
}
1451
define double @test_v4f64_undef(<4 x double> %a0) {
; Ordered fmul reduction of <4 x double> with an undef start value.
; SSE-LABEL: test_v4f64_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT:    mulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    mulsd %xmm1, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    mulsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v4f64_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT:    vmulsd %xmm0, %xmm1, %xmm1
; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT:    vmulsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4f64_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT:    vmulsd %xmm0, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT:    vmulsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call double @llvm.vector.reduce.fmul.f64.v4f64(double undef, <4 x double> %a0)
  ret double %1
}
1486
define double @test_v8f64_undef(<8 x double> %a0) {
; Ordered fmul reduction of <8 x double> with an undef start value.
; SSE-LABEL: test_v8f64_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT:    mulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    mulsd %xmm1, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    mulsd %xmm1, %xmm0
; SSE-NEXT:    mulsd %xmm2, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT:    mulsd %xmm2, %xmm0
; SSE-NEXT:    mulsd %xmm3, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
; SSE-NEXT:    mulsd %xmm3, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v8f64_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT:    vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT:    vmulsd %xmm0, %xmm2, %xmm2
; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT:    vmulsd %xmm0, %xmm2, %xmm0
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8f64_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX512-NEXT:    vmulsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT:    vmulsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm2
; AVX512-NEXT:    vmulsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT:    vmulsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT:    vmulsd %xmm0, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT:    vmulsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call double @llvm.vector.reduce.fmul.f64.v8f64(double undef, <8 x double> %a0)
  ret double %1
}
1542
; Strictly-ordered fmul reduction of <16 x double> with an undef starting
; accumulator — the widest case in this file. As with the smaller widths,
; the lack of reassociation flags forces a fully sequential scalar
; mulsd/vmulsd chain: 15 explicit multiplies, the first taking its operand
; from the constant pool (the LCPI load).
; NOTE(review): lane 0 is absent from the explicit multiplies here too —
; presumably folded into the LCPI constant with the undef start value.
; Register layout visible in the checks: SSE receives the vector in
; xmm0-xmm7 (two lanes each); AVX in ymm0-ymm3 (extractf128 for the high
; halves); AVX512 in zmm0-zmm1 (vextractf32x4 $2/$3 for the upper 128-bit
; quarters).
define double @test_v16f64_undef(<16 x double> %a0) {
; SSE-LABEL: test_v16f64_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT:    mulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    mulsd %xmm1, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    mulsd %xmm1, %xmm0
; SSE-NEXT:    mulsd %xmm2, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT:    mulsd %xmm2, %xmm0
; SSE-NEXT:    mulsd %xmm3, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
; SSE-NEXT:    mulsd %xmm3, %xmm0
; SSE-NEXT:    mulsd %xmm4, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1,1]
; SSE-NEXT:    mulsd %xmm4, %xmm0
; SSE-NEXT:    mulsd %xmm5, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1,1]
; SSE-NEXT:    mulsd %xmm5, %xmm0
; SSE-NEXT:    mulsd %xmm6, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm6 = xmm6[1,1]
; SSE-NEXT:    mulsd %xmm6, %xmm0
; SSE-NEXT:    mulsd %xmm7, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm7 = xmm7[1,1]
; SSE-NEXT:    mulsd %xmm7, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v16f64_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm0[1,0]
; AVX-NEXT:    vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT:    vmulsd %xmm0, %xmm4, %xmm4
; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT:    vmulsd %xmm0, %xmm4, %xmm0
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm1[1,0]
; AVX-NEXT:    vmulsd %xmm4, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm1
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmulsd %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm3[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm3, %xmm1
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v16f64_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT:    vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX512-NEXT:    vmulsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
; AVX512-NEXT:    vmulsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm3
; AVX512-NEXT:    vmulsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
; AVX512-NEXT:    vmulsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT:    vmulsd %xmm0, %xmm2, %xmm2
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT:    vmulsd %xmm0, %xmm2, %xmm0
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf32x4 $2, %zmm1, %xmm2
; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf32x4 $3, %zmm1, %xmm1
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call double @llvm.vector.reduce.fmul.f64.v16f64(double undef, <16 x double> %a0)
  ret double %1
}
1639
1640declare float @llvm.vector.reduce.fmul.f32.v2f32(float, <2 x float>)
1641declare float @llvm.vector.reduce.fmul.f32.v4f32(float, <4 x float>)
1642declare float @llvm.vector.reduce.fmul.f32.v8f32(float, <8 x float>)
1643declare float @llvm.vector.reduce.fmul.f32.v16f32(float, <16 x float>)
1644
1645declare double @llvm.vector.reduce.fmul.f64.v2f64(double, <2 x double>)
1646declare double @llvm.vector.reduce.fmul.f64.v4f64(double, <4 x double>)
1647declare double @llvm.vector.reduce.fmul.f64.v8f64(double, <8 x double>)
1648declare double @llvm.vector.reduce.fmul.f64.v16f64(double, <16 x double>)
1649