; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3               | FileCheck %s --check-prefixes=SSE3,SSE3-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3,fast-hops     | FileCheck %s --check-prefixes=SSE3,SSE3-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx                | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX1,AVX1-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops      | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX1,AVX1-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2               | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX2,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops     | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX2,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f            | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX512,AVX512-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f,fast-hops  | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX512,AVX512-FAST

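; The fast-hops attribute (FeatureFastHorizontalOps, if we read the target
; feature name correctly) tells the backend to prefer a horizontal op even
; when a shuffle+add sequence would be as cheap, hence the paired -SLOW and
; -FAST check prefixes on each RUN line above.
;
; haddpd computes { x[0]+x[1], y[0]+y[1] }, so gathering the even elements
; into %a and the odd elements into %b before the fadd is the canonical
; pattern that folds to a single haddpd:
;   %a = { x0, y0 }, %b = { x1, y1 }  =>  %a + %b = { x0+x1, y0+y1 }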
define <2 x double> @haddpd1(<2 x double> %x, <2 x double> %y) {
; SSE3-LABEL: haddpd1:
; SSE3:       # %bb.0:
; SSE3-NEXT:    haddpd %xmm1, %xmm0
; SSE3-NEXT:    retq
;
; AVX-LABEL: haddpd1:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 0, i32 2>
  %b = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 1, i32 3>
  %r = fadd <2 x double> %a, %b
  ret <2 x double> %r
}

define <2 x double> @haddpd2(<2 x double> %x, <2 x double> %y) {
; SSE3-LABEL: haddpd2:
; SSE3:       # %bb.0:
; SSE3-NEXT:    haddpd %xmm1, %xmm0
; SSE3-NEXT:    retq
;
; AVX-LABEL: haddpd2:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 1, i32 2>
  %b = shufflevector <2 x double> %y, <2 x double> %x, <2 x i32> <i32 2, i32 1>
  %r = fadd <2 x double> %a, %b
  ret <2 x double> %r
}

define <2 x double> @haddpd3(<2 x double> %x) {
; SSE3-SLOW-LABEL: haddpd3:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT:    addpd %xmm0, %xmm1
; SSE3-SLOW-NEXT:    movapd %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: haddpd3:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    haddpd %xmm0, %xmm0
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: haddpd3:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: haddpd3:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
  %a = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
  %b = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
  %r = fadd <2 x double> %a, %b
  ret <2 x double> %r
}

define <4 x float> @haddps1(<4 x float> %x, <4 x float> %y) {
; SSE3-LABEL: haddps1:
; SSE3:       # %bb.0:
; SSE3-NEXT:    haddps %xmm1, %xmm0
; SSE3-NEXT:    retq
;
; AVX-LABEL: haddps1:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %b = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %r = fadd <4 x float> %a, %b
  ret <4 x float> %r
}

define <4 x float> @haddps2(<4 x float> %x, <4 x float> %y) {
; SSE3-LABEL: haddps2:
; SSE3:       # %bb.0:
; SSE3-NEXT:    haddps %xmm1, %xmm0
; SSE3-NEXT:    retq
;
; AVX-LABEL: haddps2:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
  %b = shufflevector <4 x float> %y, <4 x float> %x, <4 x i32> <i32 4, i32 7, i32 0, i32 3>
  %r = fadd <4 x float> %a, %b
  ret <4 x float> %r
}

define <4 x float> @haddps3(<4 x float> %x) {
; SSE3-LABEL: haddps3:
; SSE3:       # %bb.0:
; SSE3-NEXT:    haddps %xmm0, %xmm0
; SSE3-NEXT:    retq
;
; AVX-LABEL: haddps3:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
  %r = fadd <4 x float> %a, %b
  ret <4 x float> %r
}

define <4 x float> @haddps4(<4 x float> %x) {
; SSE3-LABEL: haddps4:
; SSE3:       # %bb.0:
; SSE3-NEXT:    haddps %xmm0, %xmm0
; SSE3-NEXT:    retq
;
; AVX-LABEL: haddps4:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %r = fadd <4 x float> %a, %b
  ret <4 x float> %r
}

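; Each lane of a horizontal add is itself a commutative add, so
; pair-swapped masks like <0,3> / <1,2> below still match: lane 1 asks for
; x3+x2, which is just the x2+x3 that haddps produces.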
define <4 x float> @haddps5(<4 x float> %x) {
; SSE3-LABEL: haddps5:
; SSE3:       # %bb.0:
; SSE3-NEXT:    haddps %xmm0, %xmm0
; SSE3-NEXT:    retq
;
; AVX-LABEL: haddps5:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 3, i32 undef, i32 undef>
  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 undef, i32 undef>
  %r = fadd <4 x float> %a, %b
  ret <4 x float> %r
}

define <4 x float> @haddps6(<4 x float> %x) {
; SSE3-SLOW-LABEL: haddps6:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT:    addps %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: haddps6:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: haddps6:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: haddps6:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %r = fadd <4 x float> %a, %b
  ret <4 x float> %r
}

define <4 x float> @haddps7(<4 x float> %x) {
; SSE3-LABEL: haddps7:
; SSE3:       # %bb.0:
; SSE3-NEXT:    haddps %xmm0, %xmm0
; SSE3-NEXT:    retq
;
; AVX-LABEL: haddps7:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 3, i32 undef, i32 undef>
  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 undef>
  %r = fadd <4 x float> %a, %b
  ret <4 x float> %r
}

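; hsub is not commutative within a pair: hsubpd computes
; { x[0]-x[1], y[0]-y[1] }, with the even element always the minuend.
; That ordering is what turns the commuted fsub cases later in this file
; into negative tests.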
define <2 x double> @hsubpd1(<2 x double> %x, <2 x double> %y) {
; SSE3-LABEL: hsubpd1:
; SSE3:       # %bb.0:
; SSE3-NEXT:    hsubpd %xmm1, %xmm0
; SSE3-NEXT:    retq
;
; AVX-LABEL: hsubpd1:
; AVX:       # %bb.0:
; AVX-NEXT:    vhsubpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 0, i32 2>
  %b = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 1, i32 3>
  %r = fsub <2 x double> %a, %b
  ret <2 x double> %r
}

define <2 x double> @hsubpd2(<2 x double> %x) {
; SSE3-SLOW-LABEL: hsubpd2:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT:    subpd %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: hsubpd2:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    hsubpd %xmm0, %xmm0
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: hsubpd2:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT:    vsubpd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: hsubpd2:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhsubpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
  %a = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
  %b = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
  %r = fsub <2 x double> %a, %b
  ret <2 x double> %r
}

define <4 x float> @hsubps1(<4 x float> %x, <4 x float> %y) {
; SSE3-LABEL: hsubps1:
; SSE3:       # %bb.0:
; SSE3-NEXT:    hsubps %xmm1, %xmm0
; SSE3-NEXT:    retq
;
; AVX-LABEL: hsubps1:
; AVX:       # %bb.0:
; AVX-NEXT:    vhsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %b = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %r = fsub <4 x float> %a, %b
  ret <4 x float> %r
}

define <4 x float> @hsubps2(<4 x float> %x) {
; SSE3-LABEL: hsubps2:
; SSE3:       # %bb.0:
; SSE3-NEXT:    hsubps %xmm0, %xmm0
; SSE3-NEXT:    retq
;
; AVX-LABEL: hsubps2:
; AVX:       # %bb.0:
; AVX-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
  %r = fsub <4 x float> %a, %b
  ret <4 x float> %r
}

define <4 x float> @hsubps3(<4 x float> %x) {
; SSE3-LABEL: hsubps3:
; SSE3:       # %bb.0:
; SSE3-NEXT:    hsubps %xmm0, %xmm0
; SSE3-NEXT:    retq
;
; AVX-LABEL: hsubps3:
; AVX:       # %bb.0:
; AVX-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %r = fsub <4 x float> %a, %b
  ret <4 x float> %r
}

define <4 x float> @hsubps4(<4 x float> %x) {
; SSE3-SLOW-LABEL: hsubps4:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT:    subps %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: hsubps4:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    hsubps %xmm0, %xmm0
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: hsubps4:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT:    vsubps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: hsubps4:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %r = fsub <4 x float> %a, %b
  ret <4 x float> %r
}

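; 256-bit vhaddps operates per 128-bit lane, producing
;   { a0+a1, a2+a3, b0+b1, b2+b3, a4+a5, a6+a7, b4+b5, b6+b7 },
; so the masks below interleave within each half (e.g.
; <0,2,8,10,4,6,12,14>) rather than across the full vector.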
define <8 x float> @vhaddps1(<8 x float> %x, <8 x float> %y) {
; SSE3-LABEL: vhaddps1:
; SSE3:       # %bb.0:
; SSE3-NEXT:    haddps %xmm2, %xmm0
; SSE3-NEXT:    haddps %xmm3, %xmm1
; SSE3-NEXT:    retq
;
; AVX-LABEL: vhaddps1:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %a = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
  %b = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
  %r = fadd <8 x float> %a, %b
  ret <8 x float> %r
}

define <8 x float> @vhaddps2(<8 x float> %x, <8 x float> %y) {
; SSE3-LABEL: vhaddps2:
; SSE3:       # %bb.0:
; SSE3-NEXT:    haddps %xmm2, %xmm0
; SSE3-NEXT:    haddps %xmm3, %xmm1
; SSE3-NEXT:    retq
;
; AVX-LABEL: vhaddps2:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %a = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 1, i32 2, i32 9, i32 10, i32 5, i32 6, i32 13, i32 14>
  %b = shufflevector <8 x float> %y, <8 x float> %x, <8 x i32> <i32 8, i32 11, i32 0, i32 3, i32 12, i32 15, i32 4, i32 7>
  %r = fadd <8 x float> %a, %b
  ret <8 x float> %r
}

define <8 x float> @vhaddps3(<8 x float> %x) {
; SSE3-LABEL: vhaddps3:
; SSE3:       # %bb.0:
; SSE3-NEXT:    haddps %xmm0, %xmm0
; SSE3-NEXT:    haddps %xmm1, %xmm1
; SSE3-NEXT:    retq
;
; AVX-LABEL: vhaddps3:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %ymm0, %ymm0, %ymm0
; AVX-NEXT:    retq
  %a = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 undef, i32 2, i32 8, i32 10, i32 4, i32 6, i32 undef, i32 14>
  %b = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 9, i32 undef, i32 5, i32 7, i32 13, i32 15>
  %r = fadd <8 x float> %a, %b
  ret <8 x float> %r
}

define <8 x float> @vhsubps1(<8 x float> %x, <8 x float> %y) {
; SSE3-LABEL: vhsubps1:
; SSE3:       # %bb.0:
; SSE3-NEXT:    hsubps %xmm2, %xmm0
; SSE3-NEXT:    hsubps %xmm3, %xmm1
; SSE3-NEXT:    retq
;
; AVX-LABEL: vhsubps1:
; AVX:       # %bb.0:
; AVX-NEXT:    vhsubps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %a = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
  %b = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
  %r = fsub <8 x float> %a, %b
  ret <8 x float> %r
}

define <8 x float> @vhsubps3(<8 x float> %x) {
; SSE3-LABEL: vhsubps3:
; SSE3:       # %bb.0:
; SSE3-NEXT:    hsubps %xmm0, %xmm0
; SSE3-NEXT:    hsubps %xmm1, %xmm1
; SSE3-NEXT:    retq
;
; AVX-LABEL: vhsubps3:
; AVX:       # %bb.0:
; AVX-NEXT:    vhsubps %ymm0, %ymm0, %ymm0
; AVX-NEXT:    retq
  %a = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 undef, i32 2, i32 8, i32 10, i32 4, i32 6, i32 undef, i32 14>
  %b = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 9, i32 undef, i32 5, i32 7, i32 13, i32 15>
  %r = fsub <8 x float> %a, %b
  ret <8 x float> %r
}

define <4 x double> @vhaddpd1(<4 x double> %x, <4 x double> %y) {
; SSE3-LABEL: vhaddpd1:
; SSE3:       # %bb.0:
; SSE3-NEXT:    haddpd %xmm2, %xmm0
; SSE3-NEXT:    haddpd %xmm3, %xmm1
; SSE3-NEXT:    retq
;
; AVX-LABEL: vhaddpd1:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %a = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %b = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  %r = fadd <4 x double> %a, %b
  ret <4 x double> %r
}

define <4 x double> @vhsubpd1(<4 x double> %x, <4 x double> %y) {
; SSE3-LABEL: vhsubpd1:
; SSE3:       # %bb.0:
; SSE3-NEXT:    hsubpd %xmm2, %xmm0
; SSE3-NEXT:    hsubpd %xmm3, %xmm1
; SSE3-NEXT:    retq
;
; AVX-LABEL: vhsubpd1:
; AVX:       # %bb.0:
; AVX-NEXT:    vhsubpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %a = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %b = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  %r = fsub <4 x double> %a, %b
  ret <4 x double> %r
}

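; The fold also fires on fully scalarized code: extract all four lanes,
; add the two pairs, and rebuild the result with insertelement. No
; shufflevector appears, yet a single haddps is still formed.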
define <2 x float> @haddps_v2f32(<4 x float> %v0) {
; SSE3-LABEL: haddps_v2f32:
; SSE3:       # %bb.0:
; SSE3-NEXT:    haddps %xmm0, %xmm0
; SSE3-NEXT:    retq
;
; AVX-LABEL: haddps_v2f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %v0.0 = extractelement <4 x float> %v0, i32 0
  %v0.1 = extractelement <4 x float> %v0, i32 1
  %v0.2 = extractelement <4 x float> %v0, i32 2
  %v0.3 = extractelement <4 x float> %v0, i32 3
  %op0 = fadd float %v0.0, %v0.1
  %op1 = fadd float %v0.2, %v0.3
  %res0 = insertelement <2 x float> undef, float %op0, i32 0
  %res1 = insertelement <2 x float> %res0, float %op1, i32 1
  ret <2 x float> %res1
}

; 128-bit vectors, float/double, fadd/fsub
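; A scalar extract/extract/fadd of lanes 0 and 1 maps onto lane 0 of a
; horizontal op; lanes 2 and 3 map onto lane 1, which is why the -FAST
; checks for the 23 cases follow the hadd/hsub with a movshdup to move
; that result into element 0.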

define float @extract_extract01_v4f32_fadd_f32(<4 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v4f32_fadd_f32:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: extract_extract01_v4f32_fadd_f32:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: extract_extract01_v4f32_fadd_f32:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: extract_extract01_v4f32_fadd_f32:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
  %x0 = extractelement <4 x float> %x, i32 0
  %x1 = extractelement <4 x float> %x, i32 1
  %x01 = fadd float %x0, %x1
  ret float %x01
}

define float @extract_extract23_v4f32_fadd_f32(<4 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract23_v4f32_fadd_f32:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: extract_extract23_v4f32_fadd_f32:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
; SSE3-FAST-NEXT:    movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: extract_extract23_v4f32_fadd_f32:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: extract_extract23_v4f32_fadd_f32:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-FAST-NEXT:    retq
  %x0 = extractelement <4 x float> %x, i32 2
  %x1 = extractelement <4 x float> %x, i32 3
  %x01 = fadd float %x0, %x1
  ret float %x01
}

define float @extract_extract01_v4f32_fadd_f32_commute(<4 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v4f32_fadd_f32_commute:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: extract_extract01_v4f32_fadd_f32_commute:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: extract_extract01_v4f32_fadd_f32_commute:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: extract_extract01_v4f32_fadd_f32_commute:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
  %x0 = extractelement <4 x float> %x, i32 0
  %x1 = extractelement <4 x float> %x, i32 1
  %x01 = fadd float %x1, %x0
  ret float %x01
}

define float @extract_extract23_v4f32_fadd_f32_commute(<4 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract23_v4f32_fadd_f32_commute:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: extract_extract23_v4f32_fadd_f32_commute:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
; SSE3-FAST-NEXT:    movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: extract_extract23_v4f32_fadd_f32_commute:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: extract_extract23_v4f32_fadd_f32_commute:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-FAST-NEXT:    retq
  %x0 = extractelement <4 x float> %x, i32 2
  %x1 = extractelement <4 x float> %x, i32 3
  %x01 = fadd float %x1, %x0
  ret float %x01
}

define double @extract_extract01_v2f64_fadd_f64(<2 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v2f64_fadd_f64:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT:    addsd %xmm0, %xmm1
; SSE3-SLOW-NEXT:    movapd %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: extract_extract01_v2f64_fadd_f64:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    haddpd %xmm0, %xmm0
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: extract_extract01_v2f64_fadd_f64:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: extract_extract01_v2f64_fadd_f64:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
  %x0 = extractelement <2 x double> %x, i32 0
  %x1 = extractelement <2 x double> %x, i32 1
  %x01 = fadd double %x0, %x1
  ret double %x01
}

define double @extract_extract01_v2f64_fadd_f64_commute(<2 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v2f64_fadd_f64_commute:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT:    addsd %xmm0, %xmm1
; SSE3-SLOW-NEXT:    movapd %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: extract_extract01_v2f64_fadd_f64_commute:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    haddpd %xmm0, %xmm0
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: extract_extract01_v2f64_fadd_f64_commute:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: extract_extract01_v2f64_fadd_f64_commute:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
  %x0 = extractelement <2 x double> %x, i32 0
  %x1 = extractelement <2 x double> %x, i32 1
  %x01 = fadd double %x1, %x0
  ret double %x01
}

define float @extract_extract01_v4f32_fsub_f32(<4 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v4f32_fsub_f32:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT:    subss %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: extract_extract01_v4f32_fsub_f32:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    hsubps %xmm0, %xmm0
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: extract_extract01_v4f32_fsub_f32:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: extract_extract01_v4f32_fsub_f32:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
  %x0 = extractelement <4 x float> %x, i32 0
  %x1 = extractelement <4 x float> %x, i32 1
  %x01 = fsub float %x0, %x1
  ret float %x01
}

define float @extract_extract23_v4f32_fsub_f32(<4 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract23_v4f32_fsub_f32:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE3-SLOW-NEXT:    subss %xmm0, %xmm1
; SSE3-SLOW-NEXT:    movaps %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: extract_extract23_v4f32_fsub_f32:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    hsubps %xmm0, %xmm0
; SSE3-FAST-NEXT:    movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: extract_extract23_v4f32_fsub_f32:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT:    vsubss %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: extract_extract23_v4f32_fsub_f32:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-FAST-NEXT:    retq
  %x0 = extractelement <4 x float> %x, i32 2
  %x1 = extractelement <4 x float> %x, i32 3
  %x01 = fsub float %x0, %x1
  ret float %x01
}

define float @extract_extract01_v4f32_fsub_f32_commute(<4 x float> %x) {
; SSE3-LABEL: extract_extract01_v4f32_fsub_f32_commute:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-NEXT:    subss %xmm0, %xmm1
; SSE3-NEXT:    movaps %xmm1, %xmm0
; SSE3-NEXT:    retq
;
; AVX-LABEL: extract_extract01_v4f32_fsub_f32_commute:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vsubss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %x0 = extractelement <4 x float> %x, i32 0
  %x1 = extractelement <4 x float> %x, i32 1
  %x01 = fsub float %x1, %x0
  ret float %x01
}

define float @extract_extract23_v4f32_fsub_f32_commute(<4 x float> %x) {
; SSE3-LABEL: extract_extract23_v4f32_fsub_f32_commute:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movaps %xmm0, %xmm1
; SSE3-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE3-NEXT:    subss %xmm1, %xmm0
; SSE3-NEXT:    retq
;
; AVX-LABEL: extract_extract23_v4f32_fsub_f32_commute:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %x0 = extractelement <4 x float> %x, i32 2
  %x1 = extractelement <4 x float> %x, i32 3
  %x01 = fsub float %x1, %x0
  ret float %x01
}

define double @extract_extract01_v2f64_fsub_f64(<2 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v2f64_fsub_f64:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT:    subsd %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: extract_extract01_v2f64_fsub_f64:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    hsubpd %xmm0, %xmm0
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: extract_extract01_v2f64_fsub_f64:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: extract_extract01_v2f64_fsub_f64:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhsubpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
  %x0 = extractelement <2 x double> %x, i32 0
  %x1 = extractelement <2 x double> %x, i32 1
  %x01 = fsub double %x0, %x1
  ret double %x01
}

define double @extract_extract01_v2f64_fsub_f64_commute(<2 x double> %x) {
; SSE3-LABEL: extract_extract01_v2f64_fsub_f64_commute:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movapd %xmm0, %xmm1
; SSE3-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-NEXT:    subsd %xmm0, %xmm1
; SSE3-NEXT:    movapd %xmm1, %xmm0
; SSE3-NEXT:    retq
;
; AVX-LABEL: extract_extract01_v2f64_fsub_f64_commute:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vsubsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %x0 = extractelement <2 x double> %x, i32 0
  %x1 = extractelement <2 x double> %x, i32 1
  %x01 = fsub double %x1, %x0
  ret double %x01
}

; 256-bit vectors, float/double, fadd/fsub
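; Only the low 128 bits feed these scalar ops, so the AVX variants narrow
; to xmm and emit vzeroupper before returning (the incoming ymm dirtied
; the upper state); pairs taken from the upper half need a vextractf128
; first, as in the 67 and 23_v4f64 cases.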

define float @extract_extract01_v8f32_fadd_f32(<8 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v8f32_fadd_f32:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: extract_extract01_v8f32_fadd_f32:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: extract_extract01_v8f32_fadd_f32:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    vzeroupper
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: extract_extract01_v8f32_fadd_f32:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    vzeroupper
; AVX-FAST-NEXT:    retq
  %x0 = extractelement <8 x float> %x, i32 0
  %x1 = extractelement <8 x float> %x, i32 1
  %x01 = fadd float %x0, %x1
  ret float %x01
}

define float @extract_extract23_v8f32_fadd_f32(<8 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract23_v8f32_fadd_f32:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: extract_extract23_v8f32_fadd_f32:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
; SSE3-FAST-NEXT:    movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: extract_extract23_v8f32_fadd_f32:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT:    vzeroupper
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: extract_extract23_v8f32_fadd_f32:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-FAST-NEXT:    vzeroupper
; AVX-FAST-NEXT:    retq
  %x0 = extractelement <8 x float> %x, i32 2
  %x1 = extractelement <8 x float> %x, i32 3
  %x01 = fadd float %x0, %x1
  ret float %x01
}

define float @extract_extract67_v8f32_fadd_f32(<8 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract67_v8f32_fadd_f32:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movaps %xmm1, %xmm0
; SSE3-SLOW-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: extract_extract67_v8f32_fadd_f32:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    haddps %xmm1, %xmm1
; SSE3-FAST-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: extract_extract67_v8f32_fadd_f32:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT:    vzeroupper
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: extract_extract67_v8f32_fadd_f32:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-FAST-NEXT:    vzeroupper
; AVX-FAST-NEXT:    retq
  %x0 = extractelement <8 x float> %x, i32 6
  %x1 = extractelement <8 x float> %x, i32 7
  %x01 = fadd float %x0, %x1
  ret float %x01
}

define float @extract_extract01_v8f32_fadd_f32_commute(<8 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v8f32_fadd_f32_commute:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: extract_extract01_v8f32_fadd_f32_commute:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: extract_extract01_v8f32_fadd_f32_commute:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT:    vzeroupper
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: extract_extract01_v8f32_fadd_f32_commute:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    vzeroupper
; AVX-FAST-NEXT:    retq
  %x0 = extractelement <8 x float> %x, i32 0
  %x1 = extractelement <8 x float> %x, i32 1
  %x01 = fadd float %x1, %x0
  ret float %x01
}

define float @extract_extract23_v8f32_fadd_f32_commute(<8 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract23_v8f32_fadd_f32_commute:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: extract_extract23_v8f32_fadd_f32_commute:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
; SSE3-FAST-NEXT:    movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: extract_extract23_v8f32_fadd_f32_commute:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    vzeroupper
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: extract_extract23_v8f32_fadd_f32_commute:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-FAST-NEXT:    vzeroupper
; AVX-FAST-NEXT:    retq
  %x0 = extractelement <8 x float> %x, i32 2
  %x1 = extractelement <8 x float> %x, i32 3
  %x01 = fadd float %x1, %x0
  ret float %x01
}

define float @extract_extract67_v8f32_fadd_f32_commute(<8 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract67_v8f32_fadd_f32_commute:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movaps %xmm1, %xmm0
; SSE3-SLOW-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: extract_extract67_v8f32_fadd_f32_commute:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    haddps %xmm1, %xmm1
; SSE3-FAST-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: extract_extract67_v8f32_fadd_f32_commute:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    vzeroupper
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: extract_extract67_v8f32_fadd_f32_commute:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-FAST-NEXT:    vzeroupper
; AVX-FAST-NEXT:    retq
  %x0 = extractelement <8 x float> %x, i32 6
  %x1 = extractelement <8 x float> %x, i32 7
  %x01 = fadd float %x1, %x0
  ret float %x01
}

define double @extract_extract01_v4f64_fadd_f64(<4 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v4f64_fadd_f64:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT:    addsd %xmm0, %xmm1
; SSE3-SLOW-NEXT:    movapd %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: extract_extract01_v4f64_fadd_f64:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    haddpd %xmm0, %xmm0
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: extract_extract01_v4f64_fadd_f64:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    vzeroupper
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: extract_extract01_v4f64_fadd_f64:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    vzeroupper
; AVX-FAST-NEXT:    retq
  %x0 = extractelement <4 x double> %x, i32 0
  %x1 = extractelement <4 x double> %x, i32 1
  %x01 = fadd double %x0, %x1
  ret double %x01
}

define double @extract_extract23_v4f64_fadd_f64(<4 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract23_v4f64_fadd_f64:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movapd %xmm1, %xmm0
; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE3-SLOW-NEXT:    addsd %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: extract_extract23_v4f64_fadd_f64:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    movapd %xmm1, %xmm0
; SSE3-FAST-NEXT:    haddpd %xmm1, %xmm0
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: extract_extract23_v4f64_fadd_f64:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    vzeroupper
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: extract_extract23_v4f64_fadd_f64:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    vzeroupper
; AVX-FAST-NEXT:    retq
  %x0 = extractelement <4 x double> %x, i32 2
  %x1 = extractelement <4 x double> %x, i32 3
  %x01 = fadd double %x0, %x1
  ret double %x01
}

define double @extract_extract01_v4f64_fadd_f64_commute(<4 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v4f64_fadd_f64_commute:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT:    addsd %xmm0, %xmm1
; SSE3-SLOW-NEXT:    movapd %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: extract_extract01_v4f64_fadd_f64_commute:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    haddpd %xmm0, %xmm0
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: extract_extract01_v4f64_fadd_f64_commute:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT:    vzeroupper
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: extract_extract01_v4f64_fadd_f64_commute:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    vzeroupper
; AVX-FAST-NEXT:    retq
  %x0 = extractelement <4 x double> %x, i32 0
  %x1 = extractelement <4 x double> %x, i32 1
  %x01 = fadd double %x1, %x0
  ret double %x01
}

define double @extract_extract23_v4f64_fadd_f64_commute(<4 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract23_v4f64_fadd_f64_commute:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movapd %xmm1, %xmm0
; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE3-SLOW-NEXT:    addsd %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: extract_extract23_v4f64_fadd_f64_commute:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    movapd %xmm1, %xmm0
; SSE3-FAST-NEXT:    haddpd %xmm1, %xmm0
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: extract_extract23_v4f64_fadd_f64_commute:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT:    vzeroupper
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: extract_extract23_v4f64_fadd_f64_commute:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    vzeroupper
; AVX-FAST-NEXT:    retq
  %x0 = extractelement <4 x double> %x, i32 2
  %x1 = extractelement <4 x double> %x, i32 3
  %x01 = fadd double %x1, %x0
  ret double %x01
}

define float @extract_extract01_v8f32_fsub_f32(<8 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v8f32_fsub_f32:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT:    subss %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: extract_extract01_v8f32_fsub_f32:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    hsubps %xmm0, %xmm0
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: extract_extract01_v8f32_fsub_f32:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    vzeroupper
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: extract_extract01_v8f32_fsub_f32:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    vzeroupper
; AVX-FAST-NEXT:    retq
  %x0 = extractelement <8 x float> %x, i32 0
  %x1 = extractelement <8 x float> %x, i32 1
  %x01 = fsub float %x0, %x1
  ret float %x01
}

define float @extract_extract23_v8f32_fsub_f32(<8 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract23_v8f32_fsub_f32:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE3-SLOW-NEXT:    subss %xmm0, %xmm1
; SSE3-SLOW-NEXT:    movaps %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: extract_extract23_v8f32_fsub_f32:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    hsubps %xmm0, %xmm0
; SSE3-FAST-NEXT:    movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: extract_extract23_v8f32_fsub_f32:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT:    vsubss %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT:    vzeroupper
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: extract_extract23_v8f32_fsub_f32:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-FAST-NEXT:    vzeroupper
; AVX-FAST-NEXT:    retq
  %x0 = extractelement <8 x float> %x, i32 2
  %x1 = extractelement <8 x float> %x, i32 3
  %x01 = fsub float %x0, %x1
  ret float %x01
}

define float @extract_extract45_v8f32_fsub_f32(<8 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract45_v8f32_fsub_f32:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movaps %xmm1, %xmm0
; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE3-SLOW-NEXT:    subss %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: extract_extract45_v8f32_fsub_f32:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    movaps %xmm1, %xmm0
; SSE3-FAST-NEXT:    hsubps %xmm1, %xmm0
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: extract_extract45_v8f32_fsub_f32:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    vzeroupper
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: extract_extract45_v8f32_fsub_f32:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX-FAST-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    vzeroupper
; AVX-FAST-NEXT:    retq
  %x0 = extractelement <8 x float> %x, i32 4
  %x1 = extractelement <8 x float> %x, i32 5
  %x01 = fsub float %x0, %x1
  ret float %x01
}

; Negative test...or get hoppy and negate?
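; (x1 - x0 is the reverse of the x0 - x1 that hsub computes; matching it
; would take an hsub plus a sign flip -- the "negate" option above.)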

define float @extract_extract01_v8f32_fsub_f32_commute(<8 x float> %x) {
; SSE3-LABEL: extract_extract01_v8f32_fsub_f32_commute:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-NEXT:    subss %xmm0, %xmm1
; SSE3-NEXT:    movaps %xmm1, %xmm0
; SSE3-NEXT:    retq
;
; AVX-LABEL: extract_extract01_v8f32_fsub_f32_commute:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vsubss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
  %x0 = extractelement <8 x float> %x, i32 0
  %x1 = extractelement <8 x float> %x, i32 1
  %x01 = fsub float %x1, %x0
  ret float %x01
}

define double @extract_extract01_v4f64_fsub_f64(<4 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v4f64_fsub_f64:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT:    subsd %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: extract_extract01_v4f64_fsub_f64:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    hsubpd %xmm0, %xmm0
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: extract_extract01_v4f64_fsub_f64:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    vzeroupper
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: extract_extract01_v4f64_fsub_f64:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhsubpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    vzeroupper
; AVX-FAST-NEXT:    retq
  %x0 = extractelement <4 x double> %x, i32 0
  %x1 = extractelement <4 x double> %x, i32 1
  %x01 = fsub double %x0, %x1
  ret double %x01
}

; Negative test...or get hoppy and negate?

define double @extract_extract01_v4f64_fsub_f64_commute(<4 x double> %x) {
; SSE3-LABEL: extract_extract01_v4f64_fsub_f64_commute:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movapd %xmm0, %xmm1
; SSE3-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-NEXT:    subsd %xmm0, %xmm1
; SSE3-NEXT:    movapd %xmm1, %xmm0
; SSE3-NEXT:    retq
;
; AVX-LABEL: extract_extract01_v4f64_fsub_f64_commute:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vsubsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
  %x0 = extractelement <4 x double> %x, i32 0
  %x1 = extractelement <4 x double> %x, i32 1
  %x01 = fsub double %x1, %x0
  ret double %x01
}

; 512-bit vectors, float/double, fadd/fsub
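; These 512-bit cases only touch lanes 0 and 1, so the AVX512 runs shrink
; to the same xmm sequences as the 128-bit tests above.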

define float @extract_extract01_v16f32_fadd_f32(<16 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v16f32_fadd_f32:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: extract_extract01_v16f32_fadd_f32:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: extract_extract01_v16f32_fadd_f32:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    vzeroupper
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: extract_extract01_v16f32_fadd_f32:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    vzeroupper
; AVX-FAST-NEXT:    retq
  %x0 = extractelement <16 x float> %x, i32 0
  %x1 = extractelement <16 x float> %x, i32 1
  %x01 = fadd float %x0, %x1
  ret float %x01
}

define float @extract_extract01_v16f32_fadd_f32_commute(<16 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v16f32_fadd_f32_commute:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: extract_extract01_v16f32_fadd_f32_commute:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: extract_extract01_v16f32_fadd_f32_commute:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT:    vzeroupper
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: extract_extract01_v16f32_fadd_f32_commute:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    vzeroupper
; AVX-FAST-NEXT:    retq
  %x0 = extractelement <16 x float> %x, i32 0
  %x1 = extractelement <16 x float> %x, i32 1
  %x01 = fadd float %x1, %x0
  ret float %x01
}

define double @extract_extract01_v8f64_fadd_f64(<8 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v8f64_fadd_f64:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT:    addsd %xmm0, %xmm1
; SSE3-SLOW-NEXT:    movapd %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: extract_extract01_v8f64_fadd_f64:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    haddpd %xmm0, %xmm0
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: extract_extract01_v8f64_fadd_f64:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    vzeroupper
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: extract_extract01_v8f64_fadd_f64:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    vzeroupper
; AVX-FAST-NEXT:    retq
  %x0 = extractelement <8 x double> %x, i32 0
  %x1 = extractelement <8 x double> %x, i32 1
  %x01 = fadd double %x0, %x1
  ret double %x01
}

define double @extract_extract01_v8f64_fadd_f64_commute(<8 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v8f64_fadd_f64_commute:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT:    addsd %xmm0, %xmm1
; SSE3-SLOW-NEXT:    movapd %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: extract_extract01_v8f64_fadd_f64_commute:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    haddpd %xmm0, %xmm0
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: extract_extract01_v8f64_fadd_f64_commute:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT:    vzeroupper
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: extract_extract01_v8f64_fadd_f64_commute:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    vzeroupper
; AVX-FAST-NEXT:    retq
  %x0 = extractelement <8 x double> %x, i32 0
  %x1 = extractelement <8 x double> %x, i32 1
  %x01 = fadd double %x1, %x0
  ret double %x01
}
1432
define float @extract_extract01_v16f32_fsub_f32(<16 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v16f32_fsub_f32:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT:    subss %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: extract_extract01_v16f32_fsub_f32:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    hsubps %xmm0, %xmm0
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: extract_extract01_v16f32_fsub_f32:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    vzeroupper
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: extract_extract01_v16f32_fsub_f32:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    vzeroupper
; AVX-FAST-NEXT:    retq
  %x0 = extractelement <16 x float> %x, i32 0
  %x1 = extractelement <16 x float> %x, i32 1
  %x01 = fsub float %x0, %x1
  ret float %x01
}

define float @extract_extract01_v16f32_fsub_f32_commute(<16 x float> %x) {
; SSE3-LABEL: extract_extract01_v16f32_fsub_f32_commute:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-NEXT:    subss %xmm0, %xmm1
; SSE3-NEXT:    movaps %xmm1, %xmm0
; SSE3-NEXT:    retq
;
; AVX-LABEL: extract_extract01_v16f32_fsub_f32_commute:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vsubss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
  %x0 = extractelement <16 x float> %x, i32 0
  %x1 = extractelement <16 x float> %x, i32 1
  %x01 = fsub float %x1, %x0
  ret float %x01
}

define double @extract_extract01_v8f64_fsub_f64(<8 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v8f64_fsub_f64:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT:    subsd %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: extract_extract01_v8f64_fsub_f64:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    hsubpd %xmm0, %xmm0
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: extract_extract01_v8f64_fsub_f64:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    vzeroupper
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: extract_extract01_v8f64_fsub_f64:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhsubpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    vzeroupper
; AVX-FAST-NEXT:    retq
  %x0 = extractelement <8 x double> %x, i32 0
  %x1 = extractelement <8 x double> %x, i32 1
  %x01 = fsub double %x0, %x1
  ret double %x01
}

define double @extract_extract01_v8f64_fsub_f64_commute(<8 x double> %x) {
; SSE3-LABEL: extract_extract01_v8f64_fsub_f64_commute:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movapd %xmm0, %xmm1
; SSE3-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-NEXT:    subsd %xmm0, %xmm1
; SSE3-NEXT:    movapd %xmm1, %xmm0
; SSE3-NEXT:    retq
;
; AVX-LABEL: extract_extract01_v8f64_fsub_f64_commute:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vsubsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
  %x0 = extractelement <8 x double> %x, i32 0
  %x1 = extractelement <8 x double> %x, i32 1
  %x01 = fsub double %x1, %x0
  ret double %x01
}

; Check output when one or both extracts have extra uses.
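; When both extracts are reused (uses3), the lane shuffle must be emitted anyway,
; so no horizontal op is formed even on fast-hops targets.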

define float @extract_extract01_v4f32_fadd_f32_uses1(<4 x float> %x, float* %p) {
; SSE3-SLOW-LABEL: extract_extract01_v4f32_fadd_f32_uses1:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movss %xmm0, (%rdi)
; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: extract_extract01_v4f32_fadd_f32_uses1:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    movss %xmm0, (%rdi)
; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: extract_extract01_v4f32_fadd_f32_uses1:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vmovss %xmm0, (%rdi)
; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: extract_extract01_v4f32_fadd_f32_uses1:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vmovss %xmm0, (%rdi)
; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
  %x0 = extractelement <4 x float> %x, i32 0
  store float %x0, float* %p
  %x1 = extractelement <4 x float> %x, i32 1
  %x01 = fadd float %x0, %x1
  ret float %x01
}

define float @extract_extract01_v4f32_fadd_f32_uses2(<4 x float> %x, float* %p) {
; SSE3-SLOW-LABEL: extract_extract01_v4f32_fadd_f32_uses2:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT:    movss %xmm1, (%rdi)
; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: extract_extract01_v4f32_fadd_f32_uses2:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-FAST-NEXT:    movss %xmm1, (%rdi)
; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: extract_extract01_v4f32_fadd_f32_uses2:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT:    vmovss %xmm1, (%rdi)
; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: extract_extract01_v4f32_fadd_f32_uses2:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vextractps $1, %xmm0, (%rdi)
; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
  %x0 = extractelement <4 x float> %x, i32 0
  %x1 = extractelement <4 x float> %x, i32 1
  store float %x1, float* %p
  %x01 = fadd float %x0, %x1
  ret float %x01
}

define float @extract_extract01_v4f32_fadd_f32_uses3(<4 x float> %x, float* %p1, float* %p2) {
; SSE3-LABEL: extract_extract01_v4f32_fadd_f32_uses3:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movss %xmm0, (%rdi)
; SSE3-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-NEXT:    movss %xmm1, (%rsi)
; SSE3-NEXT:    addss %xmm1, %xmm0
; SSE3-NEXT:    retq
;
; AVX-LABEL: extract_extract01_v4f32_fadd_f32_uses3:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovss %xmm0, (%rdi)
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vmovss %xmm1, (%rsi)
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %x0 = extractelement <4 x float> %x, i32 0
  store float %x0, float* %p1
  %x1 = extractelement <4 x float> %x, i32 1
  store float %x1, float* %p2
  %x01 = fadd float %x0, %x1
  ret float %x01
}

; Repeat tests from general reductions to verify output for hoppy targets:
; PR38971: https://bugs.llvm.org/show_bug.cgi?id=38971
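; The reductions are called with the 'fast' flag, which gives the backend leave to
; reassociate them into the shuffle/hadd sequences checked below.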

declare float @llvm.vector.reduce.fadd.f32.v8f32(float, <8 x float>)
declare double @llvm.vector.reduce.fadd.f64.v4f64(double, <4 x double>)

define float @fadd_reduce_v8f32(float %a0, <8 x float> %a1) {
; SSE3-SLOW-LABEL: fadd_reduce_v8f32:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    addps %xmm2, %xmm1
; SSE3-SLOW-NEXT:    movaps %xmm1, %xmm2
; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE3-SLOW-NEXT:    addps %xmm1, %xmm2
; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE3-SLOW-NEXT:    addss %xmm2, %xmm1
; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: fadd_reduce_v8f32:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    haddps %xmm1, %xmm2
; SSE3-FAST-NEXT:    haddps %xmm2, %xmm2
; SSE3-FAST-NEXT:    haddps %xmm2, %xmm2
; SSE3-FAST-NEXT:    addss %xmm2, %xmm0
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: fadd_reduce_v8f32:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX-SLOW-NEXT:    vaddps %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-SLOW-NEXT:    vaddps %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-SLOW-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    vzeroupper
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: fadd_reduce_v8f32:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX-FAST-NEXT:    vhaddps %xmm1, %xmm2, %xmm1
; AVX-FAST-NEXT:    vhaddps %xmm1, %xmm1, %xmm1
; AVX-FAST-NEXT:    vhaddps %xmm1, %xmm1, %xmm1
; AVX-FAST-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT:    vzeroupper
; AVX-FAST-NEXT:    retq
  %r = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float %a0, <8 x float> %a1)
  ret float %r
}

define double @fadd_reduce_v4f64(double %a0, <4 x double> %a1) {
; SSE3-SLOW-LABEL: fadd_reduce_v4f64:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    addpd %xmm2, %xmm1
; SSE3-SLOW-NEXT:    movapd %xmm1, %xmm2
; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE3-SLOW-NEXT:    addsd %xmm1, %xmm2
; SSE3-SLOW-NEXT:    addsd %xmm2, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: fadd_reduce_v4f64:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    haddpd %xmm1, %xmm2
; SSE3-FAST-NEXT:    haddpd %xmm2, %xmm2
; SSE3-FAST-NEXT:    addsd %xmm2, %xmm0
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: fadd_reduce_v4f64:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX-SLOW-NEXT:    vaddpd %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-SLOW-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    vzeroupper
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: fadd_reduce_v4f64:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX-FAST-NEXT:    vhaddpd %xmm1, %xmm2, %xmm1
; AVX-FAST-NEXT:    vhaddpd %xmm1, %xmm1, %xmm1
; AVX-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT:    vzeroupper
; AVX-FAST-NEXT:    retq
  %r = call fast double @llvm.vector.reduce.fadd.f64.v4f64(double %a0, <4 x double> %a1)
  ret double %r
}

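; PR39936: https://bugs.llvm.org/show_bug.cgi?id=39936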
define float @PR39936_v8f32(<8 x float>) {
; SSE3-SLOW-LABEL: PR39936_v8f32:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    haddps %xmm1, %xmm0
; SSE3-SLOW-NEXT:    haddps %xmm0, %xmm0
; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: PR39936_v8f32:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    haddps %xmm1, %xmm0
; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: PR39936_v8f32:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-SLOW-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    vzeroupper
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: PR39936_v8f32:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-FAST-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    vzeroupper
; AVX-FAST-NEXT:    retq
  %2 = shufflevector <8 x float> %0, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
  %3 = shufflevector <8 x float> %0, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %4 = fadd <8 x float> %2, %3
  %5 = shufflevector <8 x float> %4, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %6 = shufflevector <8 x float> %4, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %7 = fadd <8 x float> %5, %6
  %8 = shufflevector <8 x float> %7, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %9 = fadd <8 x float> %7, %8
  %10 = extractelement <8 x float> %9, i32 0
  ret float %10
}

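; The hadd32_* tests below build the reduction from explicit shufflevector + fadd
; IR rather than a reduce intrinsic. Only the final pairwise add matches a hadd
; pattern, so slow-hop targets keep shuffle + add and fast-hops targets emit one
; haddps for that last step.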
define float @hadd32_4(<4 x float> %x225) {
; SSE3-SLOW-LABEL: hadd32_4:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT:    addps %xmm0, %xmm1
; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE3-SLOW-NEXT:    addss %xmm0, %xmm1
; SSE3-SLOW-NEXT:    movaps %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: hadd32_4:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    movaps %xmm0, %xmm1
; SSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-FAST-NEXT:    addps %xmm0, %xmm1
; SSE3-FAST-NEXT:    haddps %xmm1, %xmm1
; SSE3-FAST-NEXT:    movaps %xmm1, %xmm0
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: hadd32_4:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: hadd32_4:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-FAST-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
  %x226 = shufflevector <4 x float> %x225, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %x227 = fadd <4 x float> %x225, %x226
  %x228 = shufflevector <4 x float> %x227, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %x229 = fadd <4 x float> %x227, %x228
  %x230 = extractelement <4 x float> %x229, i32 0
  ret float %x230
}

define float @hadd32_8(<8 x float> %x225) {
; SSE3-SLOW-LABEL: hadd32_8:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT:    addps %xmm0, %xmm1
; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE3-SLOW-NEXT:    addss %xmm0, %xmm1
; SSE3-SLOW-NEXT:    movaps %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: hadd32_8:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    movaps %xmm0, %xmm1
; SSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-FAST-NEXT:    addps %xmm0, %xmm1
; SSE3-FAST-NEXT:    haddps %xmm1, %xmm1
; SSE3-FAST-NEXT:    movaps %xmm1, %xmm0
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: hadd32_8:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    vzeroupper
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: hadd32_8:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-FAST-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    vzeroupper
; AVX-FAST-NEXT:    retq
  %x226 = shufflevector <8 x float> %x225, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %x227 = fadd <8 x float> %x225, %x226
  %x228 = shufflevector <8 x float> %x227, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %x229 = fadd <8 x float> %x227, %x228
  %x230 = extractelement <8 x float> %x229, i32 0
  ret float %x230
}

define float @hadd32_16(<16 x float> %x225) {
; SSE3-SLOW-LABEL: hadd32_16:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT:    addps %xmm0, %xmm1
; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE3-SLOW-NEXT:    addss %xmm0, %xmm1
; SSE3-SLOW-NEXT:    movaps %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: hadd32_16:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    movaps %xmm0, %xmm1
; SSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-FAST-NEXT:    addps %xmm0, %xmm1
; SSE3-FAST-NEXT:    haddps %xmm1, %xmm1
; SSE3-FAST-NEXT:    movaps %xmm1, %xmm0
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: hadd32_16:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    vzeroupper
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: hadd32_16:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-FAST-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    vzeroupper
; AVX-FAST-NEXT:    retq
  %x226 = shufflevector <16 x float> %x225, <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %x227 = fadd <16 x float> %x225, %x226
  %x228 = shufflevector <16 x float> %x227, <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %x229 = fadd <16 x float> %x227, %x228
  %x230 = extractelement <16 x float> %x229, i32 0
  ret float %x230
}

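; When optimizing for size, the final shuffle + addss pair is replaced by a single
; (shorter) haddps even on targets where horizontal ops are considered slow.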
define float @hadd32_4_optsize(<4 x float> %x225) optsize {
; SSE3-LABEL: hadd32_4_optsize:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movaps %xmm0, %xmm1
; SSE3-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-NEXT:    addps %xmm0, %xmm1
; SSE3-NEXT:    haddps %xmm1, %xmm1
; SSE3-NEXT:    movaps %xmm1, %xmm0
; SSE3-NEXT:    retq
;
; AVX-LABEL: hadd32_4_optsize:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %x226 = shufflevector <4 x float> %x225, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %x227 = fadd <4 x float> %x225, %x226
  %x228 = shufflevector <4 x float> %x227, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %x229 = fadd <4 x float> %x227, %x228
  %x230 = extractelement <4 x float> %x229, i32 0
  ret float %x230
}

define float @hadd32_8_optsize(<8 x float> %x225) optsize {
; SSE3-LABEL: hadd32_8_optsize:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movaps %xmm0, %xmm1
; SSE3-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-NEXT:    addps %xmm0, %xmm1
; SSE3-NEXT:    haddps %xmm1, %xmm1
; SSE3-NEXT:    movaps %xmm1, %xmm0
; SSE3-NEXT:    retq
;
; AVX-LABEL: hadd32_8_optsize:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
  %x226 = shufflevector <8 x float> %x225, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %x227 = fadd <8 x float> %x225, %x226
  %x228 = shufflevector <8 x float> %x227, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %x229 = fadd <8 x float> %x227, %x228
  %x230 = extractelement <8 x float> %x229, i32 0
  ret float %x230
}

define float @hadd32_16_optsize(<16 x float> %x225) optsize {
; SSE3-LABEL: hadd32_16_optsize:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movaps %xmm0, %xmm1
; SSE3-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-NEXT:    addps %xmm0, %xmm1
; SSE3-NEXT:    haddps %xmm1, %xmm1
; SSE3-NEXT:    movaps %xmm1, %xmm0
; SSE3-NEXT:    retq
;
; AVX-LABEL: hadd32_16_optsize:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
  %x226 = shufflevector <16 x float> %x225, <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %x227 = fadd <16 x float> %x225, %x226
  %x228 = shufflevector <16 x float> %x227, <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %x229 = fadd <16 x float> %x227, %x228
  %x230 = extractelement <16 x float> %x229, i32 0
  ret float %x230
}

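; Profile-guided size optimization: !prof !14 gives these functions an entry count
; of 0, so they are treated as cold and get the same lowering as optsize.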
define float @hadd32_4_pgso(<4 x float> %x225) !prof !14 {
; SSE3-LABEL: hadd32_4_pgso:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movaps %xmm0, %xmm1
; SSE3-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-NEXT:    addps %xmm0, %xmm1
; SSE3-NEXT:    haddps %xmm1, %xmm1
; SSE3-NEXT:    movaps %xmm1, %xmm0
; SSE3-NEXT:    retq
;
; AVX-LABEL: hadd32_4_pgso:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %x226 = shufflevector <4 x float> %x225, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %x227 = fadd <4 x float> %x225, %x226
  %x228 = shufflevector <4 x float> %x227, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %x229 = fadd <4 x float> %x227, %x228
  %x230 = extractelement <4 x float> %x229, i32 0
  ret float %x230
}

define float @hadd32_8_pgso(<8 x float> %x225) !prof !14 {
; SSE3-LABEL: hadd32_8_pgso:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movaps %xmm0, %xmm1
; SSE3-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-NEXT:    addps %xmm0, %xmm1
; SSE3-NEXT:    haddps %xmm1, %xmm1
; SSE3-NEXT:    movaps %xmm1, %xmm0
; SSE3-NEXT:    retq
;
; AVX-LABEL: hadd32_8_pgso:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
  %x226 = shufflevector <8 x float> %x225, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %x227 = fadd <8 x float> %x225, %x226
  %x228 = shufflevector <8 x float> %x227, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %x229 = fadd <8 x float> %x227, %x228
  %x230 = extractelement <8 x float> %x229, i32 0
  ret float %x230
}

define float @hadd32_16_pgso(<16 x float> %x225) !prof !14 {
; SSE3-LABEL: hadd32_16_pgso:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movaps %xmm0, %xmm1
; SSE3-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-NEXT:    addps %xmm0, %xmm1
; SSE3-NEXT:    haddps %xmm1, %xmm1
; SSE3-NEXT:    movaps %xmm1, %xmm0
; SSE3-NEXT:    retq
;
; AVX-LABEL: hadd32_16_pgso:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
  %x226 = shufflevector <16 x float> %x225, <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %x227 = fadd <16 x float> %x225, %x226
  %x228 = shufflevector <16 x float> %x227, <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %x229 = fadd <16 x float> %x227, %x228
  %x230 = extractelement <16 x float> %x229, i32 0
  ret float %x230
}

define float @partial_reduction_fadd_v8f32(<8 x float> %x) {
; SSE3-SLOW-LABEL: partial_reduction_fadd_v8f32:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT:    addps %xmm0, %xmm1
; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE3-SLOW-NEXT:    addss %xmm0, %xmm1
; SSE3-SLOW-NEXT:    movaps %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: partial_reduction_fadd_v8f32:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    movaps %xmm0, %xmm1
; SSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-FAST-NEXT:    addps %xmm0, %xmm1
; SSE3-FAST-NEXT:    haddps %xmm1, %xmm1
; SSE3-FAST-NEXT:    movaps %xmm1, %xmm0
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: partial_reduction_fadd_v8f32:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    vzeroupper
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: partial_reduction_fadd_v8f32:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    vzeroupper
; AVX-FAST-NEXT:    retq
  %x23 = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %x0213 = fadd <8 x float> %x, %x23
  %x13 = shufflevector <8 x float> %x0213, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %x0123 = fadd reassoc nsz <8 x float> %x0213, %x13
  %r = extractelement <8 x float> %x0123, i32 0
  ret float %r
}

; Negative test - only the flags on the final math op in the
; sequence determine whether we can transform to horizontal ops.
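; The profitable sequence above has 'reassoc nsz' on its last fadd; here the last
; fadd carries only 'nnan ninf', so the partial reduction is not recognized.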

define float @partial_reduction_fadd_v8f32_wrong_flags(<8 x float> %x) {
; SSE3-SLOW-LABEL: partial_reduction_fadd_v8f32_wrong_flags:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT:    addps %xmm0, %xmm1
; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE3-SLOW-NEXT:    addss %xmm0, %xmm1
; SSE3-SLOW-NEXT:    movaps %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: partial_reduction_fadd_v8f32_wrong_flags:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    movaps %xmm0, %xmm1
; SSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-FAST-NEXT:    addps %xmm0, %xmm1
; SSE3-FAST-NEXT:    haddps %xmm1, %xmm1
; SSE3-FAST-NEXT:    movaps %xmm1, %xmm0
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: partial_reduction_fadd_v8f32_wrong_flags:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    vzeroupper
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: partial_reduction_fadd_v8f32_wrong_flags:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-FAST-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    vzeroupper
; AVX-FAST-NEXT:    retq
  %x23 = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %x0213 = fadd fast <8 x float> %x, %x23
  %x13 = shufflevector <8 x float> %x0213, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %x0123 = fadd nnan ninf <8 x float> %x0213, %x13
  %r = extractelement <8 x float> %x0123, i32 0
  ret float %r
}

define float @partial_reduction_fadd_v16f32(<16 x float> %x) {
; SSE3-SLOW-LABEL: partial_reduction_fadd_v16f32:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT:    addps %xmm0, %xmm1
; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE3-SLOW-NEXT:    addss %xmm0, %xmm1
; SSE3-SLOW-NEXT:    movaps %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: partial_reduction_fadd_v16f32:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    movaps %xmm0, %xmm1
; SSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-FAST-NEXT:    addps %xmm0, %xmm1
; SSE3-FAST-NEXT:    haddps %xmm1, %xmm1
; SSE3-FAST-NEXT:    movaps %xmm1, %xmm0
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: partial_reduction_fadd_v16f32:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    vzeroupper
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: partial_reduction_fadd_v16f32:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    vzeroupper
; AVX-FAST-NEXT:    retq
  %x23 = shufflevector <16 x float> %x, <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %x0213 = fadd <16 x float> %x, %x23
  %x13 = shufflevector <16 x float> %x0213, <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %x0123 = fadd reassoc nsz <16 x float> %x0213, %x13
  %r = extractelement <16 x float> %x0123, i32 0
  ret float %r
}

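; The ProfileSummary metadata below is required for the function_entry_count in
; !14 to be interpreted for the PGSO tests above.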
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"ProfileSummary", !1}
!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
!2 = !{!"ProfileFormat", !"InstrProf"}
!3 = !{!"TotalCount", i64 10000}
!4 = !{!"MaxCount", i64 10}
!5 = !{!"MaxInternalCount", i64 1}
!6 = !{!"MaxFunctionCount", i64 1000}
!7 = !{!"NumCounts", i64 3}
!8 = !{!"NumFunctions", i64 3}
!9 = !{!"DetailedSummary", !10}
!10 = !{!11, !12, !13}
!11 = !{i32 10000, i64 100, i32 1}
!12 = !{i32 999000, i64 100, i32 1}
!13 = !{i32 999999, i64 1, i32 2}
!14 = !{!"function_entry_count", i64 0}