1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3               | FileCheck %s --check-prefixes=SSE3,SSE3-SLOW
3; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3,fast-hops     | FileCheck %s --check-prefixes=SSE3,SSE3-FAST
4; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx                | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX1-SLOW
5; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops      | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX1-FAST
6; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f            | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX512-SLOW
7; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f,fast-hops  | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX512-FAST
8
9define <2 x double> @haddpd1(<2 x double> %x, <2 x double> %y) {
10; SSE3-LABEL: haddpd1:
11; SSE3:       # %bb.0:
12; SSE3-NEXT:    haddpd %xmm1, %xmm0
13; SSE3-NEXT:    retq
14;
15; AVX-LABEL: haddpd1:
16; AVX:       # %bb.0:
17; AVX-NEXT:    vhaddpd %xmm1, %xmm0, %xmm0
18; AVX-NEXT:    retq
19  %a = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 0, i32 2>
20  %b = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 1, i32 3>
21  %r = fadd <2 x double> %a, %b
22  ret <2 x double> %r
23}
24
25define <2 x double> @haddpd2(<2 x double> %x, <2 x double> %y) {
26; SSE3-LABEL: haddpd2:
27; SSE3:       # %bb.0:
28; SSE3-NEXT:    haddpd %xmm1, %xmm0
29; SSE3-NEXT:    retq
30;
31; AVX-LABEL: haddpd2:
32; AVX:       # %bb.0:
33; AVX-NEXT:    vhaddpd %xmm1, %xmm0, %xmm0
34; AVX-NEXT:    retq
35  %a = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 1, i32 2>
36  %b = shufflevector <2 x double> %y, <2 x double> %x, <2 x i32> <i32 2, i32 1>
37  %r = fadd <2 x double> %a, %b
38  ret <2 x double> %r
39}
40
41define <2 x double> @haddpd3(<2 x double> %x) {
42; SSE3-SLOW-LABEL: haddpd3:
43; SSE3-SLOW:       # %bb.0:
44; SSE3-SLOW-NEXT:    movapd %xmm0, %xmm1
45; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
46; SSE3-SLOW-NEXT:    addpd %xmm0, %xmm1
47; SSE3-SLOW-NEXT:    movapd %xmm1, %xmm0
48; SSE3-SLOW-NEXT:    retq
49;
50; SSE3-FAST-LABEL: haddpd3:
51; SSE3-FAST:       # %bb.0:
52; SSE3-FAST-NEXT:    haddpd %xmm0, %xmm0
53; SSE3-FAST-NEXT:    retq
54;
55; AVX-SLOW-LABEL: haddpd3:
56; AVX-SLOW:       # %bb.0:
57; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
58; AVX-SLOW-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
59; AVX-SLOW-NEXT:    retq
60;
61; AVX-FAST-LABEL: haddpd3:
62; AVX-FAST:       # %bb.0:
63; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
64; AVX-FAST-NEXT:    retq
65  %a = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
66  %b = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
67  %r = fadd <2 x double> %a, %b
68  ret <2 x double> %r
69}
70
71define <4 x float> @haddps1(<4 x float> %x, <4 x float> %y) {
72; SSE3-LABEL: haddps1:
73; SSE3:       # %bb.0:
74; SSE3-NEXT:    haddps %xmm1, %xmm0
75; SSE3-NEXT:    retq
76;
77; AVX-LABEL: haddps1:
78; AVX:       # %bb.0:
79; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
80; AVX-NEXT:    retq
81  %a = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
82  %b = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
83  %r = fadd <4 x float> %a, %b
84  ret <4 x float> %r
85}
86
87define <4 x float> @haddps2(<4 x float> %x, <4 x float> %y) {
88; SSE3-LABEL: haddps2:
89; SSE3:       # %bb.0:
90; SSE3-NEXT:    haddps %xmm1, %xmm0
91; SSE3-NEXT:    retq
92;
93; AVX-LABEL: haddps2:
94; AVX:       # %bb.0:
95; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
96; AVX-NEXT:    retq
97  %a = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
98  %b = shufflevector <4 x float> %y, <4 x float> %x, <4 x i32> <i32 4, i32 7, i32 0, i32 3>
99  %r = fadd <4 x float> %a, %b
100  ret <4 x float> %r
101}
102
103define <4 x float> @haddps3(<4 x float> %x) {
104; SSE3-SLOW-LABEL: haddps3:
105; SSE3-SLOW:       # %bb.0:
106; SSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
107; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
108; SSE3-SLOW-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
109; SSE3-SLOW-NEXT:    addps %xmm1, %xmm0
110; SSE3-SLOW-NEXT:    retq
111;
112; SSE3-FAST-LABEL: haddps3:
113; SSE3-FAST:       # %bb.0:
114; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
115; SSE3-FAST-NEXT:    retq
116;
117; AVX-SLOW-LABEL: haddps3:
118; AVX-SLOW:       # %bb.0:
119; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
120; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
121; AVX-SLOW-NEXT:    vaddps %xmm0, %xmm1, %xmm0
122; AVX-SLOW-NEXT:    retq
123;
124; AVX-FAST-LABEL: haddps3:
125; AVX-FAST:       # %bb.0:
126; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
127; AVX-FAST-NEXT:    retq
128  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
129  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
130  %r = fadd <4 x float> %a, %b
131  ret <4 x float> %r
132}
133
134define <4 x float> @haddps4(<4 x float> %x) {
135; SSE3-SLOW-LABEL: haddps4:
136; SSE3-SLOW:       # %bb.0:
137; SSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
138; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
139; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
140; SSE3-SLOW-NEXT:    addps %xmm1, %xmm0
141; SSE3-SLOW-NEXT:    retq
142;
143; SSE3-FAST-LABEL: haddps4:
144; SSE3-FAST:       # %bb.0:
145; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
146; SSE3-FAST-NEXT:    retq
147;
148; AVX-SLOW-LABEL: haddps4:
149; AVX-SLOW:       # %bb.0:
150; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
151; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3]
152; AVX-SLOW-NEXT:    vaddps %xmm0, %xmm1, %xmm0
153; AVX-SLOW-NEXT:    retq
154;
155; AVX-FAST-LABEL: haddps4:
156; AVX-FAST:       # %bb.0:
157; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
158; AVX-FAST-NEXT:    retq
159  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
160  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
161  %r = fadd <4 x float> %a, %b
162  ret <4 x float> %r
163}
164
165define <4 x float> @haddps5(<4 x float> %x) {
166; SSE3-SLOW-LABEL: haddps5:
167; SSE3-SLOW:       # %bb.0:
168; SSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
169; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,3]
170; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,2,2,3]
171; SSE3-SLOW-NEXT:    addps %xmm1, %xmm0
172; SSE3-SLOW-NEXT:    retq
173;
174; SSE3-FAST-LABEL: haddps5:
175; SSE3-FAST:       # %bb.0:
176; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
177; SSE3-FAST-NEXT:    retq
178;
179; AVX-SLOW-LABEL: haddps5:
180; AVX-SLOW:       # %bb.0:
181; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,3,2,3]
182; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,2,2,3]
183; AVX-SLOW-NEXT:    vaddps %xmm0, %xmm1, %xmm0
184; AVX-SLOW-NEXT:    retq
185;
186; AVX-FAST-LABEL: haddps5:
187; AVX-FAST:       # %bb.0:
188; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
189; AVX-FAST-NEXT:    retq
190  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 3, i32 undef, i32 undef>
191  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 undef, i32 undef>
192  %r = fadd <4 x float> %a, %b
193  ret <4 x float> %r
194}
195
196define <4 x float> @haddps6(<4 x float> %x) {
197; SSE3-SLOW-LABEL: haddps6:
198; SSE3-SLOW:       # %bb.0:
199; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
200; SSE3-SLOW-NEXT:    addps %xmm1, %xmm0
201; SSE3-SLOW-NEXT:    retq
202;
203; SSE3-FAST-LABEL: haddps6:
204; SSE3-FAST:       # %bb.0:
205; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
206; SSE3-FAST-NEXT:    retq
207;
208; AVX-SLOW-LABEL: haddps6:
209; AVX-SLOW:       # %bb.0:
210; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
211; AVX-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0
212; AVX-SLOW-NEXT:    retq
213;
214; AVX-FAST-LABEL: haddps6:
215; AVX-FAST:       # %bb.0:
216; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
217; AVX-FAST-NEXT:    retq
218  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
219  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
220  %r = fadd <4 x float> %a, %b
221  ret <4 x float> %r
222}
223
224define <4 x float> @haddps7(<4 x float> %x) {
225; SSE3-SLOW-LABEL: haddps7:
226; SSE3-SLOW:       # %bb.0:
227; SSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
228; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
229; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
230; SSE3-SLOW-NEXT:    addps %xmm1, %xmm0
231; SSE3-SLOW-NEXT:    retq
232;
233; SSE3-FAST-LABEL: haddps7:
234; SSE3-FAST:       # %bb.0:
235; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
236; SSE3-FAST-NEXT:    retq
237;
238; AVX-SLOW-LABEL: haddps7:
239; AVX-SLOW:       # %bb.0:
240; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
241; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
242; AVX-SLOW-NEXT:    vaddps %xmm0, %xmm1, %xmm0
243; AVX-SLOW-NEXT:    retq
244;
245; AVX-FAST-LABEL: haddps7:
246; AVX-FAST:       # %bb.0:
247; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
248; AVX-FAST-NEXT:    retq
249  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 3, i32 undef, i32 undef>
250  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 undef>
251  %r = fadd <4 x float> %a, %b
252  ret <4 x float> %r
253}
254
255define <2 x double> @hsubpd1(<2 x double> %x, <2 x double> %y) {
256; SSE3-LABEL: hsubpd1:
257; SSE3:       # %bb.0:
258; SSE3-NEXT:    hsubpd %xmm1, %xmm0
259; SSE3-NEXT:    retq
260;
261; AVX-LABEL: hsubpd1:
262; AVX:       # %bb.0:
263; AVX-NEXT:    vhsubpd %xmm1, %xmm0, %xmm0
264; AVX-NEXT:    retq
265  %a = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 0, i32 2>
266  %b = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 1, i32 3>
267  %r = fsub <2 x double> %a, %b
268  ret <2 x double> %r
269}
270
271define <2 x double> @hsubpd2(<2 x double> %x) {
272; SSE3-SLOW-LABEL: hsubpd2:
273; SSE3-SLOW:       # %bb.0:
274; SSE3-SLOW-NEXT:    movapd %xmm0, %xmm1
275; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
276; SSE3-SLOW-NEXT:    subpd %xmm1, %xmm0
277; SSE3-SLOW-NEXT:    retq
278;
279; SSE3-FAST-LABEL: hsubpd2:
280; SSE3-FAST:       # %bb.0:
281; SSE3-FAST-NEXT:    hsubpd %xmm0, %xmm0
282; SSE3-FAST-NEXT:    retq
283;
284; AVX-SLOW-LABEL: hsubpd2:
285; AVX-SLOW:       # %bb.0:
286; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
287; AVX-SLOW-NEXT:    vsubpd %xmm1, %xmm0, %xmm0
288; AVX-SLOW-NEXT:    retq
289;
290; AVX-FAST-LABEL: hsubpd2:
291; AVX-FAST:       # %bb.0:
292; AVX-FAST-NEXT:    vhsubpd %xmm0, %xmm0, %xmm0
293; AVX-FAST-NEXT:    retq
294  %a = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
295  %b = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
296  %r = fsub <2 x double> %a, %b
297  ret <2 x double> %r
298}
299
300define <4 x float> @hsubps1(<4 x float> %x, <4 x float> %y) {
301; SSE3-LABEL: hsubps1:
302; SSE3:       # %bb.0:
303; SSE3-NEXT:    hsubps %xmm1, %xmm0
304; SSE3-NEXT:    retq
305;
306; AVX-LABEL: hsubps1:
307; AVX:       # %bb.0:
308; AVX-NEXT:    vhsubps %xmm1, %xmm0, %xmm0
309; AVX-NEXT:    retq
310  %a = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
311  %b = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
312  %r = fsub <4 x float> %a, %b
313  ret <4 x float> %r
314}
315
316define <4 x float> @hsubps2(<4 x float> %x) {
317; SSE3-SLOW-LABEL: hsubps2:
318; SSE3-SLOW:       # %bb.0:
319; SSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
320; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
321; SSE3-SLOW-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
322; SSE3-SLOW-NEXT:    subps %xmm0, %xmm1
323; SSE3-SLOW-NEXT:    movaps %xmm1, %xmm0
324; SSE3-SLOW-NEXT:    retq
325;
326; SSE3-FAST-LABEL: hsubps2:
327; SSE3-FAST:       # %bb.0:
328; SSE3-FAST-NEXT:    hsubps %xmm0, %xmm0
329; SSE3-FAST-NEXT:    retq
330;
331; AVX-SLOW-LABEL: hsubps2:
332; AVX-SLOW:       # %bb.0:
333; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
334; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
335; AVX-SLOW-NEXT:    vsubps %xmm0, %xmm1, %xmm0
336; AVX-SLOW-NEXT:    retq
337;
338; AVX-FAST-LABEL: hsubps2:
339; AVX-FAST:       # %bb.0:
340; AVX-FAST-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
341; AVX-FAST-NEXT:    retq
342  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
343  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
344  %r = fsub <4 x float> %a, %b
345  ret <4 x float> %r
346}
347
348define <4 x float> @hsubps3(<4 x float> %x) {
349; SSE3-SLOW-LABEL: hsubps3:
350; SSE3-SLOW:       # %bb.0:
351; SSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
352; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
353; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
354; SSE3-SLOW-NEXT:    subps %xmm0, %xmm1
355; SSE3-SLOW-NEXT:    movaps %xmm1, %xmm0
356; SSE3-SLOW-NEXT:    retq
357;
358; SSE3-FAST-LABEL: hsubps3:
359; SSE3-FAST:       # %bb.0:
360; SSE3-FAST-NEXT:    hsubps %xmm0, %xmm0
361; SSE3-FAST-NEXT:    retq
362;
363; AVX-SLOW-LABEL: hsubps3:
364; AVX-SLOW:       # %bb.0:
365; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
366; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3]
367; AVX-SLOW-NEXT:    vsubps %xmm0, %xmm1, %xmm0
368; AVX-SLOW-NEXT:    retq
369;
370; AVX-FAST-LABEL: hsubps3:
371; AVX-FAST:       # %bb.0:
372; AVX-FAST-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
373; AVX-FAST-NEXT:    retq
374  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
375  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
376  %r = fsub <4 x float> %a, %b
377  ret <4 x float> %r
378}
379
380define <4 x float> @hsubps4(<4 x float> %x) {
381; SSE3-SLOW-LABEL: hsubps4:
382; SSE3-SLOW:       # %bb.0:
383; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
384; SSE3-SLOW-NEXT:    subps %xmm1, %xmm0
385; SSE3-SLOW-NEXT:    retq
386;
387; SSE3-FAST-LABEL: hsubps4:
388; SSE3-FAST:       # %bb.0:
389; SSE3-FAST-NEXT:    hsubps %xmm0, %xmm0
390; SSE3-FAST-NEXT:    retq
391;
392; AVX-SLOW-LABEL: hsubps4:
393; AVX-SLOW:       # %bb.0:
394; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
395; AVX-SLOW-NEXT:    vsubps %xmm1, %xmm0, %xmm0
396; AVX-SLOW-NEXT:    retq
397;
398; AVX-FAST-LABEL: hsubps4:
399; AVX-FAST:       # %bb.0:
400; AVX-FAST-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
401; AVX-FAST-NEXT:    retq
402  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
403  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
404  %r = fsub <4 x float> %a, %b
405  ret <4 x float> %r
406}
407
408define <8 x float> @vhaddps1(<8 x float> %x, <8 x float> %y) {
409; SSE3-LABEL: vhaddps1:
410; SSE3:       # %bb.0:
411; SSE3-NEXT:    haddps %xmm2, %xmm0
412; SSE3-NEXT:    haddps %xmm3, %xmm1
413; SSE3-NEXT:    retq
414;
415; AVX-LABEL: vhaddps1:
416; AVX:       # %bb.0:
417; AVX-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
418; AVX-NEXT:    retq
419  %a = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
420  %b = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
421  %r = fadd <8 x float> %a, %b
422  ret <8 x float> %r
423}
424
425define <8 x float> @vhaddps2(<8 x float> %x, <8 x float> %y) {
426; SSE3-LABEL: vhaddps2:
427; SSE3:       # %bb.0:
428; SSE3-NEXT:    haddps %xmm2, %xmm0
429; SSE3-NEXT:    haddps %xmm3, %xmm1
430; SSE3-NEXT:    retq
431;
432; AVX-LABEL: vhaddps2:
433; AVX:       # %bb.0:
434; AVX-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
435; AVX-NEXT:    retq
436  %a = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 1, i32 2, i32 9, i32 10, i32 5, i32 6, i32 13, i32 14>
437  %b = shufflevector <8 x float> %y, <8 x float> %x, <8 x i32> <i32 8, i32 11, i32 0, i32 3, i32 12, i32 15, i32 4, i32 7>
438  %r = fadd <8 x float> %a, %b
439  ret <8 x float> %r
440}
441
442define <8 x float> @vhaddps3(<8 x float> %x) {
443; SSE3-SLOW-LABEL: vhaddps3:
444; SSE3-SLOW:       # %bb.0:
445; SSE3-SLOW-NEXT:    movaps %xmm1, %xmm2
446; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[2,3]
447; SSE3-SLOW-NEXT:    movaps %xmm0, %xmm3
448; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[2,3]
449; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,3,2,3]
450; SSE3-SLOW-NEXT:    addps %xmm2, %xmm1
451; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
452; SSE3-SLOW-NEXT:    addps %xmm3, %xmm0
453; SSE3-SLOW-NEXT:    retq
454;
455; SSE3-FAST-LABEL: vhaddps3:
456; SSE3-FAST:       # %bb.0:
457; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
458; SSE3-FAST-NEXT:    haddps %xmm1, %xmm1
459; SSE3-FAST-NEXT:    retq
460;
461; AVX-SLOW-LABEL: vhaddps3:
462; AVX-SLOW:       # %bb.0:
463; AVX-SLOW-NEXT:    vpermilps {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
464; AVX-SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
465; AVX-SLOW-NEXT:    vaddps %ymm0, %ymm1, %ymm0
466; AVX-SLOW-NEXT:    retq
467;
468; AVX-FAST-LABEL: vhaddps3:
469; AVX-FAST:       # %bb.0:
470; AVX-FAST-NEXT:    vhaddps %ymm0, %ymm0, %ymm0
471; AVX-FAST-NEXT:    retq
472  %a = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 undef, i32 2, i32 8, i32 10, i32 4, i32 6, i32 undef, i32 14>
473  %b = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 9, i32 undef, i32 5, i32 7, i32 13, i32 15>
474  %r = fadd <8 x float> %a, %b
475  ret <8 x float> %r
476}
477
478define <8 x float> @vhsubps1(<8 x float> %x, <8 x float> %y) {
479; SSE3-LABEL: vhsubps1:
480; SSE3:       # %bb.0:
481; SSE3-NEXT:    hsubps %xmm2, %xmm0
482; SSE3-NEXT:    hsubps %xmm3, %xmm1
483; SSE3-NEXT:    retq
484;
485; AVX-LABEL: vhsubps1:
486; AVX:       # %bb.0:
487; AVX-NEXT:    vhsubps %ymm1, %ymm0, %ymm0
488; AVX-NEXT:    retq
489  %a = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
490  %b = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
491  %r = fsub <8 x float> %a, %b
492  ret <8 x float> %r
493}
494
495define <8 x float> @vhsubps3(<8 x float> %x) {
496; SSE3-SLOW-LABEL: vhsubps3:
497; SSE3-SLOW:       # %bb.0:
498; SSE3-SLOW-NEXT:    movaps %xmm1, %xmm2
499; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[2,3]
500; SSE3-SLOW-NEXT:    movaps %xmm0, %xmm3
501; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[2,3]
502; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,3,2,3]
503; SSE3-SLOW-NEXT:    subps %xmm1, %xmm2
504; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
505; SSE3-SLOW-NEXT:    subps %xmm0, %xmm3
506; SSE3-SLOW-NEXT:    movaps %xmm3, %xmm0
507; SSE3-SLOW-NEXT:    movaps %xmm2, %xmm1
508; SSE3-SLOW-NEXT:    retq
509;
510; SSE3-FAST-LABEL: vhsubps3:
511; SSE3-FAST:       # %bb.0:
512; SSE3-FAST-NEXT:    hsubps %xmm0, %xmm0
513; SSE3-FAST-NEXT:    hsubps %xmm1, %xmm1
514; SSE3-FAST-NEXT:    retq
515;
516; AVX-SLOW-LABEL: vhsubps3:
517; AVX-SLOW:       # %bb.0:
518; AVX-SLOW-NEXT:    vpermilps {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
519; AVX-SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
520; AVX-SLOW-NEXT:    vsubps %ymm0, %ymm1, %ymm0
521; AVX-SLOW-NEXT:    retq
522;
523; AVX-FAST-LABEL: vhsubps3:
524; AVX-FAST:       # %bb.0:
525; AVX-FAST-NEXT:    vhsubps %ymm0, %ymm0, %ymm0
526; AVX-FAST-NEXT:    retq
527  %a = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 undef, i32 2, i32 8, i32 10, i32 4, i32 6, i32 undef, i32 14>
528  %b = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 9, i32 undef, i32 5, i32 7, i32 13, i32 15>
529  %r = fsub <8 x float> %a, %b
530  ret <8 x float> %r
531}
532
533define <4 x double> @vhaddpd1(<4 x double> %x, <4 x double> %y) {
534; SSE3-LABEL: vhaddpd1:
535; SSE3:       # %bb.0:
536; SSE3-NEXT:    haddpd %xmm2, %xmm0
537; SSE3-NEXT:    haddpd %xmm3, %xmm1
538; SSE3-NEXT:    retq
539;
540; AVX-LABEL: vhaddpd1:
541; AVX:       # %bb.0:
542; AVX-NEXT:    vhaddpd %ymm1, %ymm0, %ymm0
543; AVX-NEXT:    retq
544  %a = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
545  %b = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
546  %r = fadd <4 x double> %a, %b
547  ret <4 x double> %r
548}
549
550define <4 x double> @vhsubpd1(<4 x double> %x, <4 x double> %y) {
551; SSE3-LABEL: vhsubpd1:
552; SSE3:       # %bb.0:
553; SSE3-NEXT:    hsubpd %xmm2, %xmm0
554; SSE3-NEXT:    hsubpd %xmm3, %xmm1
555; SSE3-NEXT:    retq
556;
557; AVX-LABEL: vhsubpd1:
558; AVX:       # %bb.0:
559; AVX-NEXT:    vhsubpd %ymm1, %ymm0, %ymm0
560; AVX-NEXT:    retq
561  %a = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
562  %b = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
563  %r = fsub <4 x double> %a, %b
564  ret <4 x double> %r
565}
566
567define <2 x float> @haddps_v2f32(<4 x float> %v0) {
568; SSE3-LABEL: haddps_v2f32:
569; SSE3:       # %bb.0:
570; SSE3-NEXT:    haddps %xmm0, %xmm0
571; SSE3-NEXT:    retq
572;
573; AVX-LABEL: haddps_v2f32:
574; AVX:       # %bb.0:
575; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
576; AVX-NEXT:    retq
577  %v0.0 = extractelement <4 x float> %v0, i32 0
578  %v0.1 = extractelement <4 x float> %v0, i32 1
579  %v0.2 = extractelement <4 x float> %v0, i32 2
580  %v0.3 = extractelement <4 x float> %v0, i32 3
581  %op0 = fadd float %v0.0, %v0.1
582  %op1 = fadd float %v0.2, %v0.3
583  %res0 = insertelement <2 x float> undef, float %op0, i32 0
584  %res1 = insertelement <2 x float> %res0, float %op1, i32 1
585  ret <2 x float> %res1
586}
587
588; 128-bit vectors, float/double, fadd/fsub
589
590define float @extract_extract_v4f32_fadd_f32(<4 x float> %x) {
591; SSE3-SLOW-LABEL: extract_extract_v4f32_fadd_f32:
592; SSE3-SLOW:       # %bb.0:
593; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
594; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
595; SSE3-SLOW-NEXT:    retq
596;
597; SSE3-FAST-LABEL: extract_extract_v4f32_fadd_f32:
598; SSE3-FAST:       # %bb.0:
599; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
600; SSE3-FAST-NEXT:    retq
601;
602; AVX-SLOW-LABEL: extract_extract_v4f32_fadd_f32:
603; AVX-SLOW:       # %bb.0:
604; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
605; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
606; AVX-SLOW-NEXT:    retq
607;
608; AVX-FAST-LABEL: extract_extract_v4f32_fadd_f32:
609; AVX-FAST:       # %bb.0:
610; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
611; AVX-FAST-NEXT:    retq
612  %x0 = extractelement <4 x float> %x, i32 0
613  %x1 = extractelement <4 x float> %x, i32 1
614  %x01 = fadd float %x0, %x1
615  ret float %x01
616}
617
618define float @extract_extract_v4f32_fadd_f32_commute(<4 x float> %x) {
619; SSE3-SLOW-LABEL: extract_extract_v4f32_fadd_f32_commute:
620; SSE3-SLOW:       # %bb.0:
621; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
622; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
623; SSE3-SLOW-NEXT:    retq
624;
625; SSE3-FAST-LABEL: extract_extract_v4f32_fadd_f32_commute:
626; SSE3-FAST:       # %bb.0:
627; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
628; SSE3-FAST-NEXT:    retq
629;
630; AVX-SLOW-LABEL: extract_extract_v4f32_fadd_f32_commute:
631; AVX-SLOW:       # %bb.0:
632; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
633; AVX-SLOW-NEXT:    vaddss %xmm0, %xmm1, %xmm0
634; AVX-SLOW-NEXT:    retq
635;
636; AVX-FAST-LABEL: extract_extract_v4f32_fadd_f32_commute:
637; AVX-FAST:       # %bb.0:
638; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
639; AVX-FAST-NEXT:    retq
640  %x0 = extractelement <4 x float> %x, i32 0
641  %x1 = extractelement <4 x float> %x, i32 1
642  %x01 = fadd float %x1, %x0
643  ret float %x01
644}
645
646define double @extract_extract_v2f64_fadd_f64(<2 x double> %x) {
647; SSE3-SLOW-LABEL: extract_extract_v2f64_fadd_f64:
648; SSE3-SLOW:       # %bb.0:
649; SSE3-SLOW-NEXT:    movapd %xmm0, %xmm1
650; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
651; SSE3-SLOW-NEXT:    addsd %xmm0, %xmm1
652; SSE3-SLOW-NEXT:    movapd %xmm1, %xmm0
653; SSE3-SLOW-NEXT:    retq
654;
655; SSE3-FAST-LABEL: extract_extract_v2f64_fadd_f64:
656; SSE3-FAST:       # %bb.0:
657; SSE3-FAST-NEXT:    haddpd %xmm0, %xmm0
658; SSE3-FAST-NEXT:    retq
659;
660; AVX-SLOW-LABEL: extract_extract_v2f64_fadd_f64:
661; AVX-SLOW:       # %bb.0:
662; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
663; AVX-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
664; AVX-SLOW-NEXT:    retq
665;
666; AVX-FAST-LABEL: extract_extract_v2f64_fadd_f64:
667; AVX-FAST:       # %bb.0:
668; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
669; AVX-FAST-NEXT:    retq
670  %x0 = extractelement <2 x double> %x, i32 0
671  %x1 = extractelement <2 x double> %x, i32 1
672  %x01 = fadd double %x0, %x1
673  ret double %x01
674}
675
676define double @extract_extract_v2f64_fadd_f64_commute(<2 x double> %x) {
677; SSE3-SLOW-LABEL: extract_extract_v2f64_fadd_f64_commute:
678; SSE3-SLOW:       # %bb.0:
679; SSE3-SLOW-NEXT:    movapd %xmm0, %xmm1
680; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
681; SSE3-SLOW-NEXT:    addsd %xmm0, %xmm1
682; SSE3-SLOW-NEXT:    movapd %xmm1, %xmm0
683; SSE3-SLOW-NEXT:    retq
684;
685; SSE3-FAST-LABEL: extract_extract_v2f64_fadd_f64_commute:
686; SSE3-FAST:       # %bb.0:
687; SSE3-FAST-NEXT:    haddpd %xmm0, %xmm0
688; SSE3-FAST-NEXT:    retq
689;
690; AVX-SLOW-LABEL: extract_extract_v2f64_fadd_f64_commute:
691; AVX-SLOW:       # %bb.0:
692; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
693; AVX-SLOW-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
694; AVX-SLOW-NEXT:    retq
695;
696; AVX-FAST-LABEL: extract_extract_v2f64_fadd_f64_commute:
697; AVX-FAST:       # %bb.0:
698; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
699; AVX-FAST-NEXT:    retq
700  %x0 = extractelement <2 x double> %x, i32 0
701  %x1 = extractelement <2 x double> %x, i32 1
702  %x01 = fadd double %x1, %x0
703  ret double %x01
704}
705
706define float @extract_extract_v4f32_fsub_f32(<4 x float> %x) {
707; SSE3-SLOW-LABEL: extract_extract_v4f32_fsub_f32:
708; SSE3-SLOW:       # %bb.0:
709; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
710; SSE3-SLOW-NEXT:    subss %xmm1, %xmm0
711; SSE3-SLOW-NEXT:    retq
712;
713; SSE3-FAST-LABEL: extract_extract_v4f32_fsub_f32:
714; SSE3-FAST:       # %bb.0:
715; SSE3-FAST-NEXT:    hsubps %xmm0, %xmm0
716; SSE3-FAST-NEXT:    retq
717;
718; AVX-SLOW-LABEL: extract_extract_v4f32_fsub_f32:
719; AVX-SLOW:       # %bb.0:
720; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
721; AVX-SLOW-NEXT:    vsubss %xmm1, %xmm0, %xmm0
722; AVX-SLOW-NEXT:    retq
723;
724; AVX-FAST-LABEL: extract_extract_v4f32_fsub_f32:
725; AVX-FAST:       # %bb.0:
726; AVX-FAST-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
727; AVX-FAST-NEXT:    retq
728  %x0 = extractelement <4 x float> %x, i32 0
729  %x1 = extractelement <4 x float> %x, i32 1
730  %x01 = fsub float %x0, %x1
731  ret float %x01
732}
733
734define float @extract_extract_v4f32_fsub_f32_commute(<4 x float> %x) {
735; SSE3-LABEL: extract_extract_v4f32_fsub_f32_commute:
736; SSE3:       # %bb.0:
737; SSE3-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
738; SSE3-NEXT:    subss %xmm0, %xmm1
739; SSE3-NEXT:    movaps %xmm1, %xmm0
740; SSE3-NEXT:    retq
741;
742; AVX-LABEL: extract_extract_v4f32_fsub_f32_commute:
743; AVX:       # %bb.0:
744; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
745; AVX-NEXT:    vsubss %xmm0, %xmm1, %xmm0
746; AVX-NEXT:    retq
747  %x0 = extractelement <4 x float> %x, i32 0
748  %x1 = extractelement <4 x float> %x, i32 1
749  %x01 = fsub float %x1, %x0
750  ret float %x01
751}
752
753define double @extract_extract_v2f64_fsub_f64(<2 x double> %x) {
754; SSE3-SLOW-LABEL: extract_extract_v2f64_fsub_f64:
755; SSE3-SLOW:       # %bb.0:
756; SSE3-SLOW-NEXT:    movapd %xmm0, %xmm1
757; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
758; SSE3-SLOW-NEXT:    subsd %xmm1, %xmm0
759; SSE3-SLOW-NEXT:    retq
760;
761; SSE3-FAST-LABEL: extract_extract_v2f64_fsub_f64:
762; SSE3-FAST:       # %bb.0:
763; SSE3-FAST-NEXT:    hsubpd %xmm0, %xmm0
764; SSE3-FAST-NEXT:    retq
765;
766; AVX-SLOW-LABEL: extract_extract_v2f64_fsub_f64:
767; AVX-SLOW:       # %bb.0:
768; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
769; AVX-SLOW-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
770; AVX-SLOW-NEXT:    retq
771;
772; AVX-FAST-LABEL: extract_extract_v2f64_fsub_f64:
773; AVX-FAST:       # %bb.0:
774; AVX-FAST-NEXT:    vhsubpd %xmm0, %xmm0, %xmm0
775; AVX-FAST-NEXT:    retq
776  %x0 = extractelement <2 x double> %x, i32 0
777  %x1 = extractelement <2 x double> %x, i32 1
778  %x01 = fsub double %x0, %x1
779  ret double %x01
780}
781
782define double @extract_extract_v2f64_fsub_f64_commute(<2 x double> %x) {
783; SSE3-LABEL: extract_extract_v2f64_fsub_f64_commute:
784; SSE3:       # %bb.0:
785; SSE3-NEXT:    movapd %xmm0, %xmm1
786; SSE3-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
787; SSE3-NEXT:    subsd %xmm0, %xmm1
788; SSE3-NEXT:    movapd %xmm1, %xmm0
789; SSE3-NEXT:    retq
790;
791; AVX-LABEL: extract_extract_v2f64_fsub_f64_commute:
792; AVX:       # %bb.0:
793; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
794; AVX-NEXT:    vsubsd %xmm0, %xmm1, %xmm0
795; AVX-NEXT:    retq
796  %x0 = extractelement <2 x double> %x, i32 0
797  %x1 = extractelement <2 x double> %x, i32 1
798  %x01 = fsub double %x1, %x0
799  ret double %x01
800}
801
802; 256-bit vectors, float/double, fadd/fsub
803
804define float @extract_extract_v8f32_fadd_f32(<8 x float> %x) {
805; SSE3-SLOW-LABEL: extract_extract_v8f32_fadd_f32:
806; SSE3-SLOW:       # %bb.0:
807; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
808; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
809; SSE3-SLOW-NEXT:    retq
810;
811; SSE3-FAST-LABEL: extract_extract_v8f32_fadd_f32:
812; SSE3-FAST:       # %bb.0:
813; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
814; SSE3-FAST-NEXT:    retq
815;
816; AVX-SLOW-LABEL: extract_extract_v8f32_fadd_f32:
817; AVX-SLOW:       # %bb.0:
818; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
819; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
820; AVX-SLOW-NEXT:    vzeroupper
821; AVX-SLOW-NEXT:    retq
822;
823; AVX-FAST-LABEL: extract_extract_v8f32_fadd_f32:
824; AVX-FAST:       # %bb.0:
825; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
826; AVX-FAST-NEXT:    vzeroupper
827; AVX-FAST-NEXT:    retq
828  %x0 = extractelement <8 x float> %x, i32 0
829  %x1 = extractelement <8 x float> %x, i32 1
830  %x01 = fadd float %x0, %x1
831  ret float %x01
832}
833
834define float @extract_extract_v8f32_fadd_f32_commute(<8 x float> %x) {
835; SSE3-SLOW-LABEL: extract_extract_v8f32_fadd_f32_commute:
836; SSE3-SLOW:       # %bb.0:
837; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
838; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
839; SSE3-SLOW-NEXT:    retq
840;
841; SSE3-FAST-LABEL: extract_extract_v8f32_fadd_f32_commute:
842; SSE3-FAST:       # %bb.0:
843; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
844; SSE3-FAST-NEXT:    retq
845;
846; AVX-SLOW-LABEL: extract_extract_v8f32_fadd_f32_commute:
847; AVX-SLOW:       # %bb.0:
848; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
849; AVX-SLOW-NEXT:    vaddss %xmm0, %xmm1, %xmm0
850; AVX-SLOW-NEXT:    vzeroupper
851; AVX-SLOW-NEXT:    retq
852;
853; AVX-FAST-LABEL: extract_extract_v8f32_fadd_f32_commute:
854; AVX-FAST:       # %bb.0:
855; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
856; AVX-FAST-NEXT:    vzeroupper
857; AVX-FAST-NEXT:    retq
858  %x0 = extractelement <8 x float> %x, i32 0
859  %x1 = extractelement <8 x float> %x, i32 1
860  %x01 = fadd float %x1, %x0
861  ret float %x01
862}
863
864define double @extract_extract_v4f64_fadd_f64(<4 x double> %x) {
865; SSE3-SLOW-LABEL: extract_extract_v4f64_fadd_f64:
866; SSE3-SLOW:       # %bb.0:
867; SSE3-SLOW-NEXT:    movapd %xmm0, %xmm1
868; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
869; SSE3-SLOW-NEXT:    addsd %xmm0, %xmm1
870; SSE3-SLOW-NEXT:    movapd %xmm1, %xmm0
871; SSE3-SLOW-NEXT:    retq
872;
873; SSE3-FAST-LABEL: extract_extract_v4f64_fadd_f64:
874; SSE3-FAST:       # %bb.0:
875; SSE3-FAST-NEXT:    haddpd %xmm0, %xmm0
876; SSE3-FAST-NEXT:    retq
877;
878; AVX-SLOW-LABEL: extract_extract_v4f64_fadd_f64:
879; AVX-SLOW:       # %bb.0:
880; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
881; AVX-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
882; AVX-SLOW-NEXT:    vzeroupper
883; AVX-SLOW-NEXT:    retq
884;
885; AVX-FAST-LABEL: extract_extract_v4f64_fadd_f64:
886; AVX-FAST:       # %bb.0:
887; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
888; AVX-FAST-NEXT:    vzeroupper
889; AVX-FAST-NEXT:    retq
890  %x0 = extractelement <4 x double> %x, i32 0
891  %x1 = extractelement <4 x double> %x, i32 1
892  %x01 = fadd double %x0, %x1
893  ret double %x01
894}
895
896define double @extract_extract_v4f64_fadd_f64_commute(<4 x double> %x) {
897; SSE3-SLOW-LABEL: extract_extract_v4f64_fadd_f64_commute:
898; SSE3-SLOW:       # %bb.0:
899; SSE3-SLOW-NEXT:    movapd %xmm0, %xmm1
900; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
901; SSE3-SLOW-NEXT:    addsd %xmm0, %xmm1
902; SSE3-SLOW-NEXT:    movapd %xmm1, %xmm0
903; SSE3-SLOW-NEXT:    retq
904;
905; SSE3-FAST-LABEL: extract_extract_v4f64_fadd_f64_commute:
906; SSE3-FAST:       # %bb.0:
907; SSE3-FAST-NEXT:    haddpd %xmm0, %xmm0
908; SSE3-FAST-NEXT:    retq
909;
910; AVX-SLOW-LABEL: extract_extract_v4f64_fadd_f64_commute:
911; AVX-SLOW:       # %bb.0:
912; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
913; AVX-SLOW-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
914; AVX-SLOW-NEXT:    vzeroupper
915; AVX-SLOW-NEXT:    retq
916;
917; AVX-FAST-LABEL: extract_extract_v4f64_fadd_f64_commute:
918; AVX-FAST:       # %bb.0:
919; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
920; AVX-FAST-NEXT:    vzeroupper
921; AVX-FAST-NEXT:    retq
922  %x0 = extractelement <4 x double> %x, i32 0
923  %x1 = extractelement <4 x double> %x, i32 1
924  %x01 = fadd double %x1, %x0
925  ret double %x01
926}
927
928define float @extract_extract_v8f32_fsub_f32(<8 x float> %x) {
929; SSE3-SLOW-LABEL: extract_extract_v8f32_fsub_f32:
930; SSE3-SLOW:       # %bb.0:
931; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
932; SSE3-SLOW-NEXT:    subss %xmm1, %xmm0
933; SSE3-SLOW-NEXT:    retq
934;
935; SSE3-FAST-LABEL: extract_extract_v8f32_fsub_f32:
936; SSE3-FAST:       # %bb.0:
937; SSE3-FAST-NEXT:    hsubps %xmm0, %xmm0
938; SSE3-FAST-NEXT:    retq
939;
940; AVX-SLOW-LABEL: extract_extract_v8f32_fsub_f32:
941; AVX-SLOW:       # %bb.0:
942; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
943; AVX-SLOW-NEXT:    vsubss %xmm1, %xmm0, %xmm0
944; AVX-SLOW-NEXT:    vzeroupper
945; AVX-SLOW-NEXT:    retq
946;
947; AVX-FAST-LABEL: extract_extract_v8f32_fsub_f32:
948; AVX-FAST:       # %bb.0:
949; AVX-FAST-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
950; AVX-FAST-NEXT:    vzeroupper
951; AVX-FAST-NEXT:    retq
952  %x0 = extractelement <8 x float> %x, i32 0
953  %x1 = extractelement <8 x float> %x, i32 1
954  %x01 = fsub float %x0, %x1
955  ret float %x01
956}
957
958; Negative test...or get hoppy and negate?
959
960define float @extract_extract_v8f32_fsub_f32_commute(<8 x float> %x) {
961; SSE3-LABEL: extract_extract_v8f32_fsub_f32_commute:
962; SSE3:       # %bb.0:
963; SSE3-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
964; SSE3-NEXT:    subss %xmm0, %xmm1
965; SSE3-NEXT:    movaps %xmm1, %xmm0
966; SSE3-NEXT:    retq
967;
968; AVX-LABEL: extract_extract_v8f32_fsub_f32_commute:
969; AVX:       # %bb.0:
970; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
971; AVX-NEXT:    vsubss %xmm0, %xmm1, %xmm0
972; AVX-NEXT:    vzeroupper
973; AVX-NEXT:    retq
974  %x0 = extractelement <8 x float> %x, i32 0
975  %x1 = extractelement <8 x float> %x, i32 1
976  %x01 = fsub float %x1, %x0
977  ret float %x01
978}
979
980define double @extract_extract_v4f64_fsub_f64(<4 x double> %x) {
981; SSE3-SLOW-LABEL: extract_extract_v4f64_fsub_f64:
982; SSE3-SLOW:       # %bb.0:
983; SSE3-SLOW-NEXT:    movapd %xmm0, %xmm1
984; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
985; SSE3-SLOW-NEXT:    subsd %xmm1, %xmm0
986; SSE3-SLOW-NEXT:    retq
987;
988; SSE3-FAST-LABEL: extract_extract_v4f64_fsub_f64:
989; SSE3-FAST:       # %bb.0:
990; SSE3-FAST-NEXT:    hsubpd %xmm0, %xmm0
991; SSE3-FAST-NEXT:    retq
992;
993; AVX-SLOW-LABEL: extract_extract_v4f64_fsub_f64:
994; AVX-SLOW:       # %bb.0:
995; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
996; AVX-SLOW-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
997; AVX-SLOW-NEXT:    vzeroupper
998; AVX-SLOW-NEXT:    retq
999;
1000; AVX-FAST-LABEL: extract_extract_v4f64_fsub_f64:
1001; AVX-FAST:       # %bb.0:
1002; AVX-FAST-NEXT:    vhsubpd %xmm0, %xmm0, %xmm0
1003; AVX-FAST-NEXT:    vzeroupper
1004; AVX-FAST-NEXT:    retq
1005  %x0 = extractelement <4 x double> %x, i32 0
1006  %x1 = extractelement <4 x double> %x, i32 1
1007  %x01 = fsub double %x0, %x1
1008  ret double %x01
1009}
1010
1011; Negative test...or get hoppy and negate?
1012
1013define double @extract_extract_v4f64_fsub_f64_commute(<4 x double> %x) {
1014; SSE3-LABEL: extract_extract_v4f64_fsub_f64_commute:
1015; SSE3:       # %bb.0:
1016; SSE3-NEXT:    movapd %xmm0, %xmm1
1017; SSE3-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1018; SSE3-NEXT:    subsd %xmm0, %xmm1
1019; SSE3-NEXT:    movapd %xmm1, %xmm0
1020; SSE3-NEXT:    retq
1021;
1022; AVX-LABEL: extract_extract_v4f64_fsub_f64_commute:
1023; AVX:       # %bb.0:
1024; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1025; AVX-NEXT:    vsubsd %xmm0, %xmm1, %xmm0
1026; AVX-NEXT:    vzeroupper
1027; AVX-NEXT:    retq
1028  %x0 = extractelement <4 x double> %x, i32 0
1029  %x1 = extractelement <4 x double> %x, i32 1
1030  %x01 = fsub double %x1, %x0
1031  ret double %x01
1032}
1033
1034; 512-bit vectors, float/double, fadd/fsub
1035
1036define float @extract_extract_v16f32_fadd_f32(<16 x float> %x) {
1037; SSE3-SLOW-LABEL: extract_extract_v16f32_fadd_f32:
1038; SSE3-SLOW:       # %bb.0:
1039; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1040; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
1041; SSE3-SLOW-NEXT:    retq
1042;
1043; SSE3-FAST-LABEL: extract_extract_v16f32_fadd_f32:
1044; SSE3-FAST:       # %bb.0:
1045; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
1046; SSE3-FAST-NEXT:    retq
1047;
1048; AVX-SLOW-LABEL: extract_extract_v16f32_fadd_f32:
1049; AVX-SLOW:       # %bb.0:
1050; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1051; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
1052; AVX-SLOW-NEXT:    vzeroupper
1053; AVX-SLOW-NEXT:    retq
1054;
1055; AVX-FAST-LABEL: extract_extract_v16f32_fadd_f32:
1056; AVX-FAST:       # %bb.0:
1057; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
1058; AVX-FAST-NEXT:    vzeroupper
1059; AVX-FAST-NEXT:    retq
1060  %x0 = extractelement <16 x float> %x, i32 0
1061  %x1 = extractelement <16 x float> %x, i32 1
1062  %x01 = fadd float %x0, %x1
1063  ret float %x01
1064}
1065
1066define float @extract_extract_v16f32_fadd_f32_commute(<16 x float> %x) {
1067; SSE3-SLOW-LABEL: extract_extract_v16f32_fadd_f32_commute:
1068; SSE3-SLOW:       # %bb.0:
1069; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1070; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
1071; SSE3-SLOW-NEXT:    retq
1072;
1073; SSE3-FAST-LABEL: extract_extract_v16f32_fadd_f32_commute:
1074; SSE3-FAST:       # %bb.0:
1075; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
1076; SSE3-FAST-NEXT:    retq
1077;
1078; AVX-SLOW-LABEL: extract_extract_v16f32_fadd_f32_commute:
1079; AVX-SLOW:       # %bb.0:
1080; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1081; AVX-SLOW-NEXT:    vaddss %xmm0, %xmm1, %xmm0
1082; AVX-SLOW-NEXT:    vzeroupper
1083; AVX-SLOW-NEXT:    retq
1084;
1085; AVX-FAST-LABEL: extract_extract_v16f32_fadd_f32_commute:
1086; AVX-FAST:       # %bb.0:
1087; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
1088; AVX-FAST-NEXT:    vzeroupper
1089; AVX-FAST-NEXT:    retq
1090  %x0 = extractelement <16 x float> %x, i32 0
1091  %x1 = extractelement <16 x float> %x, i32 1
1092  %x01 = fadd float %x1, %x0
1093  ret float %x01
1094}
1095
1096define double @extract_extract_v8f64_fadd_f64(<8 x double> %x) {
1097; SSE3-SLOW-LABEL: extract_extract_v8f64_fadd_f64:
1098; SSE3-SLOW:       # %bb.0:
1099; SSE3-SLOW-NEXT:    movapd %xmm0, %xmm1
1100; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1101; SSE3-SLOW-NEXT:    addsd %xmm0, %xmm1
1102; SSE3-SLOW-NEXT:    movapd %xmm1, %xmm0
1103; SSE3-SLOW-NEXT:    retq
1104;
1105; SSE3-FAST-LABEL: extract_extract_v8f64_fadd_f64:
1106; SSE3-FAST:       # %bb.0:
1107; SSE3-FAST-NEXT:    haddpd %xmm0, %xmm0
1108; SSE3-FAST-NEXT:    retq
1109;
1110; AVX-SLOW-LABEL: extract_extract_v8f64_fadd_f64:
1111; AVX-SLOW:       # %bb.0:
1112; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1113; AVX-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
1114; AVX-SLOW-NEXT:    vzeroupper
1115; AVX-SLOW-NEXT:    retq
1116;
1117; AVX-FAST-LABEL: extract_extract_v8f64_fadd_f64:
1118; AVX-FAST:       # %bb.0:
1119; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
1120; AVX-FAST-NEXT:    vzeroupper
1121; AVX-FAST-NEXT:    retq
1122  %x0 = extractelement <8 x double> %x, i32 0
1123  %x1 = extractelement <8 x double> %x, i32 1
1124  %x01 = fadd double %x0, %x1
1125  ret double %x01
1126}
1127
1128define double @extract_extract_v8f64_fadd_f64_commute(<8 x double> %x) {
1129; SSE3-SLOW-LABEL: extract_extract_v8f64_fadd_f64_commute:
1130; SSE3-SLOW:       # %bb.0:
1131; SSE3-SLOW-NEXT:    movapd %xmm0, %xmm1
1132; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1133; SSE3-SLOW-NEXT:    addsd %xmm0, %xmm1
1134; SSE3-SLOW-NEXT:    movapd %xmm1, %xmm0
1135; SSE3-SLOW-NEXT:    retq
1136;
1137; SSE3-FAST-LABEL: extract_extract_v8f64_fadd_f64_commute:
1138; SSE3-FAST:       # %bb.0:
1139; SSE3-FAST-NEXT:    haddpd %xmm0, %xmm0
1140; SSE3-FAST-NEXT:    retq
1141;
1142; AVX-SLOW-LABEL: extract_extract_v8f64_fadd_f64_commute:
1143; AVX-SLOW:       # %bb.0:
1144; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1145; AVX-SLOW-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
1146; AVX-SLOW-NEXT:    vzeroupper
1147; AVX-SLOW-NEXT:    retq
1148;
1149; AVX-FAST-LABEL: extract_extract_v8f64_fadd_f64_commute:
1150; AVX-FAST:       # %bb.0:
1151; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
1152; AVX-FAST-NEXT:    vzeroupper
1153; AVX-FAST-NEXT:    retq
1154  %x0 = extractelement <8 x double> %x, i32 0
1155  %x1 = extractelement <8 x double> %x, i32 1
1156  %x01 = fadd double %x1, %x0
1157  ret double %x01
1158}
1159
1160define float @extract_extract_v16f32_fsub_f32(<16 x float> %x) {
1161; SSE3-SLOW-LABEL: extract_extract_v16f32_fsub_f32:
1162; SSE3-SLOW:       # %bb.0:
1163; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1164; SSE3-SLOW-NEXT:    subss %xmm1, %xmm0
1165; SSE3-SLOW-NEXT:    retq
1166;
1167; SSE3-FAST-LABEL: extract_extract_v16f32_fsub_f32:
1168; SSE3-FAST:       # %bb.0:
1169; SSE3-FAST-NEXT:    hsubps %xmm0, %xmm0
1170; SSE3-FAST-NEXT:    retq
1171;
1172; AVX-SLOW-LABEL: extract_extract_v16f32_fsub_f32:
1173; AVX-SLOW:       # %bb.0:
1174; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1175; AVX-SLOW-NEXT:    vsubss %xmm1, %xmm0, %xmm0
1176; AVX-SLOW-NEXT:    vzeroupper
1177; AVX-SLOW-NEXT:    retq
1178;
1179; AVX-FAST-LABEL: extract_extract_v16f32_fsub_f32:
1180; AVX-FAST:       # %bb.0:
1181; AVX-FAST-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
1182; AVX-FAST-NEXT:    vzeroupper
1183; AVX-FAST-NEXT:    retq
1184  %x0 = extractelement <16 x float> %x, i32 0
1185  %x1 = extractelement <16 x float> %x, i32 1
1186  %x01 = fsub float %x0, %x1
1187  ret float %x01
1188}
1189
1190define float @extract_extract_v16f32_fsub_f32_commute(<16 x float> %x) {
1191; SSE3-LABEL: extract_extract_v16f32_fsub_f32_commute:
1192; SSE3:       # %bb.0:
1193; SSE3-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1194; SSE3-NEXT:    subss %xmm0, %xmm1
1195; SSE3-NEXT:    movaps %xmm1, %xmm0
1196; SSE3-NEXT:    retq
1197;
1198; AVX-LABEL: extract_extract_v16f32_fsub_f32_commute:
1199; AVX:       # %bb.0:
1200; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1201; AVX-NEXT:    vsubss %xmm0, %xmm1, %xmm0
1202; AVX-NEXT:    vzeroupper
1203; AVX-NEXT:    retq
1204  %x0 = extractelement <16 x float> %x, i32 0
1205  %x1 = extractelement <16 x float> %x, i32 1
1206  %x01 = fsub float %x1, %x0
1207  ret float %x01
1208}
1209
1210define double @extract_extract_v8f64_fsub_f64(<8 x double> %x) {
1211; SSE3-SLOW-LABEL: extract_extract_v8f64_fsub_f64:
1212; SSE3-SLOW:       # %bb.0:
1213; SSE3-SLOW-NEXT:    movapd %xmm0, %xmm1
1214; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1215; SSE3-SLOW-NEXT:    subsd %xmm1, %xmm0
1216; SSE3-SLOW-NEXT:    retq
1217;
1218; SSE3-FAST-LABEL: extract_extract_v8f64_fsub_f64:
1219; SSE3-FAST:       # %bb.0:
1220; SSE3-FAST-NEXT:    hsubpd %xmm0, %xmm0
1221; SSE3-FAST-NEXT:    retq
1222;
1223; AVX-SLOW-LABEL: extract_extract_v8f64_fsub_f64:
1224; AVX-SLOW:       # %bb.0:
1225; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1226; AVX-SLOW-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
1227; AVX-SLOW-NEXT:    vzeroupper
1228; AVX-SLOW-NEXT:    retq
1229;
1230; AVX-FAST-LABEL: extract_extract_v8f64_fsub_f64:
1231; AVX-FAST:       # %bb.0:
1232; AVX-FAST-NEXT:    vhsubpd %xmm0, %xmm0, %xmm0
1233; AVX-FAST-NEXT:    vzeroupper
1234; AVX-FAST-NEXT:    retq
1235  %x0 = extractelement <8 x double> %x, i32 0
1236  %x1 = extractelement <8 x double> %x, i32 1
1237  %x01 = fsub double %x0, %x1
1238  ret double %x01
1239}
1240
1241define double @extract_extract_v8f64_fsub_f64_commute(<8 x double> %x) {
1242; SSE3-LABEL: extract_extract_v8f64_fsub_f64_commute:
1243; SSE3:       # %bb.0:
1244; SSE3-NEXT:    movapd %xmm0, %xmm1
1245; SSE3-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1246; SSE3-NEXT:    subsd %xmm0, %xmm1
1247; SSE3-NEXT:    movapd %xmm1, %xmm0
1248; SSE3-NEXT:    retq
1249;
1250; AVX-LABEL: extract_extract_v8f64_fsub_f64_commute:
1251; AVX:       # %bb.0:
1252; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1253; AVX-NEXT:    vsubsd %xmm0, %xmm1, %xmm0
1254; AVX-NEXT:    vzeroupper
1255; AVX-NEXT:    retq
1256  %x0 = extractelement <8 x double> %x, i32 0
1257  %x1 = extractelement <8 x double> %x, i32 1
1258  %x01 = fsub double %x1, %x0
1259  ret double %x01
1260}
1261
1262; Check output when 1 or both extracts have extra uses.
1263
1264define float @extract_extract_v4f32_fadd_f32_uses1(<4 x float> %x, float* %p) {
1265; SSE3-SLOW-LABEL: extract_extract_v4f32_fadd_f32_uses1:
1266; SSE3-SLOW:       # %bb.0:
1267; SSE3-SLOW-NEXT:    movss %xmm0, (%rdi)
1268; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1269; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
1270; SSE3-SLOW-NEXT:    retq
1271;
1272; SSE3-FAST-LABEL: extract_extract_v4f32_fadd_f32_uses1:
1273; SSE3-FAST:       # %bb.0:
1274; SSE3-FAST-NEXT:    movss %xmm0, (%rdi)
1275; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
1276; SSE3-FAST-NEXT:    retq
1277;
1278; AVX-SLOW-LABEL: extract_extract_v4f32_fadd_f32_uses1:
1279; AVX-SLOW:       # %bb.0:
1280; AVX-SLOW-NEXT:    vmovss %xmm0, (%rdi)
1281; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1282; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
1283; AVX-SLOW-NEXT:    retq
1284;
1285; AVX-FAST-LABEL: extract_extract_v4f32_fadd_f32_uses1:
1286; AVX-FAST:       # %bb.0:
1287; AVX-FAST-NEXT:    vmovss %xmm0, (%rdi)
1288; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
1289; AVX-FAST-NEXT:    retq
1290  %x0 = extractelement <4 x float> %x, i32 0
1291  store float %x0, float* %p
1292  %x1 = extractelement <4 x float> %x, i32 1
1293  %x01 = fadd float %x0, %x1
1294  ret float %x01
1295}
1296
1297define float @extract_extract_v4f32_fadd_f32_uses2(<4 x float> %x, float* %p) {
1298; SSE3-SLOW-LABEL: extract_extract_v4f32_fadd_f32_uses2:
1299; SSE3-SLOW:       # %bb.0:
1300; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1301; SSE3-SLOW-NEXT:    movss %xmm1, (%rdi)
1302; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
1303; SSE3-SLOW-NEXT:    retq
1304;
1305; SSE3-FAST-LABEL: extract_extract_v4f32_fadd_f32_uses2:
1306; SSE3-FAST:       # %bb.0:
1307; SSE3-FAST-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1308; SSE3-FAST-NEXT:    movss %xmm1, (%rdi)
1309; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
1310; SSE3-FAST-NEXT:    retq
1311;
1312; AVX-SLOW-LABEL: extract_extract_v4f32_fadd_f32_uses2:
1313; AVX-SLOW:       # %bb.0:
1314; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1315; AVX-SLOW-NEXT:    vmovss %xmm1, (%rdi)
1316; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
1317; AVX-SLOW-NEXT:    retq
1318;
1319; AVX-FAST-LABEL: extract_extract_v4f32_fadd_f32_uses2:
1320; AVX-FAST:       # %bb.0:
1321; AVX-FAST-NEXT:    vextractps $1, %xmm0, (%rdi)
1322; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
1323; AVX-FAST-NEXT:    retq
1324  %x0 = extractelement <4 x float> %x, i32 0
1325  %x1 = extractelement <4 x float> %x, i32 1
1326  store float %x1, float* %p
1327  %x01 = fadd float %x0, %x1
1328  ret float %x01
1329}
1330
1331define float @extract_extract_v4f32_fadd_f32_uses3(<4 x float> %x, float* %p1, float* %p2) {
1332; SSE3-LABEL: extract_extract_v4f32_fadd_f32_uses3:
1333; SSE3:       # %bb.0:
1334; SSE3-NEXT:    movss %xmm0, (%rdi)
1335; SSE3-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1336; SSE3-NEXT:    movss %xmm1, (%rsi)
1337; SSE3-NEXT:    addss %xmm1, %xmm0
1338; SSE3-NEXT:    retq
1339;
1340; AVX-LABEL: extract_extract_v4f32_fadd_f32_uses3:
1341; AVX:       # %bb.0:
1342; AVX-NEXT:    vmovss %xmm0, (%rdi)
1343; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1344; AVX-NEXT:    vmovss %xmm1, (%rsi)
1345; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
1346; AVX-NEXT:    retq
1347  %x0 = extractelement <4 x float> %x, i32 0
1348  store float %x0, float* %p1
1349  %x1 = extractelement <4 x float> %x, i32 1
1350  store float %x1, float* %p2
1351  %x01 = fadd float %x0, %x1
1352  ret float %x01
1353}
1354
1355