1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3           | FileCheck %s --check-prefixes=SSSE3,SSSE3-SLOW
3; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSSE3,SSSE3-FAST
4; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx             | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX1-SLOW
5; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops   | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX1-FAST
6; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2            | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX2-SLOW
7; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops  | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX2-FAST
8; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2-SHUF
9; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2-SHUF
10
; Even-lane shuffle of (x,y) plus odd-lane shuffle of (x,y) feeding an add:
; the canonical v8i16 horizontal-add pattern, matched to phaddw / vphaddw.
define <8 x i16> @phaddw1(<8 x i16> %x, <8 x i16> %y) {
; SSSE3-LABEL: phaddw1:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddw %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phaddw1:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %b = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %r = add <8 x i16> %a, %b
  ret <8 x i16> %r
}
26
; Same horizontal-add pattern as phaddw1 but with the second shuffle's
; sources swapped (%y,%x) and a permuted mask; each output lane still sums
; one adjacent pair, so it must still match phaddw / vphaddw.
define <8 x i16> @phaddw2(<8 x i16> %x, <8 x i16> %y) {
; SSSE3-LABEL: phaddw2:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddw %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phaddw2:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 1, i32 2, i32 5, i32 6, i32 9, i32 10, i32 13, i32 14>
  %b = shufflevector <8 x i16> %y, <8 x i16> %x, <8 x i32> <i32 8, i32 11, i32 12, i32 15, i32 0, i32 3, i32 4, i32 7>
  %r = add <8 x i16> %a, %b
  ret <8 x i16> %r
}
42
; Canonical v4i32 horizontal add: even lanes + odd lanes of (x,y),
; matched to phaddd / vphaddd.
define <4 x i32> @phaddd1(<4 x i32> %x, <4 x i32> %y) {
; SSSE3-LABEL: phaddd1:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phaddd1:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %b = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %r = add <4 x i32> %a, %b
  ret <4 x i32> %r
}
58
; v4i32 horizontal add with the second shuffle's sources swapped (%y,%x)
; and a permuted mask; lanes still sum adjacent pairs, so phaddd is expected.
define <4 x i32> @phaddd2(<4 x i32> %x, <4 x i32> %y) {
; SSSE3-LABEL: phaddd2:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phaddd2:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
  %b = shufflevector <4 x i32> %y, <4 x i32> %x, <4 x i32> <i32 4, i32 7, i32 0, i32 3>
  %r = add <4 x i32> %a, %b
  ret <4 x i32> %r
}
74
; Single-source horizontal add with an undef lane 0 in both shuffle masks;
; the undef lane must not block matching phaddd %xmm0, %xmm0.
define <4 x i32> @phaddd3(<4 x i32> %x) {
; SSSE3-LABEL: phaddd3:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phaddd3:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
  %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
  %r = add <4 x i32> %a, %b
  ret <4 x i32> %r
}
90
; Single-source horizontal add where only the low two result lanes are
; defined (upper mask lanes undef); still matched to phaddd.
define <4 x i32> @phaddd4(<4 x i32> %x) {
; SSSE3-LABEL: phaddd4:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phaddd4:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %r = add <4 x i32> %a, %b
  ret <4 x i32> %r
}
106
; Masks <0,3> and <1,2>: lane 0 sums x0+x1 and lane 1 sums x3+x2 — both
; adjacent pairs despite the crossed masks, so phaddd still applies.
define <4 x i32> @phaddd5(<4 x i32> %x) {
; SSSE3-LABEL: phaddd5:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phaddd5:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 undef, i32 undef>
  %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 undef, i32 undef>
  %r = add <4 x i32> %a, %b
  ret <4 x i32> %r
}
122
; Only one pair (x0+x1, in lane 0) is needed: a full horizontal add is only
; profitable with fast-hops, so SLOW configs use pshufd+paddd while FAST
; configs emit phaddd.
define <4 x i32> @phaddd6(<4 x i32> %x) {
; SSSE3-SLOW-LABEL: phaddd6:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSSE3-SLOW-NEXT:    paddd %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: phaddd6:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: phaddd6:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: phaddd6:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
;
; AVX2-SHUF-LABEL: phaddd6:
; AVX2-SHUF:       # %bb.0:
; AVX2-SHUF-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-SHUF-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-SHUF-NEXT:    retq
  %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %r = add <4 x i32> %a, %b
  ret <4 x i32> %r
}
156
; Only lane 1 is defined and it sums the adjacent pair x3+x2 (operands
; written in swapped order); matched to phaddd.
define <4 x i32> @phaddd7(<4 x i32> %x) {
; SSSE3-LABEL: phaddd7:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phaddd7:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 3, i32 undef, i32 undef>
  %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 undef>
  %r = add <4 x i32> %a, %b
  ret <4 x i32> %r
}
172
; Canonical v8i16 horizontal subtract: even lanes minus odd lanes,
; matched to phsubw / vphsubw.
define <8 x i16> @phsubw1(<8 x i16> %x, <8 x i16> %y) {
; SSSE3-LABEL: phsubw1:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phsubw %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phsubw1:
; AVX:       # %bb.0:
; AVX-NEXT:    vphsubw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %b = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %r = sub <8 x i16> %a, %b
  ret <8 x i16> %r
}
188
; Canonical v4i32 horizontal subtract: even lanes minus odd lanes,
; matched to phsubd / vphsubd.
define <4 x i32> @phsubd1(<4 x i32> %x, <4 x i32> %y) {
; SSSE3-LABEL: phsubd1:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phsubd %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phsubd1:
; AVX:       # %bb.0:
; AVX-NEXT:    vphsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %b = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %r = sub <4 x i32> %a, %b
  ret <4 x i32> %r
}
204
; Single-source horizontal subtract with an undef lane 0 in both masks;
; the undef lane must not block matching phsubd %xmm0, %xmm0.
define <4 x i32> @phsubd2(<4 x i32> %x) {
; SSSE3-LABEL: phsubd2:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phsubd %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phsubd2:
; AVX:       # %bb.0:
; AVX-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
  %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
  %r = sub <4 x i32> %a, %b
  ret <4 x i32> %r
}
220
; Single-source horizontal subtract where only the low two result lanes
; are defined; still matched to phsubd.
define <4 x i32> @phsubd3(<4 x i32> %x) {
; SSSE3-LABEL: phsubd3:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phsubd %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phsubd3:
; AVX:       # %bb.0:
; AVX-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %r = sub <4 x i32> %a, %b
  ret <4 x i32> %r
}
236
; Only one pair (x0-x1, in lane 0) is needed: SLOW configs prefer
; pshufd+psubd; FAST (fast-hops) configs emit phsubd.
define <4 x i32> @phsubd4(<4 x i32> %x) {
; SSSE3-SLOW-LABEL: phsubd4:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSSE3-SLOW-NEXT:    psubd %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: phsubd4:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    phsubd %xmm0, %xmm0
; SSSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: phsubd4:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX-SLOW-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: phsubd4:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
;
; AVX2-SHUF-LABEL: phsubd4:
; AVX2-SHUF:       # %bb.0:
; AVX2-SHUF-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-SHUF-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX2-SHUF-NEXT:    retq
  %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %r = sub <4 x i32> %a, %b
  ret <4 x i32> %r
}
270
; Negative test: odd lanes minus even lanes. Subtraction is not
; commutative, so this must NOT fold to phsubw; the shuffles are expanded
; and a plain psubw/vpsubw is used instead.
define <8 x i16> @phsubw1_reverse(<8 x i16> %x, <8 x i16> %y) {
; SSSE3-LABEL: phsubw1_reverse:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; SSSE3-NEXT:    movdqa %xmm1, %xmm4
; SSSE3-NEXT:    pshufb %xmm3, %xmm4
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pshufb %xmm3, %xmm2
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSSE3-NEXT:    pshufb %xmm3, %xmm1
; SSSE3-NEXT:    pshufb %xmm3, %xmm0
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT:    psubw %xmm0, %xmm2
; SSSE3-NEXT:    movdqa %xmm2, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phsubw1_reverse:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm3
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm2
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2],xmm3[3],xmm1[4],xmm3[5],xmm1[6],xmm3[7]
; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4],xmm3[5],xmm0[6],xmm3[7]
; AVX-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsubw %xmm0, %xmm2, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %b = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %r = sub <8 x i16> %a, %b
  ret <8 x i16> %r
}
305
; Negative test: odd lanes minus even lanes for v4i32 — not a phsubd
; pattern, so codegen keeps the shuffles and a plain psubd/vpsubd.
define <4 x i32> @phsubd1_reverse(<4 x i32> %x, <4 x i32> %y) {
; SSSE3-LABEL: phsubd1_reverse:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movaps %xmm0, %xmm2
; SSSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,3],xmm1[1,3]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSSE3-NEXT:    psubd %xmm0, %xmm2
; SSSE3-NEXT:    movdqa %xmm2, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phsubd1_reverse:
; AVX:       # %bb.0:
; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm0[1,3],xmm1[1,3]
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX-NEXT:    vpsubd %xmm0, %xmm2, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %b = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %r = sub <4 x i32> %a, %b
  ret <4 x i32> %r
}
327
; Single-source horizontal add where the defined pair-sums land in the
; HIGH two lanes (mask <undef,undef,0,2> / <undef,undef,1,3>); still
; matched to phaddd.
define <4 x i32> @phaddd_single_source1(<4 x i32> %x) {
; SSSE3-LABEL: phaddd_single_source1:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phaddd_single_source1:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 2>
  %r = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 3>
  %add = add <4 x i32> %l, %r
  ret <4 x i32> %add
}
343
; Same pair-sums as phaddd_single_source1 but followed by a swap of the
; two result lanes: SLOW configs lower to two pshufds + paddd, FAST
; configs to phaddd plus a lane-swapping pshufd.
define <4 x i32> @phaddd_single_source2(<4 x i32> %x) {
; SSSE3-SLOW-LABEL: phaddd_single_source2:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
; SSSE3-SLOW-NEXT:    paddd %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: phaddd_single_source2:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
; SSSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: phaddd_single_source2:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
; AVX-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: phaddd_single_source2:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
; AVX-FAST-NEXT:    retq
;
; AVX2-SHUF-LABEL: phaddd_single_source2:
; AVX2-SHUF:       # %bb.0:
; AVX2-SHUF-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
; AVX2-SHUF-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
; AVX2-SHUF-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-SHUF-NEXT:    retq
  %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 2>
  %r = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 3>
  %add = add <4 x i32> %l, %r
  %shuffle2 = shufflevector <4 x i32> %add, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 undef, i32 undef>
  ret <4 x i32> %shuffle2
}
383
; Only lane 2 of the result is defined (x0+x1); a single phaddd suffices.
define <4 x i32> @phaddd_single_source3(<4 x i32> %x) {
; SSSE3-LABEL: phaddd_single_source3:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phaddd_single_source3:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 undef>
  %r = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 undef>
  %add = add <4 x i32> %l, %r
  ret <4 x i32> %add
}
399
; Splat-of-lane-2 added to the unshuffled input: only lane 3 forms an
; adjacent pair (x2+x3). SLOW configs use pshufd+paddd; FAST configs
; recognize it as phaddd.
define <4 x i32> @phaddd_single_source4(<4 x i32> %x) {
; SSSE3-SLOW-LABEL: phaddd_single_source4:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,2,2,2]
; SSSE3-SLOW-NEXT:    paddd %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: phaddd_single_source4:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: phaddd_single_source4:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,2,2,2]
; AVX-SLOW-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: phaddd_single_source4:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
;
; AVX2-SHUF-LABEL: phaddd_single_source4:
; AVX2-SHUF:       # %bb.0:
; AVX2-SHUF-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,2,2,2]
; AVX2-SHUF-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX2-SHUF-NEXT:    retq
  %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
  %add = add <4 x i32> %l, %x
  ret <4 x i32> %add
}
432
; As phaddd_single_source4, but only the x2+x3 sum (extracted from lane 3)
; is kept: SLOW configs use two pshufds + paddd; FAST configs use phaddd
; followed by a broadcast-style pshufd of lane 3.
define <4 x i32> @phaddd_single_source5(<4 x i32> %x) {
; SSSE3-SLOW-LABEL: phaddd_single_source5:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,2,2,2]
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSSE3-SLOW-NEXT:    paddd %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: phaddd_single_source5:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: phaddd_single_source5:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,2,2,2]
; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: phaddd_single_source5:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-FAST-NEXT:    retq
;
; AVX2-SHUF-LABEL: phaddd_single_source5:
; AVX2-SHUF:       # %bb.0:
; AVX2-SHUF-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,2,2,2]
; AVX2-SHUF-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX2-SHUF-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX2-SHUF-NEXT:    retq
  %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
  %add = add <4 x i32> %l, %x
  %shuffle2 = shufflevector <4 x i32> %add, <4 x i32> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
  ret <4 x i32> %shuffle2
}
471
; The x0+x1 sum computed in lane 2 is moved to lane 1 by a trailing
; shuffle: lowered to phaddd plus a pshufd in all configs.
define <4 x i32> @phaddd_single_source6(<4 x i32> %x) {
; SSSE3-LABEL: phaddd_single_source6:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phaddd_single_source6:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX-NEXT:    retq
  %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 undef>
  %r = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 undef>
  %add = add <4 x i32> %l, %r
  %shuffle2 = shufflevector <4 x i32> %add, <4 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 undef>
  ret <4 x i32> %shuffle2
}
490
; Single-source v8i16 horizontal add with the pair-sums landing in the
; high four lanes; matched to phaddw.
define <8 x i16> @phaddw_single_source1(<8 x i16> %x) {
; SSSE3-LABEL: phaddw_single_source1:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddw %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phaddw_single_source1:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 2, i32 4, i32 6>
  %r = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 3, i32 5, i32 7>
  %add = add <8 x i16> %l, %r
  ret <8 x i16> %add
}
506
; Horizontal pair-sums followed by a reversing shuffle of the low lanes:
; SLOW configs lower to two pshuflw + paddw; FAST configs to phaddw plus
; a lane-swapping pshuflw.
define <8 x i16> @phaddw_single_source2(<8 x i16> %x) {
; SSSE3-SLOW-LABEL: phaddw_single_source2:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[3,1,2,3,4,5,6,7]
; SSSE3-SLOW-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; SSSE3-SLOW-NEXT:    paddw %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: phaddw_single_source2:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    phaddw %xmm0, %xmm0
; SSSE3-FAST-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,0,2,3,4,5,6,7]
; SSSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: phaddw_single_source2:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[3,1,2,3,4,5,6,7]
; AVX-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: phaddw_single_source2:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,0,2,3,4,5,6,7]
; AVX-FAST-NEXT:    retq
;
; AVX2-SHUF-LABEL: phaddw_single_source2:
; AVX2-SHUF:       # %bb.0:
; AVX2-SHUF-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[3,1,2,3,4,5,6,7]
; AVX2-SHUF-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX2-SHUF-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-SHUF-NEXT:    retq
  %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 2, i32 4, i32 6>
  %r = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 3, i32 5, i32 7>
  %add = add <8 x i16> %l, %r
  %shuffle2 = shufflevector <8 x i16> %add, <8 x i16> undef, <8 x i32> <i32 5, i32 4, i32 3, i32 2, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <8 x i16> %shuffle2
}
546
; Only two pair-sums (lanes 4 and 5) are defined; still matched to phaddw.
define <8 x i16> @phaddw_single_source3(<8 x i16> %x) {
; SSSE3-LABEL: phaddw_single_source3:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddw %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phaddw_single_source3:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 2, i32 undef, i32 undef>
  %r = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 3, i32 undef, i32 undef>
  %add = add <8 x i16> %l, %r
  ret <8 x i16> %add
}
562
; Lane-6-into-lane-7 shuffle added to the unshuffled input: SLOW configs
; implement the shift-and-add with pslld+paddw; FAST (fast-hops) configs
; match it as phaddw.
define <8 x i16> @phaddw_single_source4(<8 x i16> %x) {
; SSSE3-SLOW-LABEL: phaddw_single_source4:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    movdqa %xmm0, %xmm1
; SSSE3-SLOW-NEXT:    pslld $16, %xmm1
; SSSE3-SLOW-NEXT:    paddw %xmm0, %xmm1
; SSSE3-SLOW-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: phaddw_single_source4:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    phaddw %xmm0, %xmm0
; SSSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: phaddw_single_source4:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpslld $16, %xmm0, %xmm1
; AVX-SLOW-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: phaddw_single_source4:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
;
; AVX2-SHUF-LABEL: phaddw_single_source4:
; AVX2-SHUF:       # %bb.0:
; AVX2-SHUF-NEXT:    vpslld $16, %xmm0, %xmm1
; AVX2-SHUF-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
; AVX2-SHUF-NEXT:    retq
  %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 6>
  %add = add <8 x i16> %l, %x
  ret <8 x i16> %add
}
597
; The x0+x1 sum computed in lane 4 is extracted into lane 1 by a trailing
; shuffle: lowered to phaddw plus a byte shift (psrldq) in all configs.
define <8 x i16> @phaddw_single_source6(<8 x i16> %x) {
; SSSE3-LABEL: phaddw_single_source6:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddw %xmm0, %xmm0
; SSSE3-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phaddw_single_source6:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; AVX-NEXT:    retq
  %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 undef, i32 undef, i32 undef>
  %r = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef>
  %add = add <8 x i16> %l, %r
  %shuffle2 = shufflevector <8 x i16> %add, <8 x i16> undef, <8 x i32> <i32 undef, i32 4, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <8 x i16> %shuffle2
}
616
617; PR39921 + PR39936
; Full v8i32-to-i32 add reduction expressed as three shuffle+add steps.
; Expected lowering: extract the high 128-bit half, phaddd it with the low
; half, then two more reduction steps — phaddd each time under fast-hops,
; pshufd+paddd for the final step otherwise.
define i32 @PR39936_v8i32(<8 x i32>) {
; SSSE3-SLOW-LABEL: PR39936_v8i32:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    phaddd %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSSE3-SLOW-NEXT:    paddd %xmm0, %xmm1
; SSSE3-SLOW-NEXT:    movd %xmm1, %eax
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: PR39936_v8i32:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    phaddd %xmm1, %xmm0
; SSSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-FAST-NEXT:    movd %xmm0, %eax
; SSSE3-FAST-NEXT:    retq
;
; AVX1-SLOW-LABEL: PR39936_v8i32:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: PR39936_v8i32:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-SLOW-LABEL: PR39936_v8i32:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-SLOW-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX2-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX2-SLOW-NEXT:    vzeroupper
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: PR39936_v8i32:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-FAST-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX2-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX2-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX2-FAST-NEXT:    vmovd %xmm0, %eax
; AVX2-FAST-NEXT:    vzeroupper
; AVX2-FAST-NEXT:    retq
;
; AVX2-SHUF-LABEL: PR39936_v8i32:
; AVX2-SHUF:       # %bb.0:
; AVX2-SHUF-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-SHUF-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX2-SHUF-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX2-SHUF-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-SHUF-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX2-SHUF-NEXT:    vmovd %xmm0, %eax
; AVX2-SHUF-NEXT:    vzeroupper
; AVX2-SHUF-NEXT:    retq
  %2 = shufflevector <8 x i32> %0, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
  %3 = shufflevector <8 x i32> %0, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %4 = add <8 x i32> %2, %3
  %5 = shufflevector <8 x i32> %4, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %6 = shufflevector <8 x i32> %4, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %7 = add <8 x i32> %5, %6
  %8 = shufflevector <8 x i32> %7, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %9 = add <8 x i32> %8, %7
  %10 = extractelement <8 x i32> %9, i32 0
  ret i32 %10
}
699
700