; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3           | FileCheck %s --check-prefixes=SSSE3,SSSE3-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSSE3,SSSE3-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx             | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX1,AVX1-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops   | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX1,AVX1-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2            | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX2,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops  | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX2,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SHUF

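; Check that (add/sub (shuffle X, Y, <even lanes>), (shuffle X, Y, <odd lanes>))
; is matched to the SSSE3/AVX horizontal ops phaddw/phaddd/phsubw/phsubd, and
; that the -SLOW/-FAST run lines cover how the fast-hops attribute decides
; between a horizontal op and a shuffle+add/sub sequence when only a few lanes
; of the result are used.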
define <8 x i16> @phaddw1(<8 x i16> %x, <8 x i16> %y) {
; SSSE3-LABEL: phaddw1:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddw %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phaddw1:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %b = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %r = add <8 x i16> %a, %b
  ret <8 x i16> %r
}

define <8 x i16> @phaddw2(<8 x i16> %x, <8 x i16> %y) {
; SSSE3-LABEL: phaddw2:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddw %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phaddw2:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 1, i32 2, i32 5, i32 6, i32 9, i32 10, i32 13, i32 14>
  %b = shufflevector <8 x i16> %y, <8 x i16> %x, <8 x i32> <i32 8, i32 11, i32 12, i32 15, i32 0, i32 3, i32 4, i32 7>
  %r = add <8 x i16> %a, %b
  ret <8 x i16> %r
}

define <4 x i32> @phaddd1(<4 x i32> %x, <4 x i32> %y) {
; SSSE3-LABEL: phaddd1:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phaddd1:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %b = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %r = add <4 x i32> %a, %b
  ret <4 x i32> %r
}

define <4 x i32> @phaddd2(<4 x i32> %x, <4 x i32> %y) {
; SSSE3-LABEL: phaddd2:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phaddd2:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
  %b = shufflevector <4 x i32> %y, <4 x i32> %x, <4 x i32> <i32 4, i32 7, i32 0, i32 3>
  %r = add <4 x i32> %a, %b
  ret <4 x i32> %r
}

define <4 x i32> @phaddd3(<4 x i32> %x) {
; SSSE3-LABEL: phaddd3:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phaddd3:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
  %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
  %r = add <4 x i32> %a, %b
  ret <4 x i32> %r
}

define <4 x i32> @phaddd4(<4 x i32> %x) {
; SSSE3-LABEL: phaddd4:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phaddd4:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %r = add <4 x i32> %a, %b
  ret <4 x i32> %r
}

define <4 x i32> @phaddd5(<4 x i32> %x) {
; SSSE3-LABEL: phaddd5:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phaddd5:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 undef, i32 undef>
  %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 undef, i32 undef>
  %r = add <4 x i32> %a, %b
  ret <4 x i32> %r
}

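; Only the low element of the result is demanded here, so the horizontal op is
; formed only when fast-hops is enabled; otherwise a pshufd+paddd sequence is
; preferred.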
define <4 x i32> @phaddd6(<4 x i32> %x) {
; SSSE3-SLOW-LABEL: phaddd6:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSSE3-SLOW-NEXT:    paddd %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: phaddd6:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: phaddd6:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: phaddd6:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
;
; AVX2-SHUF-LABEL: phaddd6:
; AVX2-SHUF:       # %bb.0:
; AVX2-SHUF-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-SHUF-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-SHUF-NEXT:    retq
  %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %r = add <4 x i32> %a, %b
  ret <4 x i32> %r
}

define <4 x i32> @phaddd7(<4 x i32> %x) {
; SSSE3-LABEL: phaddd7:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phaddd7:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 3, i32 undef, i32 undef>
  %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 undef>
  %r = add <4 x i32> %a, %b
  ret <4 x i32> %r
}

define <8 x i16> @phsubw1(<8 x i16> %x, <8 x i16> %y) {
; SSSE3-LABEL: phsubw1:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phsubw %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phsubw1:
; AVX:       # %bb.0:
; AVX-NEXT:    vphsubw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %b = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %r = sub <8 x i16> %a, %b
  ret <8 x i16> %r
}

define <4 x i32> @phsubd1(<4 x i32> %x, <4 x i32> %y) {
; SSSE3-LABEL: phsubd1:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phsubd %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phsubd1:
; AVX:       # %bb.0:
; AVX-NEXT:    vphsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %b = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %r = sub <4 x i32> %a, %b
  ret <4 x i32> %r
}

define <4 x i32> @phsubd2(<4 x i32> %x) {
; SSSE3-LABEL: phsubd2:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phsubd %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phsubd2:
; AVX:       # %bb.0:
; AVX-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
  %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
  %r = sub <4 x i32> %a, %b
  ret <4 x i32> %r
}

define <4 x i32> @phsubd3(<4 x i32> %x) {
; SSSE3-LABEL: phsubd3:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phsubd %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phsubd3:
; AVX:       # %bb.0:
; AVX-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %r = sub <4 x i32> %a, %b
  ret <4 x i32> %r
}

define <4 x i32> @phsubd4(<4 x i32> %x) {
; SSSE3-SLOW-LABEL: phsubd4:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSSE3-SLOW-NEXT:    psubd %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: phsubd4:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    phsubd %xmm0, %xmm0
; SSSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: phsubd4:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-SLOW-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: phsubd4:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
;
; AVX2-SHUF-LABEL: phsubd4:
; AVX2-SHUF:       # %bb.0:
; AVX2-SHUF-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-SHUF-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX2-SHUF-NEXT:    retq
  %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %r = sub <4 x i32> %a, %b
  ret <4 x i32> %r
}

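; The _reverse tests swap the odd/even operands of the subtraction; since sub
; is not commutative this cannot be matched to phsubw/phsubd and falls back to
; explicit shuffles.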
define <8 x i16> @phsubw1_reverse(<8 x i16> %x, <8 x i16> %y) {
; SSSE3-LABEL: phsubw1_reverse:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; SSSE3-NEXT:    movdqa %xmm1, %xmm4
; SSSE3-NEXT:    pshufb %xmm3, %xmm4
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pshufb %xmm3, %xmm2
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSSE3-NEXT:    pshufb %xmm3, %xmm1
; SSSE3-NEXT:    pshufb %xmm3, %xmm0
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT:    psubw %xmm0, %xmm2
; SSSE3-NEXT:    movdqa %xmm2, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phsubw1_reverse:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm3
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm2
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2],xmm3[3],xmm1[4],xmm3[5],xmm1[6],xmm3[7]
; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4],xmm3[5],xmm0[6],xmm3[7]
; AVX-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsubw %xmm0, %xmm2, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %b = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %r = sub <8 x i16> %a, %b
  ret <8 x i16> %r
}

define <4 x i32> @phsubd1_reverse(<4 x i32> %x, <4 x i32> %y) {
; SSSE3-LABEL: phsubd1_reverse:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movaps %xmm0, %xmm2
; SSSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,3],xmm1[1,3]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSSE3-NEXT:    psubd %xmm0, %xmm2
; SSSE3-NEXT:    movdqa %xmm2, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phsubd1_reverse:
; AVX:       # %bb.0:
; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm0[1,3],xmm1[1,3]
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX-NEXT:    vpsubd %xmm0, %xmm2, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %b = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %r = sub <4 x i32> %a, %b
  ret <4 x i32> %r
}

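; The _single_source tests feed both shuffle operands from the same vector;
; these should still be matched to a horizontal op of the register with itself.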
define <4 x i32> @phaddd_single_source1(<4 x i32> %x) {
; SSSE3-LABEL: phaddd_single_source1:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phaddd_single_source1:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 2>
  %r = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 3>
  %add = add <4 x i32> %l, %r
  ret <4 x i32> %add
}

define <4 x i32> @phaddd_single_source2(<4 x i32> %x) {
; SSSE3-LABEL: phaddd_single_source2:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phaddd_single_source2:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
; AVX-NEXT:    retq
  %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 2>
  %r = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 3>
  %add = add <4 x i32> %l, %r
  %shuffle2 = shufflevector <4 x i32> %add, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 undef, i32 undef>
  ret <4 x i32> %shuffle2
}

define <4 x i32> @phaddd_single_source3(<4 x i32> %x) {
; SSSE3-LABEL: phaddd_single_source3:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phaddd_single_source3:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 undef>
  %r = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 undef>
  %add = add <4 x i32> %l, %r
  ret <4 x i32> %add
}

define <4 x i32> @phaddd_single_source4(<4 x i32> %x) {
; SSSE3-SLOW-LABEL: phaddd_single_source4:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,2,2]
; SSSE3-SLOW-NEXT:    paddd %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: phaddd_single_source4:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: phaddd_single_source4:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,1,2,2]
; AVX-SLOW-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: phaddd_single_source4:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
;
; AVX2-SHUF-LABEL: phaddd_single_source4:
; AVX2-SHUF:       # %bb.0:
; AVX2-SHUF-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,1,2,2]
; AVX2-SHUF-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX2-SHUF-NEXT:    retq
  %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
  %add = add <4 x i32> %l, %x
  ret <4 x i32> %add
}

define <4 x i32> @phaddd_single_source5(<4 x i32> %x) {
; SSSE3-SLOW-LABEL: phaddd_single_source5:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,2,2]
; SSSE3-SLOW-NEXT:    paddd %xmm0, %xmm1
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: phaddd_single_source5:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: phaddd_single_source5:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,1,2,2]
; AVX-SLOW-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: phaddd_single_source5:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX-FAST-NEXT:    retq
;
; AVX2-SHUF-LABEL: phaddd_single_source5:
; AVX2-SHUF:       # %bb.0:
; AVX2-SHUF-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,1,2,2]
; AVX2-SHUF-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX2-SHUF-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX2-SHUF-NEXT:    retq
  %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
  %add = add <4 x i32> %l, %x
  %shuffle2 = shufflevector <4 x i32> %add, <4 x i32> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
  ret <4 x i32> %shuffle2
}

define <4 x i32> @phaddd_single_source6(<4 x i32> %x) {
; SSSE3-LABEL: phaddd_single_source6:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phaddd_single_source6:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX-NEXT:    retq
  %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 undef>
  %r = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 undef>
  %add = add <4 x i32> %l, %r
  %shuffle2 = shufflevector <4 x i32> %add, <4 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 undef>
  ret <4 x i32> %shuffle2
}

define <8 x i16> @phaddw_single_source1(<8 x i16> %x) {
; SSSE3-LABEL: phaddw_single_source1:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddw %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phaddw_single_source1:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 2, i32 4, i32 6>
  %r = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 3, i32 5, i32 7>
  %add = add <8 x i16> %l, %r
  ret <8 x i16> %add
}

define <8 x i16> @phaddw_single_source2(<8 x i16> %x) {
; SSSE3-LABEL: phaddw_single_source2:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddw %xmm0, %xmm0
; SSSE3-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
; SSSE3-NEXT:    retq
;
; AVX-SLOW-LABEL: phaddw_single_source2:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX-SLOW-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7]
; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: phaddw_single_source2:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7]
; AVX-FAST-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
; AVX-FAST-NEXT:    retq
;
; AVX2-SHUF-LABEL: phaddw_single_source2:
; AVX2-SHUF:       # %bb.0:
; AVX2-SHUF-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX2-SHUF-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,4,5,6,7,10,11,8,9,12,13,14,15]
; AVX2-SHUF-NEXT:    retq
  %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 2, i32 4, i32 6>
  %r = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 3, i32 5, i32 7>
  %add = add <8 x i16> %l, %r
  %shuffle2 = shufflevector <8 x i16> %add, <8 x i16> undef, <8 x i32> <i32 5, i32 4, i32 3, i32 2, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <8 x i16> %shuffle2
}

define <8 x i16> @phaddw_single_source3(<8 x i16> %x) {
; SSSE3-LABEL: phaddw_single_source3:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddw %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phaddw_single_source3:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 2, i32 undef, i32 undef>
  %r = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 3, i32 undef, i32 undef>
  %add = add <8 x i16> %l, %r
  ret <8 x i16> %add
}

define <8 x i16> @phaddw_single_source4(<8 x i16> %x) {
; SSSE3-SLOW-LABEL: phaddw_single_source4:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    movdqa %xmm0, %xmm1
; SSSE3-SLOW-NEXT:    pslld $16, %xmm1
; SSSE3-SLOW-NEXT:    paddw %xmm0, %xmm1
; SSSE3-SLOW-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: phaddw_single_source4:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    phaddw %xmm0, %xmm0
; SSSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: phaddw_single_source4:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpslld $16, %xmm0, %xmm1
; AVX-SLOW-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: phaddw_single_source4:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
;
; AVX2-SHUF-LABEL: phaddw_single_source4:
; AVX2-SHUF:       # %bb.0:
; AVX2-SHUF-NEXT:    vpslld $16, %xmm0, %xmm1
; AVX2-SHUF-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
; AVX2-SHUF-NEXT:    retq
  %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 6>
  %add = add <8 x i16> %l, %x
  ret <8 x i16> %add
}

define <8 x i16> @phaddw_single_source6(<8 x i16> %x) {
; SSSE3-LABEL: phaddw_single_source6:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddw %xmm0, %xmm0
; SSSE3-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phaddw_single_source6:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; AVX-NEXT:    retq
  %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 undef, i32 undef, i32 undef>
  %r = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef>
  %add = add <8 x i16> %l, %r
  %shuffle2 = shufflevector <8 x i16> %add, <8 x i16> undef, <8 x i32> <i32 undef, i32 4, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <8 x i16> %shuffle2
}

; PR39921 + PR39936
define i32 @PR39936_v8i32(<8 x i32>) {
; SSSE3-SLOW-LABEL: PR39936_v8i32:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    phaddd %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSSE3-SLOW-NEXT:    paddd %xmm0, %xmm1
; SSSE3-SLOW-NEXT:    movd %xmm1, %eax
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: PR39936_v8i32:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    phaddd %xmm1, %xmm0
; SSSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-FAST-NEXT:    movd %xmm0, %eax
; SSSE3-FAST-NEXT:    retq
;
; AVX1-SLOW-LABEL: PR39936_v8i32:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-SLOW-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: PR39936_v8i32:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-SLOW-LABEL: PR39936_v8i32:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-SLOW-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX2-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX2-SLOW-NEXT:    vzeroupper
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: PR39936_v8i32:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-FAST-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX2-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX2-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX2-FAST-NEXT:    vmovd %xmm0, %eax
; AVX2-FAST-NEXT:    vzeroupper
; AVX2-FAST-NEXT:    retq
;
; AVX2-SHUF-LABEL: PR39936_v8i32:
; AVX2-SHUF:       # %bb.0:
; AVX2-SHUF-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-SHUF-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX2-SHUF-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX2-SHUF-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-SHUF-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX2-SHUF-NEXT:    vmovd %xmm0, %eax
; AVX2-SHUF-NEXT:    vzeroupper
; AVX2-SHUF-NEXT:    retq
  %2 = shufflevector <8 x i32> %0, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
  %3 = shufflevector <8 x i32> %0, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %4 = add <8 x i32> %2, %3
  %5 = shufflevector <8 x i32> %4, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %6 = shufflevector <8 x i32> %4, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %7 = add <8 x i32> %5, %6
  %8 = shufflevector <8 x i32> %7, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %9 = add <8 x i32> %8, %7
  %10 = extractelement <8 x i32> %9, i32 0
  ret i32 %10
}