; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2            | FileCheck %s --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3           | FileCheck %s --check-prefix=SSSE3-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefix=SSSE3-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx             | FileCheck %s --check-prefix=AVX1-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops   | FileCheck %s --check-prefix=AVX1-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2            | FileCheck %s --check-prefix=AVX2

; PR37890 - subvector reduction followed by shuffle reduction

11define i32 @PR37890_v4i32(<4 x i32> %a)  {
12; SSE2-LABEL: PR37890_v4i32:
13; SSE2:       # %bb.0:
14; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
15; SSE2-NEXT:    paddd %xmm0, %xmm1
16; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
17; SSE2-NEXT:    paddd %xmm1, %xmm0
18; SSE2-NEXT:    movd %xmm0, %eax
19; SSE2-NEXT:    retq
20;
21; SSSE3-SLOW-LABEL: PR37890_v4i32:
22; SSSE3-SLOW:       # %bb.0:
23; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
24; SSSE3-SLOW-NEXT:    paddd %xmm0, %xmm1
25; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
26; SSSE3-SLOW-NEXT:    paddd %xmm1, %xmm0
27; SSSE3-SLOW-NEXT:    movd %xmm0, %eax
28; SSSE3-SLOW-NEXT:    retq
29;
30; SSSE3-FAST-LABEL: PR37890_v4i32:
31; SSSE3-FAST:       # %bb.0:
32; SSSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
33; SSSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
34; SSSE3-FAST-NEXT:    movd %xmm0, %eax
35; SSSE3-FAST-NEXT:    retq
36;
37; AVX1-SLOW-LABEL: PR37890_v4i32:
38; AVX1-SLOW:       # %bb.0:
39; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
40; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
41; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
42; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
43; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
44; AVX1-SLOW-NEXT:    retq
45;
46; AVX1-FAST-LABEL: PR37890_v4i32:
47; AVX1-FAST:       # %bb.0:
48; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
49; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
50; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
51; AVX1-FAST-NEXT:    retq
52;
53; AVX2-LABEL: PR37890_v4i32:
54; AVX2:       # %bb.0:
55; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
56; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
57; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
58; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
59; AVX2-NEXT:    vmovd %xmm0, %eax
60; AVX2-NEXT:    retq
61  %hi0 = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
62  %lo0 = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
63  %sum0 = add <2 x i32> %lo0, %hi0
64  %hi1 = shufflevector <2 x i32> %sum0, <2 x i32> undef, <2 x i32> <i32 1, i32 undef>
65  %sum1 = add <2 x i32> %sum0, %hi1
66  %e = extractelement <2 x i32> %sum1, i32 0
67  ret i32 %e
68}
70define i16 @PR37890_v8i16(<8 x i16> %a)  {
71; SSE2-LABEL: PR37890_v8i16:
72; SSE2:       # %bb.0:
73; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
74; SSE2-NEXT:    paddw %xmm0, %xmm1
75; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
76; SSE2-NEXT:    paddw %xmm1, %xmm0
77; SSE2-NEXT:    movdqa %xmm0, %xmm1
78; SSE2-NEXT:    psrld $16, %xmm1
79; SSE2-NEXT:    paddw %xmm0, %xmm1
80; SSE2-NEXT:    movd %xmm1, %eax
81; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
82; SSE2-NEXT:    retq
83;
84; SSSE3-SLOW-LABEL: PR37890_v8i16:
85; SSSE3-SLOW:       # %bb.0:
86; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
87; SSSE3-SLOW-NEXT:    paddw %xmm0, %xmm1
88; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
89; SSSE3-SLOW-NEXT:    paddw %xmm1, %xmm0
90; SSSE3-SLOW-NEXT:    movdqa %xmm0, %xmm1
91; SSSE3-SLOW-NEXT:    psrld $16, %xmm1
92; SSSE3-SLOW-NEXT:    paddw %xmm0, %xmm1
93; SSSE3-SLOW-NEXT:    movd %xmm1, %eax
94; SSSE3-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
95; SSSE3-SLOW-NEXT:    retq
96;
97; SSSE3-FAST-LABEL: PR37890_v8i16:
98; SSSE3-FAST:       # %bb.0:
99; SSSE3-FAST-NEXT:    phaddw %xmm0, %xmm0
100; SSSE3-FAST-NEXT:    phaddw %xmm0, %xmm0
101; SSSE3-FAST-NEXT:    phaddw %xmm0, %xmm0
102; SSSE3-FAST-NEXT:    movd %xmm0, %eax
103; SSSE3-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
104; SSSE3-FAST-NEXT:    retq
105;
106; AVX1-SLOW-LABEL: PR37890_v8i16:
107; AVX1-SLOW:       # %bb.0:
108; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
109; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
110; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
111; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
112; AVX1-SLOW-NEXT:    vpsrld $16, %xmm0, %xmm1
113; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
114; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
115; AVX1-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
116; AVX1-SLOW-NEXT:    retq
117;
118; AVX1-FAST-LABEL: PR37890_v8i16:
119; AVX1-FAST:       # %bb.0:
120; AVX1-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
121; AVX1-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
122; AVX1-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
123; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
124; AVX1-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
125; AVX1-FAST-NEXT:    retq
126;
127; AVX2-LABEL: PR37890_v8i16:
128; AVX2:       # %bb.0:
129; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
130; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
131; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
132; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
133; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
134; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
135; AVX2-NEXT:    vmovd %xmm0, %eax
136; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
137; AVX2-NEXT:    retq
138  %hi0 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
139  %lo0 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
140  %sum0 = add <4 x i16> %lo0, %hi0
141  %hi1 = shufflevector <4 x i16> %sum0, <4 x i16> undef, <2 x i32> <i32 2, i32 3>
142  %lo1 = shufflevector <4 x i16> %sum0, <4 x i16> undef, <2 x i32> <i32 0, i32 1>
143  %sum1 = add <2 x i16> %lo1, %hi1
144  %hi2 = shufflevector <2 x i16> %sum1, <2 x i16> undef, <2 x i32> <i32 1, i32 undef>
145  %sum2 = add <2 x i16> %sum1, %hi2
146  %e = extractelement <2 x i16> %sum2, i32 0
147  ret i16 %e
148}
150define i32 @PR37890_v8i32(<8 x i32> %a)  {
151; SSE2-LABEL: PR37890_v8i32:
152; SSE2:       # %bb.0:
153; SSE2-NEXT:    paddd %xmm1, %xmm0
154; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
155; SSE2-NEXT:    paddd %xmm0, %xmm1
156; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
157; SSE2-NEXT:    paddd %xmm1, %xmm0
158; SSE2-NEXT:    movd %xmm0, %eax
159; SSE2-NEXT:    retq
160;
161; SSSE3-SLOW-LABEL: PR37890_v8i32:
162; SSSE3-SLOW:       # %bb.0:
163; SSSE3-SLOW-NEXT:    paddd %xmm1, %xmm0
164; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
165; SSSE3-SLOW-NEXT:    paddd %xmm0, %xmm1
166; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
167; SSSE3-SLOW-NEXT:    paddd %xmm1, %xmm0
168; SSSE3-SLOW-NEXT:    movd %xmm0, %eax
169; SSSE3-SLOW-NEXT:    retq
170;
171; SSSE3-FAST-LABEL: PR37890_v8i32:
172; SSSE3-FAST:       # %bb.0:
173; SSSE3-FAST-NEXT:    paddd %xmm1, %xmm0
174; SSSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
175; SSSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
176; SSSE3-FAST-NEXT:    movd %xmm0, %eax
177; SSSE3-FAST-NEXT:    retq
178;
179; AVX1-SLOW-LABEL: PR37890_v8i32:
180; AVX1-SLOW:       # %bb.0:
181; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
182; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
183; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
184; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
185; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
186; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
187; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
188; AVX1-SLOW-NEXT:    vzeroupper
189; AVX1-SLOW-NEXT:    retq
190;
191; AVX1-FAST-LABEL: PR37890_v8i32:
192; AVX1-FAST:       # %bb.0:
193; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
194; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm1, %xmm0
195; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
196; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
197; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
198; AVX1-FAST-NEXT:    vzeroupper
199; AVX1-FAST-NEXT:    retq
200;
201; AVX2-LABEL: PR37890_v8i32:
202; AVX2:       # %bb.0:
203; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
204; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
205; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
206; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
207; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
208; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
209; AVX2-NEXT:    vmovd %xmm0, %eax
210; AVX2-NEXT:    vzeroupper
211; AVX2-NEXT:    retq
212  %hi0 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
213  %lo0 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
214  %sum0 = add <4 x i32> %lo0, %hi0
215  %hi1 = shufflevector <4 x i32> %sum0, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
216  %lo1 = shufflevector <4 x i32> %sum0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
217  %sum1 = add <2 x i32> %lo1, %hi1
218  %hi2 = shufflevector <2 x i32> %sum1, <2 x i32> undef, <2 x i32> <i32 1, i32 undef>
219  %sum2 = add <2 x i32> %sum1, %hi2
220  %e = extractelement <2 x i32> %sum2, i32 0
221  ret i32 %e
222}
224define i16 @PR37890_v16i16(<16 x i16> %a)  {
225; SSE2-LABEL: PR37890_v16i16:
226; SSE2:       # %bb.0:
227; SSE2-NEXT:    paddw %xmm1, %xmm0
228; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
229; SSE2-NEXT:    paddw %xmm0, %xmm1
230; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
231; SSE2-NEXT:    paddw %xmm1, %xmm0
232; SSE2-NEXT:    movdqa %xmm0, %xmm1
233; SSE2-NEXT:    psrld $16, %xmm1
234; SSE2-NEXT:    paddw %xmm0, %xmm1
235; SSE2-NEXT:    movd %xmm1, %eax
236; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
237; SSE2-NEXT:    retq
238;
239; SSSE3-SLOW-LABEL: PR37890_v16i16:
240; SSSE3-SLOW:       # %bb.0:
241; SSSE3-SLOW-NEXT:    paddw %xmm1, %xmm0
242; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
243; SSSE3-SLOW-NEXT:    paddw %xmm0, %xmm1
244; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
245; SSSE3-SLOW-NEXT:    paddw %xmm1, %xmm0
246; SSSE3-SLOW-NEXT:    movdqa %xmm0, %xmm1
247; SSSE3-SLOW-NEXT:    psrld $16, %xmm1
248; SSSE3-SLOW-NEXT:    paddw %xmm0, %xmm1
249; SSSE3-SLOW-NEXT:    movd %xmm1, %eax
250; SSSE3-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
251; SSSE3-SLOW-NEXT:    retq
252;
253; SSSE3-FAST-LABEL: PR37890_v16i16:
254; SSSE3-FAST:       # %bb.0:
255; SSSE3-FAST-NEXT:    paddw %xmm1, %xmm0
256; SSSE3-FAST-NEXT:    phaddw %xmm0, %xmm0
257; SSSE3-FAST-NEXT:    phaddw %xmm0, %xmm0
258; SSSE3-FAST-NEXT:    phaddw %xmm0, %xmm0
259; SSSE3-FAST-NEXT:    movd %xmm0, %eax
260; SSSE3-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
261; SSSE3-FAST-NEXT:    retq
262;
263; AVX1-SLOW-LABEL: PR37890_v16i16:
264; AVX1-SLOW:       # %bb.0:
265; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
266; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
267; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
268; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
269; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
270; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
271; AVX1-SLOW-NEXT:    vpsrld $16, %xmm0, %xmm1
272; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
273; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
274; AVX1-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
275; AVX1-SLOW-NEXT:    vzeroupper
276; AVX1-SLOW-NEXT:    retq
277;
278; AVX1-FAST-LABEL: PR37890_v16i16:
279; AVX1-FAST:       # %bb.0:
280; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
281; AVX1-FAST-NEXT:    vphaddw %xmm0, %xmm1, %xmm0
282; AVX1-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
283; AVX1-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
284; AVX1-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
285; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
286; AVX1-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
287; AVX1-FAST-NEXT:    vzeroupper
288; AVX1-FAST-NEXT:    retq
289;
290; AVX2-LABEL: PR37890_v16i16:
291; AVX2:       # %bb.0:
292; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
293; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
294; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
295; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
296; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
297; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
298; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
299; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
300; AVX2-NEXT:    vmovd %xmm0, %eax
301; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
302; AVX2-NEXT:    vzeroupper
303; AVX2-NEXT:    retq
304  %hi0 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
305  %lo0 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
306  %sum0 = add <8 x i16> %lo0, %hi0
307  %hi1 = shufflevector <8 x i16> %sum0, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
308  %lo1 = shufflevector <8 x i16> %sum0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
309  %sum1 = add <4 x i16> %lo1, %hi1
310  %hi2 = shufflevector <4 x i16> %sum1, <4 x i16> undef, <2 x i32> <i32 2, i32 3>
311  %lo2 = shufflevector <4 x i16> %sum1, <4 x i16> undef, <2 x i32> <i32 0, i32 1>
312  %sum2 = add <2 x i16> %lo2, %hi2
313  %hi3 = shufflevector <2 x i16> %sum2, <2 x i16> undef, <2 x i32> <i32 1, i32 undef>
314  %sum3 = add <2 x i16> %sum2, %hi3
315  %e = extractelement <2 x i16> %sum3, i32 0
316  ret i16 %e
317}
319define i32 @PR37890_v16i32(<16 x i32> %a)  {
320; SSE2-LABEL: PR37890_v16i32:
321; SSE2:       # %bb.0:
322; SSE2-NEXT:    paddd %xmm3, %xmm1
323; SSE2-NEXT:    paddd %xmm2, %xmm1
324; SSE2-NEXT:    paddd %xmm0, %xmm1
325; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
326; SSE2-NEXT:    paddd %xmm1, %xmm0
327; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
328; SSE2-NEXT:    paddd %xmm0, %xmm1
329; SSE2-NEXT:    movd %xmm1, %eax
330; SSE2-NEXT:    retq
331;
332; SSSE3-SLOW-LABEL: PR37890_v16i32:
333; SSSE3-SLOW:       # %bb.0:
334; SSSE3-SLOW-NEXT:    paddd %xmm3, %xmm1
335; SSSE3-SLOW-NEXT:    paddd %xmm2, %xmm1
336; SSSE3-SLOW-NEXT:    paddd %xmm0, %xmm1
337; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
338; SSSE3-SLOW-NEXT:    paddd %xmm1, %xmm0
339; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
340; SSSE3-SLOW-NEXT:    paddd %xmm0, %xmm1
341; SSSE3-SLOW-NEXT:    movd %xmm1, %eax
342; SSSE3-SLOW-NEXT:    retq
343;
344; SSSE3-FAST-LABEL: PR37890_v16i32:
345; SSSE3-FAST:       # %bb.0:
346; SSSE3-FAST-NEXT:    paddd %xmm3, %xmm1
347; SSSE3-FAST-NEXT:    paddd %xmm2, %xmm1
348; SSSE3-FAST-NEXT:    paddd %xmm0, %xmm1
349; SSSE3-FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
350; SSSE3-FAST-NEXT:    paddd %xmm1, %xmm0
351; SSSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
352; SSSE3-FAST-NEXT:    movd %xmm0, %eax
353; SSSE3-FAST-NEXT:    retq
354;
355; AVX1-SLOW-LABEL: PR37890_v16i32:
356; AVX1-SLOW:       # %bb.0:
357; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm2
358; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm3
359; AVX1-SLOW-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
360; AVX1-SLOW-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
361; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
362; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
363; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
364; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
365; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
366; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
367; AVX1-SLOW-NEXT:    vzeroupper
368; AVX1-SLOW-NEXT:    retq
369;
370; AVX1-FAST-LABEL: PR37890_v16i32:
371; AVX1-FAST:       # %bb.0:
372; AVX1-FAST-NEXT:    vpaddd %xmm1, %xmm0, %xmm2
373; AVX1-FAST-NEXT:    vextractf128 $1, %ymm1, %xmm1
374; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm0
375; AVX1-FAST-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
376; AVX1-FAST-NEXT:    vphaddd %xmm2, %xmm0, %xmm0
377; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
378; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
379; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
380; AVX1-FAST-NEXT:    vzeroupper
381; AVX1-FAST-NEXT:    retq
382;
383; AVX2-LABEL: PR37890_v16i32:
384; AVX2:       # %bb.0:
385; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
386; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
387; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
388; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
389; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
390; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
391; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
392; AVX2-NEXT:    vmovd %xmm0, %eax
393; AVX2-NEXT:    vzeroupper
394; AVX2-NEXT:    retq
395  %hi0 = shufflevector <16 x i32> %a, <16 x i32> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
396  %lo0 = shufflevector <16 x i32> %a, <16 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
397  %sum0 = add <8 x i32> %lo0, %hi0
398  %hi1 = shufflevector <8 x i32> %sum0, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
399  %lo1 = shufflevector <8 x i32> %sum0, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
400  %sum1 = add <4 x i32> %lo1, %hi1
401  %hi2 = shufflevector <4 x i32> %sum1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
402  %lo2 = shufflevector <4 x i32> %sum1, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
403  %sum2 = add <2 x i32> %lo2, %hi2
404  %hi3 = shufflevector <2 x i32> %sum2, <2 x i32> undef, <2 x i32> <i32 1, i32 undef>
405  %sum3 = add <2 x i32> %sum2, %hi3
406  %e = extractelement <2 x i32> %sum3, i32 0
407  ret i32 %e
408}