; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3           | FileCheck %s --check-prefixes=SSSE3-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSSE3-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx             | FileCheck %s --check-prefixes=AVX-SLOW,AVX1-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops   | FileCheck %s --check-prefixes=AVX-FAST,AVX1-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2            | FileCheck %s --check-prefixes=AVX-SLOW,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops  | FileCheck %s --check-prefixes=AVX-FAST,AVX2-FAST

; Vectorized Pairwise Sum Reductions
; e.g.
; inline STYPE sum(VTYPE x) {
;   return (x[0] + x[1]) + (x[2] + x[3]);
; }
;
; VTYPE sum4(VTYPE A0, VTYPE A1, VTYPE A2, VTYPE A3) {
;   return (VTYPE) { sum( A0 ), sum( A1 ), sum( A2 ), sum( A3 ) };
; }
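;
; A minimal sketch of how each sum() call is expressed in the IR below
; (the value names %a, %lo, %hi, %pair, %swap, %sum are purely illustrative):
; an even/odd shuffle pair plus two fadds per argument.
;   %lo   = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
;   %hi   = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
;   %pair = fadd <2 x float> %lo, %hi     ; { a0+a1, a2+a3 }
;   %swap = shufflevector <2 x float> %pair, <2 x float> poison, <2 x i32> <i32 1, i32 undef>
;   %sum  = fadd <2 x float> %pair, %swap ; lane 0 holds (a0+a1)+(a2+a3)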

define <4 x float> @pair_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3) {
; SSSE3-SLOW-LABEL: pair_sum_v4f32_v4f32:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    haddps %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    haddps %xmm2, %xmm3
; SSSE3-SLOW-NEXT:    haddps %xmm3, %xmm0
; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1,3,2]
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: pair_sum_v4f32_v4f32:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    haddps %xmm1, %xmm0
; SSSE3-FAST-NEXT:    haddps %xmm3, %xmm2
; SSSE3-FAST-NEXT:    haddps %xmm2, %xmm0
; SSSE3-FAST-NEXT:    retq
;
; AVX1-SLOW-LABEL: pair_sum_v4f32_v4f32:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vhaddps %xmm2, %xmm2, %xmm1
; AVX1-SLOW-NEXT:    vshufps {{.*#+}} xmm2 = xmm0[0,2],xmm1[0,1]
; AVX1-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,1]
; AVX1-SLOW-NEXT:    vhaddps %xmm3, %xmm3, %xmm1
; AVX1-SLOW-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[1]
; AVX1-SLOW-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0]
; AVX1-SLOW-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; AVX1-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: pair_sum_v4f32_v4f32:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT:    vhaddps %xmm3, %xmm2, %xmm1
; AVX-FAST-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
;
; AVX2-SLOW-LABEL: pair_sum_v4f32_v4f32:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT:    vhaddps %xmm2, %xmm2, %xmm1
; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm2 = xmm0[0,2],xmm1[0,3]
; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,1]
; AVX2-SLOW-NEXT:    vhaddps %xmm3, %xmm3, %xmm1
; AVX2-SLOW-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[1]
; AVX2-SLOW-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0]
; AVX2-SLOW-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; AVX2-SLOW-NEXT:    retq
  %5 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %6 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %7 = fadd <2 x float> %5, %6
  %8 = shufflevector <2 x float> %7, <2 x float> poison, <2 x i32> <i32 1, i32 undef>
  %9 = fadd <2 x float> %7, %8
  %10 = shufflevector <4 x float> %1, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %11 = shufflevector <4 x float> %1, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %12 = fadd <2 x float> %10, %11
  %13 = shufflevector <2 x float> %12, <2 x float> poison, <2 x i32> <i32 1, i32 undef>
  %14 = fadd <2 x float> %12, %13
  %15 = shufflevector <2 x float> %9, <2 x float> %14, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %16 = shufflevector <4 x float> %2, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %17 = shufflevector <4 x float> %2, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %18 = fadd <2 x float> %16, %17
  %19 = shufflevector <2 x float> %18, <2 x float> poison, <2 x i32> <i32 1, i32 undef>
  %20 = fadd <2 x float> %18, %19
  %21 = shufflevector <2 x float> %20, <2 x float> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %22 = shufflevector <4 x float> %15, <4 x float> %21, <4 x i32> <i32 0, i32 1, i32 4, i32 undef>
  %23 = shufflevector <4 x float> %3, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %24 = shufflevector <4 x float> %3, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %25 = fadd <2 x float> %23, %24
  %26 = shufflevector <2 x float> %25, <2 x float> poison, <2 x i32> <i32 1, i32 undef>
  %27 = fadd <2 x float> %25, %26
  %28 = shufflevector <2 x float> %27, <2 x float> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %29 = shufflevector <4 x float> %22, <4 x float> %28, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x float> %29
}

define <4 x i32> @pair_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3) {
; SSSE3-SLOW-LABEL: pair_sum_v4i32_v4i32:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    phaddd %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,1,3]
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSSE3-SLOW-NEXT:    paddd %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    phaddd %xmm2, %xmm3
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
; SSSE3-SLOW-NEXT:    paddd %xmm3, %xmm1
; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: pair_sum_v4i32_v4i32:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    phaddd %xmm3, %xmm2
; SSSE3-FAST-NEXT:    phaddd %xmm1, %xmm0
; SSSE3-FAST-NEXT:    phaddd %xmm2, %xmm0
; SSSE3-FAST-NEXT:    retq
;
; AVX1-SLOW-LABEL: pair_sum_v4i32_v4i32:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,3,1,3]
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vphaddd %xmm2, %xmm2, %xmm1
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
; AVX1-SLOW-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-SLOW-NEXT:    vphaddd %xmm3, %xmm3, %xmm1
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[0,0,0,0]
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
; AVX1-SLOW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; AVX1-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: pair_sum_v4i32_v4i32:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vphaddd %xmm3, %xmm2, %xmm2
; AVX-FAST-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT:    vphaddd %xmm2, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
;
; AVX2-SLOW-LABEL: pair_sum_v4i32_v4i32:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT:    vphaddd %xmm2, %xmm2, %xmm1
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
; AVX2-SLOW-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,3]
; AVX2-SLOW-NEXT:    vphaddd %xmm3, %xmm3, %xmm1
; AVX2-SLOW-NEXT:    vpbroadcastd %xmm1, %xmm2
; AVX2-SLOW-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
; AVX2-SLOW-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX2-SLOW-NEXT:    retq
  %5 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
  %6 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
  %7 = add <2 x i32> %5, %6
  %8 = shufflevector <2 x i32> %7, <2 x i32> poison, <2 x i32> <i32 1, i32 undef>
  %9 = add <2 x i32> %7, %8
  %10 = shufflevector <4 x i32> %1, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
  %11 = shufflevector <4 x i32> %1, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
  %12 = add <2 x i32> %10, %11
  %13 = shufflevector <2 x i32> %12, <2 x i32> poison, <2 x i32> <i32 1, i32 undef>
  %14 = add <2 x i32> %12, %13
  %15 = shufflevector <2 x i32> %9, <2 x i32> %14, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %16 = shufflevector <4 x i32> %2, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
  %17 = shufflevector <4 x i32> %2, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
  %18 = add <2 x i32> %16, %17
  %19 = shufflevector <2 x i32> %18, <2 x i32> poison, <2 x i32> <i32 1, i32 undef>
  %20 = add <2 x i32> %18, %19
  %21 = shufflevector <2 x i32> %20, <2 x i32> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %22 = shufflevector <4 x i32> %15, <4 x i32> %21, <4 x i32> <i32 0, i32 1, i32 4, i32 undef>
  %23 = shufflevector <4 x i32> %3, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
  %24 = shufflevector <4 x i32> %3, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
  %25 = add <2 x i32> %23, %24
  %26 = shufflevector <2 x i32> %25, <2 x i32> poison, <2 x i32> <i32 1, i32 undef>
  %27 = add <2 x i32> %25, %26
  %28 = shufflevector <2 x i32> %27, <2 x i32> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %29 = shufflevector <4 x i32> %22, <4 x i32> %28, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x i32> %29
}

define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3, <4 x float> %4, <4 x float> %5, <4 x float> %6, <4 x float> %7) {
; SSSE3-SLOW-LABEL: pair_sum_v8f32_v4f32:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    haddps %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,3],xmm0[1,3]
; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSSE3-SLOW-NEXT:    addps %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    haddps %xmm3, %xmm2
; SSSE3-SLOW-NEXT:    movaps %xmm5, %xmm1
; SSSE3-SLOW-NEXT:    haddps %xmm4, %xmm1
; SSSE3-SLOW-NEXT:    haddps %xmm1, %xmm2
; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1,3,2]
; SSSE3-SLOW-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSSE3-SLOW-NEXT:    haddps %xmm7, %xmm6
; SSSE3-SLOW-NEXT:    haddps %xmm5, %xmm4
; SSSE3-SLOW-NEXT:    haddps %xmm6, %xmm4
; SSSE3-SLOW-NEXT:    movaps %xmm4, %xmm1
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: pair_sum_v8f32_v4f32:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    haddps %xmm1, %xmm0
; SSSE3-FAST-NEXT:    haddps %xmm0, %xmm0
; SSSE3-FAST-NEXT:    haddps %xmm3, %xmm2
; SSSE3-FAST-NEXT:    haddps %xmm5, %xmm4
; SSSE3-FAST-NEXT:    haddps %xmm4, %xmm2
; SSSE3-FAST-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSSE3-FAST-NEXT:    haddps %xmm7, %xmm6
; SSSE3-FAST-NEXT:    haddps %xmm6, %xmm4
; SSSE3-FAST-NEXT:    movaps %xmm4, %xmm1
; SSSE3-FAST-NEXT:    retq
;
; AVX1-SLOW-LABEL: pair_sum_v8f32_v4f32:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,3,1,3]
; AVX1-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX1-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vhaddps %xmm4, %xmm4, %xmm1
; AVX1-SLOW-NEXT:    vhaddps %xmm5, %xmm5, %xmm4
; AVX1-SLOW-NEXT:    vhaddps %xmm3, %xmm2, %xmm2
; AVX1-SLOW-NEXT:    vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,1]
; AVX1-SLOW-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
; AVX1-SLOW-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
; AVX1-SLOW-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[1]
; AVX1-SLOW-NEXT:    vaddps %xmm1, %xmm3, %xmm1
; AVX1-SLOW-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX1-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-SLOW-NEXT:    vhaddps %xmm7, %xmm6, %xmm2
; AVX1-SLOW-NEXT:    vhaddps %xmm2, %xmm2, %xmm2
; AVX1-SLOW-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2]
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: pair_sum_v8f32_v4f32:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vhaddps %xmm4, %xmm4, %xmm1
; AVX1-FAST-NEXT:    vhaddps %xmm5, %xmm5, %xmm4
; AVX1-FAST-NEXT:    vhaddps %xmm3, %xmm2, %xmm2
; AVX1-FAST-NEXT:    vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,1]
; AVX1-FAST-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
; AVX1-FAST-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
; AVX1-FAST-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[1]
; AVX1-FAST-NEXT:    vaddps %xmm1, %xmm3, %xmm1
; AVX1-FAST-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-FAST-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX1-FAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-FAST-NEXT:    vhaddps %xmm7, %xmm6, %xmm2
; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm2, %xmm2
; AVX1-FAST-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-FAST-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2]
; AVX1-FAST-NEXT:    retq
;
; AVX2-SLOW-LABEL: pair_sum_v8f32_v4f32:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,3,1,3]
; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT:    vhaddps %xmm4, %xmm4, %xmm1
; AVX2-SLOW-NEXT:    vhaddps %xmm5, %xmm5, %xmm4
; AVX2-SLOW-NEXT:    vhaddps %xmm3, %xmm2, %xmm2
; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,3]
; AVX2-SLOW-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
; AVX2-SLOW-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[1]
; AVX2-SLOW-NEXT:    vaddps %xmm1, %xmm3, %xmm1
; AVX2-SLOW-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-SLOW-NEXT:    vhaddps %xmm7, %xmm6, %xmm2
; AVX2-SLOW-NEXT:    vhaddps %xmm2, %xmm2, %xmm2
; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX2-SLOW-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2]
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: pair_sum_v8f32_v4f32:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX2-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX2-FAST-NEXT:    vhaddps %xmm4, %xmm4, %xmm1
; AVX2-FAST-NEXT:    vhaddps %xmm5, %xmm5, %xmm4
; AVX2-FAST-NEXT:    vhaddps %xmm3, %xmm2, %xmm2
; AVX2-FAST-NEXT:    vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,3]
; AVX2-FAST-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
; AVX2-FAST-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
; AVX2-FAST-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[1]
; AVX2-FAST-NEXT:    vaddps %xmm1, %xmm3, %xmm1
; AVX2-FAST-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-FAST-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX2-FAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FAST-NEXT:    vhaddps %xmm7, %xmm6, %xmm2
; AVX2-FAST-NEXT:    vhaddps %xmm0, %xmm2, %xmm2
; AVX2-FAST-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX2-FAST-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2]
; AVX2-FAST-NEXT:    retq
  %9 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %10 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %11 = fadd <2 x float> %9, %10
  %12 = shufflevector <2 x float> %11, <2 x float> poison, <2 x i32> <i32 1, i32 undef>
  %13 = fadd <2 x float> %11, %12
  %14 = shufflevector <4 x float> %1, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %15 = shufflevector <4 x float> %1, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %16 = fadd <2 x float> %14, %15
  %17 = shufflevector <2 x float> %16, <2 x float> poison, <2 x i32> <i32 1, i32 undef>
  %18 = fadd <2 x float> %16, %17
  %19 = shufflevector <2 x float> %13, <2 x float> %18, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %20 = shufflevector <4 x float> %2, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %21 = shufflevector <4 x float> %2, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %22 = fadd <2 x float> %20, %21
  %23 = shufflevector <4 x float> %3, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %24 = shufflevector <4 x float> %3, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %25 = fadd <2 x float> %23, %24
  %26 = shufflevector <4 x float> %4, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %27 = shufflevector <4 x float> %4, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %28 = fadd <2 x float> %26, %27
  %29 = shufflevector <2 x float> %28, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %30 = shufflevector <4 x float> %5, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %31 = shufflevector <4 x float> %5, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %32 = fadd <2 x float> %30, %31
  %33 = shufflevector <2 x float> %32, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %34 = shufflevector <2 x float> %22, <2 x float> %25, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %35 = shufflevector <4 x float> %34, <4 x float> %29, <4 x i32> <i32 0, i32 1, i32 4, i32 undef>
  %36 = shufflevector <4 x float> %35, <4 x float> %33, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  %37 = shufflevector <2 x float> %22, <2 x float> %25, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %38 = shufflevector <4 x float> %37, <4 x float> %29, <4 x i32> <i32 0, i32 1, i32 5, i32 undef>
  %39 = shufflevector <4 x float> %38, <4 x float> %33, <4 x i32> <i32 0, i32 1, i32 2, i32 5>
  %40 = fadd <4 x float> %36, %39
  %41 = shufflevector <4 x float> %40, <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %42 = shufflevector <8 x float> %19, <8 x float> %41, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef>
  %43 = shufflevector <4 x float> %6, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %44 = shufflevector <4 x float> %6, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %45 = fadd <2 x float> %43, %44
  %46 = shufflevector <4 x float> %7, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %47 = shufflevector <4 x float> %7, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %48 = fadd <2 x float> %46, %47
  %49 = shufflevector <2 x float> %45, <2 x float> %48, <2 x i32> <i32 0, i32 2>
  %50 = shufflevector <2 x float> %45, <2 x float> %48, <2 x i32> <i32 1, i32 3>
  %51 = fadd <2 x float> %49, %50
  %52 = shufflevector <2 x float> %51, <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %53 = shufflevector <8 x float> %42, <8 x float> %52, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
  ret <8 x float> %53
}

define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3, <4 x i32> %4, <4 x i32> %5, <4 x i32> %6, <4 x i32> %7) {
; SSSE3-SLOW-LABEL: pair_sum_v8i32_v4i32:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    phaddd %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,1,3]
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSSE3-SLOW-NEXT:    paddd %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    phaddd %xmm3, %xmm2
; SSSE3-SLOW-NEXT:    phaddd %xmm4, %xmm5
; SSSE3-SLOW-NEXT:    phaddd %xmm5, %xmm2
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,1,3,2]
; SSSE3-SLOW-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-SLOW-NEXT:    phaddd %xmm7, %xmm6
; SSSE3-SLOW-NEXT:    phaddd %xmm6, %xmm6
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm2 = xmm6[0,1,1,1]
; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,3],xmm2[0,2]
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: pair_sum_v8i32_v4i32:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    phaddd %xmm1, %xmm0
; SSSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-FAST-NEXT:    phaddd %xmm3, %xmm2
; SSSE3-FAST-NEXT:    phaddd %xmm5, %xmm4
; SSSE3-FAST-NEXT:    phaddd %xmm4, %xmm2
; SSSE3-FAST-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSSE3-FAST-NEXT:    phaddd %xmm6, %xmm6
; SSSE3-FAST-NEXT:    phaddd %xmm7, %xmm7
; SSSE3-FAST-NEXT:    phaddd %xmm7, %xmm6
; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,3],xmm6[0,2]
; SSSE3-FAST-NEXT:    movaps %xmm2, %xmm1
; SSSE3-FAST-NEXT:    retq
;
; AVX1-SLOW-LABEL: pair_sum_v8i32_v4i32:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,3,1,3]
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vphaddd %xmm4, %xmm4, %xmm1
; AVX1-SLOW-NEXT:    vphaddd %xmm5, %xmm5, %xmm4
; AVX1-SLOW-NEXT:    vphaddd %xmm3, %xmm2, %xmm2
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
; AVX1-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm5 = xmm4[0,0,0,0]
; AVX1-SLOW-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm5[6,7]
; AVX1-SLOW-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
; AVX1-SLOW-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
; AVX1-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-SLOW-NEXT:    vphaddd %xmm7, %xmm6, %xmm2
; AVX1-SLOW-NEXT:    vphaddd %xmm2, %xmm2, %xmm2
; AVX1-SLOW-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2]
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: pair_sum_v8i32_v4i32:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddd %xmm4, %xmm4, %xmm1
; AVX1-FAST-NEXT:    vphaddd %xmm5, %xmm5, %xmm4
; AVX1-FAST-NEXT:    vphaddd %xmm3, %xmm2, %xmm2
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
; AVX1-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm5 = xmm4[0,0,0,0]
; AVX1-FAST-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm5[6,7]
; AVX1-FAST-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
; AVX1-FAST-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
; AVX1-FAST-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
; AVX1-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-FAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-FAST-NEXT:    vphaddd %xmm7, %xmm6, %xmm2
; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm2, %xmm2
; AVX1-FAST-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-FAST-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2]
; AVX1-FAST-NEXT:    retq
;
; AVX2-SLOW-LABEL: pair_sum_v8i32_v4i32:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,3,1,3]
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT:    vphaddd %xmm4, %xmm4, %xmm1
; AVX2-SLOW-NEXT:    vphaddd %xmm5, %xmm5, %xmm4
; AVX2-SLOW-NEXT:    vphaddd %xmm3, %xmm2, %xmm2
; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,3]
; AVX2-SLOW-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
; AVX2-SLOW-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
; AVX2-SLOW-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
; AVX2-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT:    vphaddd %xmm7, %xmm6, %xmm1
; AVX2-SLOW-NEXT:    vphaddd %xmm1, %xmm1, %xmm1
; AVX2-SLOW-NEXT:    vpbroadcastq %xmm1, %ymm1
; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: pair_sum_v8i32_v4i32:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX2-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX2-FAST-NEXT:    vphaddd %xmm4, %xmm4, %xmm1
; AVX2-FAST-NEXT:    vphaddd %xmm5, %xmm5, %xmm4
; AVX2-FAST-NEXT:    vphaddd %xmm3, %xmm2, %xmm2
; AVX2-FAST-NEXT:    vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,3]
; AVX2-FAST-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
; AVX2-FAST-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
; AVX2-FAST-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
; AVX2-FAST-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
; AVX2-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-NEXT:    vphaddd %xmm7, %xmm6, %xmm1
; AVX2-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm2
; AVX2-FAST-NEXT:    vphaddd %xmm2, %xmm1, %xmm1
; AVX2-FAST-NEXT:    vpbroadcastq %xmm1, %ymm1
; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FAST-NEXT:    retq
  %9 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
  %10 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
  %11 = add <2 x i32> %9, %10
  %12 = shufflevector <2 x i32> %11, <2 x i32> poison, <2 x i32> <i32 1, i32 undef>
  %13 = add <2 x i32> %11, %12
  %14 = shufflevector <4 x i32> %1, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
  %15 = shufflevector <4 x i32> %1, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
  %16 = add <2 x i32> %14, %15
  %17 = shufflevector <2 x i32> %16, <2 x i32> poison, <2 x i32> <i32 1, i32 undef>
  %18 = add <2 x i32> %16, %17
  %19 = shufflevector <2 x i32> %13, <2 x i32> %18, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %20 = shufflevector <4 x i32> %2, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
  %21 = shufflevector <4 x i32> %2, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
  %22 = add <2 x i32> %20, %21
  %23 = shufflevector <4 x i32> %3, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
  %24 = shufflevector <4 x i32> %3, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
  %25 = add <2 x i32> %23, %24
  %26 = shufflevector <4 x i32> %4, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
  %27 = shufflevector <4 x i32> %4, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
  %28 = add <2 x i32> %26, %27
  %29 = shufflevector <2 x i32> %28, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %30 = shufflevector <4 x i32> %5, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
  %31 = shufflevector <4 x i32> %5, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
  %32 = add <2 x i32> %30, %31
  %33 = shufflevector <2 x i32> %32, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %34 = shufflevector <2 x i32> %22, <2 x i32> %25, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %35 = shufflevector <4 x i32> %34, <4 x i32> %29, <4 x i32> <i32 0, i32 1, i32 4, i32 undef>
  %36 = shufflevector <4 x i32> %35, <4 x i32> %33, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  %37 = shufflevector <2 x i32> %22, <2 x i32> %25, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %38 = shufflevector <4 x i32> %37, <4 x i32> %29, <4 x i32> <i32 0, i32 1, i32 5, i32 undef>
  %39 = shufflevector <4 x i32> %38, <4 x i32> %33, <4 x i32> <i32 0, i32 1, i32 2, i32 5>
  %40 = add <4 x i32> %36, %39
  %41 = shufflevector <4 x i32> %40, <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %42 = shufflevector <8 x i32> %19, <8 x i32> %41, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef>
  %43 = shufflevector <4 x i32> %6, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
  %44 = shufflevector <4 x i32> %6, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
  %45 = add <2 x i32> %43, %44
  %46 = shufflevector <4 x i32> %7, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
  %47 = shufflevector <4 x i32> %7, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
  %48 = add <2 x i32> %46, %47
  %49 = shufflevector <2 x i32> %45, <2 x i32> %48, <2 x i32> <i32 0, i32 2>
  %50 = shufflevector <2 x i32> %45, <2 x i32> %48, <2 x i32> <i32 1, i32 3>
  %51 = add <2 x i32> %49, %50
  %52 = shufflevector <2 x i32> %51, <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %53 = shufflevector <8 x i32> %42, <8 x i32> %52, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
  ret <8 x i32> %53
}

; Vectorized Sequential Sum Reductions
; e.g.
; inline STYPE sum(VTYPE x) {
;   return ((x[0] + x[1]) + x[2]) + x[3];
; }
;
; VTYPE sum4(VTYPE A0, VTYPE A1, VTYPE A2, VTYPE A3) {
;   return (VTYPE) { sum( A0 ), sum( A1 ), sum( A2 ), sum( A3 ) };
; }
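;
; A minimal sketch of the interleaved, in-order pattern the IR below uses to
; reduce the first two arguments together (value names %a, %b, %e0..%e3 and
; %s* are purely illustrative); the adds stay in source order, so no
; reassociation is implied:
;   %e0 = shufflevector <4 x float> %a, <4 x float> %b, <2 x i32> <i32 0, i32 4>
;   %e1 = shufflevector <4 x float> %a, <4 x float> %b, <2 x i32> <i32 1, i32 5>
;   %e2 = shufflevector <4 x float> %a, <4 x float> %b, <2 x i32> <i32 2, i32 6>
;   %e3 = shufflevector <4 x float> %a, <4 x float> %b, <2 x i32> <i32 3, i32 7>
;   %s01  = fadd <2 x float> %e0, %e1
;   %s012 = fadd <2 x float> %e2, %s01
;   %sum  = fadd <2 x float> %e3, %s012  ; { sum(%a), sum(%b) }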

define <4 x float> @sequential_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3) {
; SSSE3-SLOW-LABEL: sequential_sum_v4f32_v4f32:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    movaps %xmm0, %xmm4
; SSSE3-SLOW-NEXT:    haddps %xmm1, %xmm4
; SSSE3-SLOW-NEXT:    movaps %xmm0, %xmm5
; SSSE3-SLOW-NEXT:    unpckhps {{.*#+}} xmm5 = xmm5[2],xmm1[2],xmm5[3],xmm1[3]
; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3]
; SSSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3]
; SSSE3-SLOW-NEXT:    addps %xmm2, %xmm0
; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[0,1]
; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[2,3]
; SSSE3-SLOW-NEXT:    addps %xmm4, %xmm5
; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,3]
; SSSE3-SLOW-NEXT:    addps %xmm5, %xmm1
; SSSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm0 = xmm3[1,1,3,3]
; SSSE3-SLOW-NEXT:    addps %xmm3, %xmm0
; SSSE3-SLOW-NEXT:    movaps %xmm3, %xmm2
; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
; SSSE3-SLOW-NEXT:    addps %xmm0, %xmm2
; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
; SSSE3-SLOW-NEXT:    addps %xmm2, %xmm3
; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3]
; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,0]
; SSSE3-SLOW-NEXT:    movaps %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: sequential_sum_v4f32_v4f32:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    movaps %xmm0, %xmm4
; SSSE3-FAST-NEXT:    haddps %xmm1, %xmm4
; SSSE3-FAST-NEXT:    movaps %xmm0, %xmm5
; SSSE3-FAST-NEXT:    unpckhps {{.*#+}} xmm5 = xmm5[2],xmm1[2],xmm5[3],xmm1[3]
; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3]
; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[2,3]
; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,3]
; SSSE3-FAST-NEXT:    haddps %xmm2, %xmm2
; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,2],xmm2[0,1]
; SSSE3-FAST-NEXT:    addps %xmm4, %xmm5
; SSSE3-FAST-NEXT:    addps %xmm5, %xmm1
; SSSE3-FAST-NEXT:    movaps %xmm3, %xmm0
; SSSE3-FAST-NEXT:    haddps %xmm3, %xmm0
; SSSE3-FAST-NEXT:    movaps %xmm3, %xmm2
; SSSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
; SSSE3-FAST-NEXT:    addps %xmm0, %xmm2
; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
; SSSE3-FAST-NEXT:    addps %xmm2, %xmm3
; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3]
; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,0]
; SSSE3-FAST-NEXT:    movaps %xmm1, %xmm0
; SSSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: sequential_sum_v4f32_v4f32:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vhaddps %xmm1, %xmm0, %xmm4
; AVX-SLOW-NEXT:    vunpckhps {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-SLOW-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[3],xmm1[1],zero,zero
; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; AVX-SLOW-NEXT:    vaddps %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT:    vshufps {{.*#+}} xmm1 = xmm4[0,2],xmm1[0,1]
; AVX-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[3,3]
; AVX-SLOW-NEXT:    vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3]
; AVX-SLOW-NEXT:    vaddps %xmm3, %xmm4, %xmm4
; AVX-SLOW-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
; AVX-SLOW-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[2]
; AVX-SLOW-NEXT:    vaddps %xmm1, %xmm2, %xmm1
; AVX-SLOW-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3]
; AVX-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: sequential_sum_v4f32_v4f32:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddps %xmm1, %xmm0, %xmm4
; AVX-FAST-NEXT:    vunpckhps {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-FAST-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-FAST-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[3],xmm1[1],zero,zero
; AVX-FAST-NEXT:    vhaddps %xmm2, %xmm2, %xmm1
; AVX-FAST-NEXT:    vshufps {{.*#+}} xmm1 = xmm4[0,2],xmm1[0,1]
; AVX-FAST-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[3,3]
; AVX-FAST-NEXT:    vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
; AVX-FAST-NEXT:    vhaddps %xmm3, %xmm3, %xmm4
; AVX-FAST-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
; AVX-FAST-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[2]
; AVX-FAST-NEXT:    vaddps %xmm1, %xmm2, %xmm1
; AVX-FAST-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3]
; AVX-FAST-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
  %5 = shufflevector <4 x float> %0, <4 x float> %1, <2 x i32> <i32 0, i32 4>
  %6 = shufflevector <4 x float> %0, <4 x float> %1, <2 x i32> <i32 1, i32 5>
  %7 = fadd <2 x float> %5, %6
  %8 = shufflevector <4 x float> %0, <4 x float> %1, <2 x i32> <i32 2, i32 6>
  %9 = fadd <2 x float> %8, %7
  %10 = shufflevector <4 x float> %0, <4 x float> %1, <2 x i32> <i32 3, i32 7>
  %11 = fadd <2 x float> %10, %9
  %12 = shufflevector <2 x float> %11, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %13 = shufflevector <4 x float> %2, <4 x float> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %14 = fadd <4 x float> %13, %2
  %15 = shufflevector <4 x float> %2, <4 x float> poison, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
  %16 = fadd <4 x float> %15, %14
  %17 = shufflevector <4 x float> %2, <4 x float> poison, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
  %18 = fadd <4 x float> %17, %16
  %19 = shufflevector <4 x float> %12, <4 x float> %18, <4 x i32> <i32 0, i32 1, i32 4, i32 undef>
  %20 = shufflevector <4 x float> %3, <4 x float> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %21 = fadd <4 x float> %20, %3
  %22 = shufflevector <4 x float> %3, <4 x float> poison, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
  %23 = fadd <4 x float> %22, %21
  %24 = shufflevector <4 x float> %3, <4 x float> poison, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
  %25 = fadd <4 x float> %24, %23
  %26 = shufflevector <4 x float> %19, <4 x float> %25, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x float> %26
}

define <4 x i32> @sequential_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3) {
; SSSE3-SLOW-LABEL: sequential_sum_v4i32_v4i32:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    movdqa %xmm0, %xmm4
; SSSE3-SLOW-NEXT:    phaddd %xmm1, %xmm4
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSSE3-SLOW-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSSE3-SLOW-NEXT:    paddd %xmm0, %xmm4
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[0,1,0,1]
; SSSE3-SLOW-NEXT:    paddd %xmm2, %xmm5
; SSSE3-SLOW-NEXT:    paddd %xmm1, %xmm5
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[2,3,2,3]
; SSSE3-SLOW-NEXT:    paddd %xmm3, %xmm6
; SSSE3-SLOW-NEXT:    paddd %xmm1, %xmm6
; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm6 = xmm6[0,1],xmm5[2,3]
; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,0]
; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,3],xmm2[3,3]
; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,3],xmm3[2,0]
; SSSE3-SLOW-NEXT:    paddd %xmm4, %xmm0
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: sequential_sum_v4i32_v4i32:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    movdqa %xmm0, %xmm4
; SSSE3-FAST-NEXT:    phaddd %xmm1, %xmm4
; SSSE3-FAST-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSSE3-FAST-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSSE3-FAST-NEXT:    paddd %xmm0, %xmm4
; SSSE3-FAST-NEXT:    movdqa %xmm2, %xmm1
; SSSE3-FAST-NEXT:    phaddd %xmm2, %xmm1
; SSSE3-FAST-NEXT:    paddd %xmm2, %xmm1
; SSSE3-FAST-NEXT:    movdqa %xmm3, %xmm5
; SSSE3-FAST-NEXT:    phaddd %xmm3, %xmm5
; SSSE3-FAST-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[2,3,2,3]
; SSSE3-FAST-NEXT:    paddd %xmm5, %xmm6
; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,3]
; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,0]
; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,3],xmm2[3,3]
; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,3],xmm3[2,0]
; SSSE3-FAST-NEXT:    paddd %xmm4, %xmm0
; SSSE3-FAST-NEXT:    retq
;
; AVX1-SLOW-LABEL: sequential_sum_v4i32_v4i32:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vphaddd %xmm1, %xmm0, %xmm4
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; AVX1-SLOW-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX1-SLOW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
; AVX1-SLOW-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3]
; AVX1-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
; AVX1-SLOW-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX1-SLOW-NEXT:    vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm2[4,5,6,7]
; AVX1-SLOW-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm2 = xmm3[0,0,0,0]
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[2,2,2,2]
; AVX1-SLOW-NEXT:    vpaddd %xmm4, %xmm2, %xmm2
; AVX1-SLOW-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
; AVX1-SLOW-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: sequential_sum_v4i32_v4i32:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vphaddd %xmm1, %xmm0, %xmm4
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; AVX1-FAST-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX1-FAST-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; AVX1-FAST-NEXT:    vphaddd %xmm2, %xmm2, %xmm1
; AVX1-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3]
; AVX1-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
; AVX1-FAST-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX1-FAST-NEXT:    vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm2[4,5,6,7]
; AVX1-FAST-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX1-FAST-NEXT:    vphaddd %xmm3, %xmm3, %xmm1
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm2 = xmm3[2,2,2,2]
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX1-FAST-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
; AVX1-FAST-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
; AVX1-FAST-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; AVX1-FAST-NEXT:    retq
;
; AVX2-SLOW-LABEL: sequential_sum_v4i32_v4i32:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vphaddd %xmm1, %xmm0, %xmm4
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; AVX2-SLOW-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
; AVX2-SLOW-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX2-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3]
; AVX2-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
; AVX2-SLOW-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm2[2,3]
; AVX2-SLOW-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX2-SLOW-NEXT:    vpbroadcastq %xmm3, %xmm1
; AVX2-SLOW-NEXT:    vpbroadcastd %xmm3, %xmm2
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[2,2,2,2]
; AVX2-SLOW-NEXT:    vpaddd %xmm4, %xmm2, %xmm2
; AVX2-SLOW-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
; AVX2-SLOW-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: sequential_sum_v4i32_v4i32:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vphaddd %xmm1, %xmm0, %xmm4
; AVX2-FAST-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; AVX2-FAST-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX2-FAST-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX2-FAST-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-FAST-NEXT:    vphaddd %xmm2, %xmm2, %xmm1
; AVX2-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
; AVX2-FAST-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3]
; AVX2-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
; AVX2-FAST-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX2-FAST-NEXT:    vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm2[2,3]
; AVX2-FAST-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX2-FAST-NEXT:    vphaddd %xmm3, %xmm3, %xmm1
; AVX2-FAST-NEXT:    vpshufd {{.*#+}} xmm2 = xmm3[2,2,2,2]
; AVX2-FAST-NEXT:    vpbroadcastd %xmm1, %xmm1
; AVX2-FAST-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
; AVX2-FAST-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
; AVX2-FAST-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX2-FAST-NEXT:    retq
  %5 = shufflevector <4 x i32> %0, <4 x i32> %1, <2 x i32> <i32 0, i32 4>
  %6 = shufflevector <4 x i32> %0, <4 x i32> %1, <2 x i32> <i32 1, i32 5>
  %7 = add <2 x i32> %5, %6
  %8 = shufflevector <4 x i32> %0, <4 x i32> %1, <2 x i32> <i32 2, i32 6>
  %9 = add <2 x i32> %8, %7
  %10 = shufflevector <4 x i32> %0, <4 x i32> %1, <2 x i32> <i32 3, i32 7>
  %11 = add <2 x i32> %10, %9
  %12 = shufflevector <2 x i32> %11, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %13 = shufflevector <4 x i32> %2, <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %14 = add <4 x i32> %13, %2
  %15 = shufflevector <4 x i32> %2, <4 x i32> poison, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
  %16 = add <4 x i32> %15, %14
  %17 = shufflevector <4 x i32> %2, <4 x i32> poison, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
  %18 = add <4 x i32> %17, %16
  %19 = shufflevector <4 x i32> %12, <4 x i32> %18, <4 x i32> <i32 0, i32 1, i32 4, i32 undef>
  %20 = shufflevector <4 x i32> %3, <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %21 = add <4 x i32> %20, %3
  %22 = shufflevector <4 x i32> %3, <4 x i32> poison, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
  %23 = add <4 x i32> %22, %21
  %24 = shufflevector <4 x i32> %3, <4 x i32> poison, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
  %25 = add <4 x i32> %24, %23
  %26 = shufflevector <4 x i32> %19, <4 x i32> %25, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x i32> %26
}

; Vectorized Reductions
; e.g.
; VTYPE sum4(VTYPE A0, VTYPE A1, VTYPE A2, VTYPE A3) {
;   return (VTYPE) { reduce( A0 ), reduce( A1 ), reduce( A2 ), reduce( A3 ) };
; }
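;
; Here reduce() corresponds to the @llvm.vector.reduce.fadd intrinsic used by
; the functions below, with -0.0 as the start value; the _reassoc variant adds
; the 'reassoc' fast-math flag so the backend may reassociate the adds, e.g.
; (%r and %a are illustrative names):
;   %r = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %a)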
812
813define <4 x float> @reduction_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3) {
814; SSSE3-SLOW-LABEL: reduction_sum_v4f32_v4f32:
815; SSSE3-SLOW:       # %bb.0:
816; SSSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
817; SSSE3-SLOW-NEXT:    addss %xmm0, %xmm4
818; SSSE3-SLOW-NEXT:    movaps %xmm0, %xmm5
819; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
820; SSSE3-SLOW-NEXT:    addss %xmm4, %xmm5
821; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
822; SSSE3-SLOW-NEXT:    addss %xmm5, %xmm0
823; SSSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
824; SSSE3-SLOW-NEXT:    addss %xmm1, %xmm4
825; SSSE3-SLOW-NEXT:    movaps %xmm1, %xmm5
826; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
827; SSSE3-SLOW-NEXT:    addss %xmm4, %xmm5
828; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
829; SSSE3-SLOW-NEXT:    addss %xmm5, %xmm1
830; SSSE3-SLOW-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
831; SSSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
832; SSSE3-SLOW-NEXT:    addss %xmm2, %xmm1
833; SSSE3-SLOW-NEXT:    movaps %xmm2, %xmm4
834; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1]
835; SSSE3-SLOW-NEXT:    addss %xmm1, %xmm4
836; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
837; SSSE3-SLOW-NEXT:    addss %xmm4, %xmm2
838; SSSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
839; SSSE3-SLOW-NEXT:    addss %xmm3, %xmm1
840; SSSE3-SLOW-NEXT:    movaps %xmm3, %xmm4
841; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
842; SSSE3-SLOW-NEXT:    addss %xmm1, %xmm4
843; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
844; SSSE3-SLOW-NEXT:    addss %xmm4, %xmm3
845; SSSE3-SLOW-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
846; SSSE3-SLOW-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
847; SSSE3-SLOW-NEXT:    retq
848;
849; SSSE3-FAST-LABEL: reduction_sum_v4f32_v4f32:
850; SSSE3-FAST:       # %bb.0:
851; SSSE3-FAST-NEXT:    movaps %xmm0, %xmm4
852; SSSE3-FAST-NEXT:    haddps %xmm0, %xmm4
853; SSSE3-FAST-NEXT:    movaps %xmm0, %xmm5
854; SSSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
855; SSSE3-FAST-NEXT:    addss %xmm4, %xmm5
856; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
857; SSSE3-FAST-NEXT:    addss %xmm5, %xmm0
858; SSSE3-FAST-NEXT:    movaps %xmm1, %xmm4
859; SSSE3-FAST-NEXT:    haddps %xmm1, %xmm4
860; SSSE3-FAST-NEXT:    movaps %xmm1, %xmm5
861; SSSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
862; SSSE3-FAST-NEXT:    addss %xmm4, %xmm5
863; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
864; SSSE3-FAST-NEXT:    addss %xmm5, %xmm1
865; SSSE3-FAST-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
866; SSSE3-FAST-NEXT:    movaps %xmm2, %xmm1
867; SSSE3-FAST-NEXT:    haddps %xmm2, %xmm1
868; SSSE3-FAST-NEXT:    movaps %xmm2, %xmm4
869; SSSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1]
870; SSSE3-FAST-NEXT:    addss %xmm1, %xmm4
871; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
872; SSSE3-FAST-NEXT:    addss %xmm4, %xmm2
873; SSSE3-FAST-NEXT:    movaps %xmm3, %xmm1
874; SSSE3-FAST-NEXT:    haddps %xmm3, %xmm1
875; SSSE3-FAST-NEXT:    movaps %xmm3, %xmm4
876; SSSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
877; SSSE3-FAST-NEXT:    addss %xmm1, %xmm4
878; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
879; SSSE3-FAST-NEXT:    addss %xmm4, %xmm3
880; SSSE3-FAST-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
881; SSSE3-FAST-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
882; SSSE3-FAST-NEXT:    retq
883;
884; AVX-SLOW-LABEL: reduction_sum_v4f32_v4f32:
885; AVX-SLOW:       # %bb.0:
886; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
887; AVX-SLOW-NEXT:    vaddss %xmm4, %xmm0, %xmm4
888; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm5 = xmm0[1,0]
889; AVX-SLOW-NEXT:    vaddss %xmm5, %xmm4, %xmm4
890; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
891; AVX-SLOW-NEXT:    vaddss %xmm0, %xmm4, %xmm0
892; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
893; AVX-SLOW-NEXT:    vaddss %xmm4, %xmm1, %xmm4
894; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm5 = xmm1[1,0]
895; AVX-SLOW-NEXT:    vaddss %xmm5, %xmm4, %xmm4
896; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
897; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm4, %xmm1
898; AVX-SLOW-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
899; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
900; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm2, %xmm1
901; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm2[1,0]
902; AVX-SLOW-NEXT:    vaddss %xmm4, %xmm1, %xmm1
903; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3]
904; AVX-SLOW-NEXT:    vaddss %xmm2, %xmm1, %xmm1
905; AVX-SLOW-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
906; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
907; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm3, %xmm1
908; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm3[1,0]
909; AVX-SLOW-NEXT:    vaddss %xmm2, %xmm1, %xmm1
910; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm2 = xmm3[3,3,3,3]
911; AVX-SLOW-NEXT:    vaddss %xmm2, %xmm1, %xmm1
912; AVX-SLOW-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
913; AVX-SLOW-NEXT:    retq
914;
915; AVX-FAST-LABEL: reduction_sum_v4f32_v4f32:
916; AVX-FAST:       # %bb.0:
917; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm4
918; AVX-FAST-NEXT:    vpermilpd {{.*#+}} xmm5 = xmm0[1,0]
919; AVX-FAST-NEXT:    vaddss %xmm5, %xmm4, %xmm4
920; AVX-FAST-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
921; AVX-FAST-NEXT:    vaddss %xmm0, %xmm4, %xmm0
922; AVX-FAST-NEXT:    vhaddps %xmm1, %xmm1, %xmm4
923; AVX-FAST-NEXT:    vpermilpd {{.*#+}} xmm5 = xmm1[1,0]
924; AVX-FAST-NEXT:    vaddss %xmm5, %xmm4, %xmm4
925; AVX-FAST-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
926; AVX-FAST-NEXT:    vaddss %xmm1, %xmm4, %xmm1
927; AVX-FAST-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
928; AVX-FAST-NEXT:    vhaddps %xmm2, %xmm2, %xmm1
929; AVX-FAST-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm2[1,0]
930; AVX-FAST-NEXT:    vaddss %xmm4, %xmm1, %xmm1
931; AVX-FAST-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3]
932; AVX-FAST-NEXT:    vaddss %xmm2, %xmm1, %xmm1
933; AVX-FAST-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
934; AVX-FAST-NEXT:    vhaddps %xmm3, %xmm3, %xmm1
935; AVX-FAST-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm3[1,0]
936; AVX-FAST-NEXT:    vaddss %xmm2, %xmm1, %xmm1
937; AVX-FAST-NEXT:    vpermilps {{.*#+}} xmm2 = xmm3[3,3,3,3]
938; AVX-FAST-NEXT:    vaddss %xmm2, %xmm1, %xmm1
939; AVX-FAST-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
940; AVX-FAST-NEXT:    retq
941  %5 = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %0)
942  %6 = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %1)
943  %7 = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %2)
944  %8 = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %3)
945  %9 = insertelement <4 x float> undef, float %5, i32 0
946  %10 = insertelement <4 x float> %9,   float %6, i32 1
947  %11 = insertelement <4 x float> %10,  float %7, i32 2
948  %12 = insertelement <4 x float> %11,  float %8, i32 3
949  ret <4 x float> %12
950}
951declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
952
define <4 x float> @reduction_sum_v4f32_v4f32_reassoc(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3) {
; SSSE3-SLOW-LABEL: reduction_sum_v4f32_v4f32_reassoc:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    movaps %xmm0, %xmm4
; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
; SSSE3-SLOW-NEXT:    addps %xmm0, %xmm4
; SSSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm0 = xmm4[1,1,3,3]
; SSSE3-SLOW-NEXT:    movaps %xmm1, %xmm5
; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
; SSSE3-SLOW-NEXT:    addps %xmm1, %xmm5
; SSSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm5[1,1,3,3]
; SSSE3-SLOW-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-SLOW-NEXT:    movaps %xmm2, %xmm1
; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; SSSE3-SLOW-NEXT:    addps %xmm2, %xmm1
; SSSE3-SLOW-NEXT:    movaps %xmm3, %xmm2
; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
; SSSE3-SLOW-NEXT:    addps %xmm3, %xmm2
; SSSE3-SLOW-NEXT:    movaps %xmm2, %xmm3
; SSSE3-SLOW-NEXT:    movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0]
; SSSE3-SLOW-NEXT:    unpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0]
; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1]
; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
; SSSE3-SLOW-NEXT:    addps %xmm0, %xmm4
; SSSE3-SLOW-NEXT:    movaps %xmm4, %xmm0
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: reduction_sum_v4f32_v4f32_reassoc:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    movaps %xmm0, %xmm4
; SSSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
; SSSE3-FAST-NEXT:    addps %xmm0, %xmm4
; SSSE3-FAST-NEXT:    movaps %xmm1, %xmm0
; SSSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSSE3-FAST-NEXT:    addps %xmm1, %xmm0
; SSSE3-FAST-NEXT:    haddps %xmm0, %xmm4
; SSSE3-FAST-NEXT:    movaps %xmm2, %xmm0
; SSSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
; SSSE3-FAST-NEXT:    addps %xmm2, %xmm0
; SSSE3-FAST-NEXT:    movaps %xmm3, %xmm1
; SSSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
; SSSE3-FAST-NEXT:    addps %xmm3, %xmm1
; SSSE3-FAST-NEXT:    haddps %xmm0, %xmm1
; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,2],xmm1[2,0]
; SSSE3-FAST-NEXT:    movaps %xmm4, %xmm0
; SSSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: reduction_sum_v4f32_v4f32_reassoc:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm0[1,0]
; AVX-SLOW-NEXT:    vaddps %xmm4, %xmm0, %xmm0
; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm1[1,0]
; AVX-SLOW-NEXT:    vaddps %xmm4, %xmm1, %xmm1
; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm2[1,0]
; AVX-SLOW-NEXT:    vaddps %xmm4, %xmm2, %xmm2
; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
; AVX-SLOW-NEXT:    vaddps %xmm4, %xmm3, %xmm3
; AVX-SLOW-NEXT:    vshufps {{.*#+}} xmm4 = xmm3[1,1],xmm2[1,1]
; AVX-SLOW-NEXT:    vinsertps {{.*#+}} xmm5 = xmm0[1],xmm1[1],zero,zero
; AVX-SLOW-NEXT:    vshufps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,0]
; AVX-SLOW-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; AVX-SLOW-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
; AVX-SLOW-NEXT:    vaddps %xmm4, %xmm0, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: reduction_sum_v4f32_v4f32_reassoc:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm0[1,0]
; AVX-FAST-NEXT:    vaddps %xmm4, %xmm0, %xmm0
; AVX-FAST-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm1[1,0]
; AVX-FAST-NEXT:    vaddps %xmm4, %xmm1, %xmm1
; AVX-FAST-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
; AVX-FAST-NEXT:    vaddps %xmm1, %xmm2, %xmm1
; AVX-FAST-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm3[1,0]
; AVX-FAST-NEXT:    vaddps %xmm2, %xmm3, %xmm2
; AVX-FAST-NEXT:    vhaddps %xmm1, %xmm2, %xmm1
; AVX-FAST-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,0]
; AVX-FAST-NEXT:    retq
  %5 = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %0)
  %6 = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %1)
  %7 = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %2)
  %8 = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %3)
  %9 = insertelement <4 x float> undef, float %5, i32 0
  %10 = insertelement <4 x float> %9,   float %6, i32 1
  %11 = insertelement <4 x float> %10,  float %7, i32 2
  %12 = insertelement <4 x float> %11,  float %8, i32 3
  ret <4 x float> %12
}

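; Integer variant of the test above: each <4 x i32> argument is reduced with
; llvm.vector.reduce.add and the four sums are packed into one <4 x i32>.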
define <4 x i32> @reduction_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3) {
; SSSE3-SLOW-LABEL: reduction_sum_v4i32_v4i32:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
; SSSE3-SLOW-NEXT:    paddd %xmm0, %xmm4
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1]
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
; SSSE3-SLOW-NEXT:    paddd %xmm1, %xmm5
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm5[1,1,1,1]
; SSSE3-SLOW-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
; SSSE3-SLOW-NEXT:    paddd %xmm2, %xmm1
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[2,3,2,3]
; SSSE3-SLOW-NEXT:    paddd %xmm3, %xmm6
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm3 = xmm6[1,1,1,1]
; SSSE3-SLOW-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSSE3-SLOW-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSSE3-SLOW-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1]
; SSSE3-SLOW-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; SSSE3-SLOW-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm1[0]
; SSSE3-SLOW-NEXT:    paddd %xmm0, %xmm4
; SSSE3-SLOW-NEXT:    movdqa %xmm4, %xmm0
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: reduction_sum_v4i32_v4i32:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
; SSSE3-FAST-NEXT:    paddd %xmm4, %xmm0
; SSSE3-FAST-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
; SSSE3-FAST-NEXT:    paddd %xmm1, %xmm4
; SSSE3-FAST-NEXT:    phaddd %xmm4, %xmm0
; SSSE3-FAST-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
; SSSE3-FAST-NEXT:    paddd %xmm2, %xmm1
; SSSE3-FAST-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
; SSSE3-FAST-NEXT:    paddd %xmm3, %xmm2
; SSSE3-FAST-NEXT:    phaddd %xmm2, %xmm1
; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: reduction_sum_v4i32_v4i32:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
; AVX-SLOW-NEXT:    vpaddd %xmm4, %xmm0, %xmm0
; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
; AVX-SLOW-NEXT:    vpaddd %xmm5, %xmm1, %xmm1
; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
; AVX-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
; AVX-SLOW-NEXT:    vpaddd %xmm5, %xmm2, %xmm2
; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[1,1,1,1]
; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm6 = xmm3[2,3,2,3]
; AVX-SLOW-NEXT:    vpaddd %xmm6, %xmm3, %xmm3
; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm6 = xmm3[1,1,1,1]
; AVX-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
; AVX-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX-SLOW-NEXT:    vpaddd %xmm5, %xmm2, %xmm2
; AVX-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-SLOW-NEXT:    vpaddd %xmm4, %xmm0, %xmm0
; AVX-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: reduction_sum_v4i32_v4i32:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
; AVX1-FAST-NEXT:    vpaddd %xmm4, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
; AVX1-FAST-NEXT:    vpaddd %xmm4, %xmm1, %xmm1
; AVX1-FAST-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
; AVX1-FAST-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
; AVX1-FAST-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
; AVX1-FAST-NEXT:    vphaddd %xmm2, %xmm1, %xmm1
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,0,2]
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX1-FAST-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-FAST-NEXT:    retq
;
; AVX2-FAST-LABEL: reduction_sum_v4i32_v4i32:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
; AVX2-FAST-NEXT:    vpaddd %xmm4, %xmm0, %xmm0
; AVX2-FAST-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
; AVX2-FAST-NEXT:    vpaddd %xmm4, %xmm1, %xmm1
; AVX2-FAST-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX2-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
; AVX2-FAST-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
; AVX2-FAST-NEXT:    vpshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
; AVX2-FAST-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
; AVX2-FAST-NEXT:    vphaddd %xmm2, %xmm1, %xmm1
; AVX2-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,0,2]
; AVX2-FAST-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX2-FAST-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-FAST-NEXT:    retq
  %5 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %0)
  %6 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %1)
  %7 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %2)
  %8 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %3)
  %9 = insertelement <4 x i32> undef, i32 %5, i32 0
  %10 = insertelement <4 x i32> %9,   i32 %6, i32 1
  %11 = insertelement <4 x i32> %10,  i32 %7, i32 2
  %12 = insertelement <4 x i32> %11,  i32 %8, i32 3
  ret <4 x i32> %12
}
declare i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32>)
