; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3           | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx             | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops   | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2            | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops  | FileCheck %s --check-prefixes=AVX,AVX2

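; Pairwise add where the shuffle masks gather the odd/even elements from high index to low, so the pair sums are produced in reverse order.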
define <8 x i16> @hadd_reverse_v8i16(<8 x i16> %a0, <8 x i16> %a1) nounwind {
; SSE-LABEL: hadd_reverse_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    phaddw %xmm1, %xmm0
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE-NEXT:    retq
;
; AVX-LABEL: hadd_reverse_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; AVX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; AVX-NEXT:    retq
  %lhs = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 7, i32 5, i32 3, i32 1, i32 15, i32 13, i32 11, i32 9>
  %rhs = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 6, i32 4, i32 2, i32 0, i32 14, i32 12, i32 10, i32 8>
  %add = add <8 x i16> %lhs, %rhs
  ret <8 x i16> %add
}

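; Both inputs are reversed up front, then pairwise-added with the usual even/odd shuffle masks.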
define <8 x i16> @hadd_reverse2_v8i16(<8 x i16> %a0, <8 x i16> %a1) nounwind {
; SSE-LABEL: hadd_reverse2_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1]
; SSE-NEXT:    pshufb %xmm2, %xmm0
; SSE-NEXT:    pshufb %xmm2, %xmm1
; SSE-NEXT:    phaddw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hadd_reverse2_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1]
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vphaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %shuf0 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %shuf1 = shufflevector <8 x i16> %a1, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %lhs = shufflevector <8 x i16> %shuf0, <8 x i16> %shuf1, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %rhs = shufflevector <8 x i16> %shuf0, <8 x i16> %shuf1, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %add = add <8 x i16> %lhs, %rhs
  ret <8 x i16> %add
}

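; v8f32 version of the reversed pairwise add across two 128-bit lanes.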
define <8 x float> @hadd_reverse_v8f32(<8 x float> %a0, <8 x float> %a1) {
; SSE-LABEL: hadd_reverse_v8f32:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm0, %xmm4
; SSE-NEXT:    haddps %xmm3, %xmm1
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,0,3,2]
; SSE-NEXT:    haddps %xmm2, %xmm4
; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,0,3,2]
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    movaps %xmm4, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: hadd_reverse_v8f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hadd_reverse_v8f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT:    retq
  %lhs = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 7, i32 5, i32 15, i32 13, i32 3, i32 1, i32 11, i32 9>
  %rhs = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 6, i32 4, i32 14, i32 12, i32 2, i32 0, i32 10, i32 8>
  %add = fadd <8 x float> %lhs, %rhs
  ret <8 x float> %add
}

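; Reverse both v8f32 inputs, then horizontally add with per-128-bit-lane even/odd masks.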
define <8 x float> @hadd_reverse2_v8f32(<8 x float> %a0, <8 x float> %a1) {
; SSE-LABEL: hadd_reverse2_v8f32:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm0, %xmm4
; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,2],xmm0[1,0]
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,2,1,0]
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,2,1,0]
; SSE-NEXT:    haddps %xmm2, %xmm4
; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,2,1,0]
; SSE-NEXT:    haddps %xmm3, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    movaps %xmm4, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: hadd_reverse2_v8f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1]
; AVX1-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
; AVX1-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hadd_reverse2_v8f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,3,0,1]
; AVX2-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %shuf0 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %shuf1 = shufflevector <8 x float> %a1, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %lhs = shufflevector <8 x float> %shuf0, <8 x float> %shuf1, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
  %rhs = shufflevector <8 x float> %shuf0, <8 x float> %shuf1, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
  %add = fadd <8 x float> %lhs, %rhs
  ret <8 x float> %add
}

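; Plain horizontal add of a0 and a1 followed by a full reversal of the result vector.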
define <8 x float> @hadd_reverse3_v8f32(<8 x float> %a0, <8 x float> %a1) {
; SSE-LABEL: hadd_reverse3_v8f32:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm1, %xmm3
; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,0,3,2]
; SSE-NEXT:    haddps %xmm0, %xmm2
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,0,3,2]
; SSE-NEXT:    movaps %xmm3, %xmm0
; SSE-NEXT:    movaps %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: hadd_reverse3_v8f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vhaddps %ymm0, %ymm1, %ymm0
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hadd_reverse3_v8f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vhaddps %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT:    retq
  %shuf0 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
  %shuf1 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
  %add = fadd <8 x float> %shuf0, %shuf1
  %shuf2 = shufflevector <8 x float> %add, <8 x float> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  ret <8 x float> %shuf2
}

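; v16i16 version of the reversed pairwise add.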
define <16 x i16> @hadd_reverse_v16i16(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; SSE-LABEL: hadd_reverse_v16i16:
; SSE:       # %bb.0:
; SSE-NEXT:    phaddw %xmm3, %xmm1
; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,7,6,5,4]
; SSE-NEXT:    phaddw %xmm2, %xmm0
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,7,6,5,4]
; SSE-NEXT:    movdqa %xmm3, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: hadd_reverse_v16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vphaddw %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
; AVX1-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
; AVX1-NEXT:    vphaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; AVX1-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hadd_reverse_v16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vphaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15]
; AVX2-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12]
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT:    retq
  %lhs = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> <i32 15, i32 13, i32 11, i32 9, i32 31, i32 29, i32 27, i32 25, i32 7, i32 5, i32 3, i32 1, i32 23, i32 21, i32 19, i32 17>
  %rhs = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> <i32 14, i32 12, i32 10, i32 8, i32 30, i32 28, i32 26, i32 24, i32 6, i32 4, i32 2, i32 0, i32 22, i32 20, i32 18, i32 16>
  %add = add <16 x i16> %lhs, %rhs
  ret <16 x i16> %add
}

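; Reverse both v16i16 inputs before the horizontal add.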
define <16 x i16> @hadd_reverse2_v16i16(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; SSE-LABEL: hadd_reverse2_v16i16:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm4
; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1]
; SSE-NEXT:    pshufb %xmm0, %xmm4
; SSE-NEXT:    pshufb %xmm0, %xmm1
; SSE-NEXT:    pshufb %xmm0, %xmm2
; SSE-NEXT:    phaddw %xmm2, %xmm4
; SSE-NEXT:    pshufb %xmm0, %xmm3
; SSE-NEXT:    phaddw %xmm3, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    movdqa %xmm4, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: hadd_reverse2_v16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1]
; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm4
; AVX1-NEXT:    vphaddw %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vphaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hadd_reverse2_v16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1,30,31,28,29,26,27,24,25,22,23,20,21,18,19,16,17]
; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
; AVX2-NEXT:    vphaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %shuf0 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %shuf1 = shufflevector <16 x i16> %a1, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %lhs = shufflevector <16 x i16> %shuf0, <16 x i16> %shuf1, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
  %rhs = shufflevector <16 x i16> %shuf0, <16 x i16> %shuf1, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
  %add = add <16 x i16> %lhs, %rhs
  ret <16 x i16> %add
}

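; v8f64 version: each add operand interleaves elements of a0 and a1 from high index to low.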
define <8 x double> @hadd_reverse_v8f64(<8 x double> %a0, <8 x double> %a1) nounwind {
; SSE-LABEL: hadd_reverse_v8f64:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd %xmm1, %xmm8
; SSE-NEXT:    movapd %xmm0, %xmm9
; SSE-NEXT:    haddpd %xmm7, %xmm3
; SSE-NEXT:    haddpd %xmm6, %xmm2
; SSE-NEXT:    haddpd %xmm5, %xmm8
; SSE-NEXT:    haddpd %xmm4, %xmm9
; SSE-NEXT:    movapd %xmm3, %xmm0
; SSE-NEXT:    movapd %xmm2, %xmm1
; SSE-NEXT:    movapd %xmm8, %xmm2
; SSE-NEXT:    movapd %xmm9, %xmm3
; SSE-NEXT:    retq
;
; AVX1-LABEL: hadd_reverse_v8f64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vhaddpd %ymm3, %ymm1, %ymm1
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm1[2,3,0,1]
; AVX1-NEXT:    vhaddpd %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX1-NEXT:    vmovapd %ymm3, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hadd_reverse_v8f64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vhaddpd %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm1[2,3,0,1]
; AVX2-NEXT:    vhaddpd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT:    vmovapd %ymm3, %ymm0
; AVX2-NEXT:    retq
  %lhs = shufflevector <8 x double> %a0, <8 x double> %a1, <8 x i32> <i32 7, i32 15, i32 5, i32 13, i32 3, i32 11, i32 1, i32 9>
  %rhs = shufflevector <8 x double> %a0, <8 x double> %a1, <8 x i32> <i32 6, i32 14, i32 4, i32 12, i32 2, i32 10, i32 0, i32 8>
  %fadd = fadd <8 x double> %lhs, %rhs
  ret <8 x double> %fadd
}

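; Reverse both v8f64 inputs before the horizontal add.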
define <8 x double> @hadd_reverse2_v8f64(<8 x double> %a0, <8 x double> %a1) nounwind {
; SSE-LABEL: hadd_reverse2_v8f64:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd %xmm1, %xmm8
; SSE-NEXT:    movapd %xmm0, %xmm9
; SSE-NEXT:    shufpd {{.*#+}} xmm9 = xmm9[1],xmm0[0]
; SSE-NEXT:    shufpd {{.*#+}} xmm8 = xmm8[1],xmm1[0]
; SSE-NEXT:    shufpd {{.*#+}} xmm2 = xmm2[1,0]
; SSE-NEXT:    shufpd {{.*#+}} xmm3 = xmm3[1,0]
; SSE-NEXT:    shufpd {{.*#+}} xmm4 = xmm4[1,0]
; SSE-NEXT:    haddpd %xmm4, %xmm9
; SSE-NEXT:    shufpd {{.*#+}} xmm5 = xmm5[1,0]
; SSE-NEXT:    haddpd %xmm5, %xmm8
; SSE-NEXT:    shufpd {{.*#+}} xmm6 = xmm6[1,0]
; SSE-NEXT:    haddpd %xmm6, %xmm2
; SSE-NEXT:    shufpd {{.*#+}} xmm7 = xmm7[1,0]
; SSE-NEXT:    haddpd %xmm7, %xmm3
; SSE-NEXT:    movapd %xmm3, %xmm0
; SSE-NEXT:    movapd %xmm2, %xmm1
; SSE-NEXT:    movapd %xmm8, %xmm2
; SSE-NEXT:    movapd %xmm9, %xmm3
; SSE-NEXT:    retq
;
; AVX1-LABEL: hadd_reverse2_v8f64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1]
; AVX1-NEXT:    vpermilpd {{.*#+}} ymm4 = ymm1[1,0,3,2]
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1]
; AVX1-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm1[1,0,3,2]
; AVX1-NEXT:    vhaddpd %ymm1, %ymm0, %ymm1
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm3[2,3,0,1]
; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
; AVX1-NEXT:    vhaddpd %ymm0, %ymm4, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hadd_reverse2_v8f64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,2,1,0]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm4 = ymm1[3,2,1,0]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm2[3,2,1,0]
; AVX2-NEXT:    vhaddpd %ymm1, %ymm0, %ymm1
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm3[3,2,1,0]
; AVX2-NEXT:    vhaddpd %ymm0, %ymm4, %ymm0
; AVX2-NEXT:    retq
  %shuf0 = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %shuf1 = shufflevector <8 x double> %a1, <8 x double> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %lhs = shufflevector <8 x double> %shuf0, <8 x double> %shuf1, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
  %rhs = shufflevector <8 x double> %shuf0, <8 x double> %shuf1, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  %fadd = fadd <8 x double> %lhs, %rhs
  ret <8 x double> %fadd
}

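; v16f32 version of the reversed pairwise add.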
define <16 x float> @hadd_reverse_v16f32(<16 x float> %a0, <16 x float> %a1) nounwind {
; SSE-LABEL: hadd_reverse_v16f32:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm5, %xmm8
; SSE-NEXT:    movaps %xmm1, %xmm5
; SSE-NEXT:    haddps %xmm2, %xmm3
; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,0,3,2]
; SSE-NEXT:    haddps %xmm6, %xmm7
; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[1,0,3,2]
; SSE-NEXT:    haddps %xmm0, %xmm5
; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[1,0,3,2]
; SSE-NEXT:    haddps %xmm4, %xmm8
; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[1,0,3,2]
; SSE-NEXT:    movaps %xmm3, %xmm0
; SSE-NEXT:    movaps %xmm7, %xmm1
; SSE-NEXT:    movaps %xmm5, %xmm2
; SSE-NEXT:    movaps %xmm8, %xmm3
; SSE-NEXT:    retq
;
; AVX1-LABEL: hadd_reverse_v16f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm0[2,3],ymm2[2,3]
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    vhaddps %ymm0, %ymm4, %ymm2
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm3[2,3]
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
; AVX1-NEXT:    vpermilps {{.*#+}} ymm1 = ymm2[1,0,3,2,5,4,7,6]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hadd_reverse_v16f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vhaddps %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[1,0,3,2,5,4,7,6]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm1[2,0,3,1]
; AVX2-NEXT:    vhaddps %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[2,0,3,1]
; AVX2-NEXT:    vmovaps %ymm3, %ymm0
; AVX2-NEXT:    retq
  %lhs = shufflevector <16 x float> %a0, <16 x float> %a1, <16 x i32> <i32 15, i32 13, i32 11, i32 9, i32 31, i32 29, i32 27, i32 25, i32 7, i32 5, i32 3, i32 1, i32 23, i32 21, i32 19, i32 17>
  %rhs = shufflevector <16 x float> %a0, <16 x float> %a1, <16 x i32> <i32 14, i32 12, i32 10, i32 8, i32 30, i32 28, i32 26, i32 24, i32 6, i32 4, i32 2, i32 0, i32 22, i32 20, i32 18, i32 16>
  %fadd = fadd <16 x float> %lhs, %rhs
  ret <16 x float> %fadd
}

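; Reverse both v16f32 inputs before the horizontal add.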
define <16 x float> @hadd_reverse2_v16f32(<16 x float> %a0, <16 x float> %a1) nounwind {
; SSE-LABEL: hadd_reverse2_v16f32:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm1, %xmm8
; SSE-NEXT:    movaps %xmm0, %xmm9
; SSE-NEXT:    shufps {{.*#+}} xmm9 = xmm9[3,2],xmm0[1,0]
; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[3,2],xmm1[1,0]
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,2,1,0]
; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,2,1,0]
; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,2,1,0]
; SSE-NEXT:    haddps %xmm4, %xmm9
; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[3,2,1,0]
; SSE-NEXT:    haddps %xmm5, %xmm8
; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[3,2,1,0]
; SSE-NEXT:    haddps %xmm6, %xmm2
; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[3,2,1,0]
; SSE-NEXT:    haddps %xmm7, %xmm3
; SSE-NEXT:    movaps %xmm3, %xmm0
; SSE-NEXT:    movaps %xmm2, %xmm1
; SSE-NEXT:    movaps %xmm8, %xmm2
; SSE-NEXT:    movaps %xmm9, %xmm3
; SSE-NEXT:    retq
;
; AVX1-LABEL: hadd_reverse2_v16f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1]
; AVX1-NEXT:    vpermilps {{.*#+}} ymm4 = ymm1[3,2,1,0,7,6,5,4]
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1]
; AVX1-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
; AVX1-NEXT:    vhaddps %ymm1, %ymm0, %ymm1
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm3[2,3,0,1]
; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX1-NEXT:    vhaddps %ymm0, %ymm4, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hadd_reverse2_v16f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm4 = ymm1[2,3,0,1]
; AVX2-NEXT:    vpermilps {{.*#+}} ymm1 = ymm2[3,2,1,0,7,6,5,4]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,3,0,1]
; AVX2-NEXT:    vhaddps %ymm1, %ymm0, %ymm1
; AVX2-NEXT:    vpermilps {{.*#+}} ymm0 = ymm3[3,2,1,0,7,6,5,4]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT:    vhaddps %ymm0, %ymm4, %ymm0
; AVX2-NEXT:    retq
  %shuf0 = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %shuf1 = shufflevector <16 x float> %a1, <16 x float> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %lhs = shufflevector <16 x float> %shuf0, <16 x float> %shuf1, <16 x i32> <i32 0, i32 2, i32 16, i32 18, i32 4, i32 6, i32 20, i32 22, i32 8, i32 10, i32 24, i32 26, i32 12, i32 14, i32 28, i32 30>
  %rhs = shufflevector <16 x float> %shuf0, <16 x float> %shuf1, <16 x i32> <i32 1, i32 3, i32 17, i32 19, i32 5, i32 7, i32 21, i32 23, i32 9, i32 11, i32 25, i32 27, i32 13, i32 15, i32 29, i32 31>
  %fadd = fadd <16 x float> %lhs, %rhs
  ret <16 x float> %fadd
}