; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_mem_shuffle
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx -disable-peephole | FileCheck %s --check-prefixes=ALL,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 -disable-peephole | FileCheck %s --check-prefixes=ALL,AVX2

; Swap the two 128-bit halves of %a (mask 4,5,6,7,0,1,2,3); %b is not referenced by the mask.
define <8 x float> @shuffle_v8f32_45670123(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v8f32_45670123:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v8f32_45670123:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
  ret <8 x float> %shuffle
}

; Same half-swap, with both operands loaded from memory; the checks expect the
; load from %pa to be folded into the permute as a memory operand.
define <8 x float> @shuffle_v8f32_45670123_mem(<8 x float>* %pa, <8 x float>* %pb) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v8f32_45670123_mem:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vperm2f128 $35, (%rdi), %ymm0, %ymm0 # ymm0 = mem[2,3,0,1]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v8f32_45670123_mem:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpermpd $78, (%rdi), %ymm0 # ymm0 = mem[2,3,0,1]
; AVX2-NEXT:    retq
entry:
  %a = load <8 x float>, <8 x float>* %pa
  %b = load <8 x float>, <8 x float>* %pb
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
  ret <8 x float> %shuffle
}

; Low half of %a followed by high half of %b (mask 0-3,12-15); no lane crosses,
; so the checks expect a blend instead of a 128-bit lane permute.
define <8 x float> @shuffle_v8f32_0123cdef(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_0123cdef:
; ALL:       # %bb.0: # %entry
; ALL-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
  ret <8 x float> %shuffle
}

; Duplicate the low 128-bit half of %a into both halves (mask 0-3,0-3).
define <8 x float> @shuffle_v8f32_01230123(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v8f32_01230123:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v8f32_01230123:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x float> %shuffle
}

; Low-half duplication with the source loaded from memory; the checks expect a
; single 128-bit broadcast load from %pa.
define <8 x float> @shuffle_v8f32_01230123_mem(<8 x float>* %pa, <8 x float>* %pb) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_01230123_mem:
; ALL:       # %bb.0: # %entry
; ALL-NEXT:    vbroadcastf128 (%rdi), %ymm0 # ymm0 = mem[0,1,0,1]
; ALL-NEXT:    retq
entry:
  %a = load <8 x float>, <8 x float>* %pa
  %b = load <8 x float>, <8 x float>* %pb
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x float> %shuffle
}

; Duplicate the high 128-bit half of %a into both halves (mask 4-7,4-7).
define <8 x float> @shuffle_v8f32_45674567(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v8f32_45674567:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v8f32_45674567:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %shuffle
}

; High-half duplication with the source loaded from memory; the checks expect a
; 128-bit broadcast load from byte offset 16 of %pa.
define <8 x float> @shuffle_v8f32_45674567_mem(<8 x float>* %pa, <8 x float>* %pb) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_45674567_mem:
; ALL:       # %bb.0: # %entry
; ALL-NEXT:    vbroadcastf128 16(%rdi), %ymm0 # ymm0 = mem[0,1,0,1]
; ALL-NEXT:    retq
entry:
  %a = load <8 x float>, <8 x float>* %pa
  %b = load <8 x float>, <8 x float>* %pb
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %shuffle
}

; Byte-vector form of the high-half duplication: bytes 16-31 of %a appear twice.
define <32 x i8> @shuffle_v32i8_2323(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v32i8_2323:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v32i8_2323:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-NEXT:    retq
entry:
  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <32 x i8> %shuffle
}

; Same high-half duplication as shuffle_v32i8_2323, but the source is first
; incremented with an integer add so the shuffle sits in the integer domain
; (the AVX2 checks expect vpermq rather than vpermpd).
define <32 x i8> @shuffle_v32i8_2323_domain(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v32i8_2323_domain:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v32i8_2323_domain:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-NEXT:    retq
entry:
  ; add forces execution domain
  %a2 = add <32 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %shuffle = shufflevector <32 x i8> %a2, <32 x i8> %b, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <32 x i8> %shuffle
}

; High half of %b followed by low half of %a (mask 6,7,0,1).
define <4 x i64> @shuffle_v4i64_6701(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v4i64_6701:
; ALL:       # %bb.0: # %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x i64> %shuffle
}

; Same 6,7,0,1 shuffle as shuffle_v4i64_6701, fed by an integer add; the AVX2
; checks expect the integer-domain vperm2i128.
define <4 x i64> @shuffle_v4i64_6701_domain(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v4i64_6701_domain:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v4i64_6701_domain:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX2-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
; AVX2-NEXT:    retq
entry:
  ; add forces execution domain
  %a2 = add <4 x i64> %a, <i64 1, i64 1, i64 1, i64 1>
  %shuffle = shufflevector <4 x i64> %a2, <4 x i64> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x i64> %shuffle
}

; High half of (%a + 1) with elements 0 and 2 undef, followed by the high half
; of %b (mask u,5,u,7,12,13,14,15).
define <8 x i32> @shuffle_v8i32_u5u7cdef(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v8i32_u5u7cdef:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpsubd %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v8i32_u5u7cdef:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX2-NEXT:    vpsubd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-NEXT:    retq
entry:
  ; add forces execution domain
  %a2 = add <8 x i32> %a, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %shuffle = shufflevector <8 x i32> %a2, <8 x i32> %b, <8 x i32> <i32 undef, i32 5, i32 undef, i32 7, i32 12, i32 13, i32 14, i32 15>
  ret <8 x i32> %shuffle
}

; Low half of %b followed by low half of (%a + 1) (mask 16-23,0-7); the checks
; expect a 128-bit insert rather than a two-source lane permute.
define <16 x i16> @shuffle_v16i16_4501(<16 x i16> %a, <16 x i16> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v16i16_4501:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpsubw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v16i16_4501:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpsubw %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
entry:
  ; add forces execution domain
  %a2 = add <16 x i16> %a, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %shuffle = shufflevector <16 x i16> %a2, <16 x i16> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <16 x i16> %shuffle
}

; Memory form of shuffle_v16i16_4501: both sources are loaded, the first is
; incremented, and the checks expect the load from %b folded into the permute.
define <16 x i16> @shuffle_v16i16_4501_mem(<16 x i16>* %a, <16 x i16>* %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v16i16_4501_mem:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vperm2f128 $2, (%rsi), %ymm0, %ymm0 # ymm0 = mem[0,1],ymm0[0,1]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v16i16_4501_mem:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vperm2i128 $2, (%rsi), %ymm0, %ymm0 # ymm0 = mem[0,1],ymm0[0,1]
; AVX2-NEXT:    retq
entry:
  %c = load <16 x i16>, <16 x i16>* %a
  %d = load <16 x i16>, <16 x i16>* %b
  %c2 = add <16 x i16> %c, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %shuffle = shufflevector <16 x i16> %c2, <16 x i16> %d, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <16 x i16> %shuffle
}

;;;; Cases with undef indices mixed in the mask

; Partially undef mask (u,u,6,7,u,9,u,11) that is compatible with
; "high half of %a, low half of %b".
define <8 x float> @shuffle_v8f32_uu67u9ub(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_uu67u9ub:
; ALL:       # %bb.0: # %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 9, i32 undef, i32 11>
  ret <8 x float> %shuffle
}

; Partially undef mask (u,u,6,7,u,u,6,7): only elements 6,7 of %a are demanded,
; in the upper 64 bits of each 128-bit half.
define <8 x float> @shuffle_v8f32_uu67uu67(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v8f32_uu67uu67:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v8f32_uu67uu67:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3]
; AVX2-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 undef, i32 6, i32 7>
  ret <8 x float> %shuffle
}

; Partially undef mask (u,u,6,7,u,u,10,11) compatible with
; "high half of %a, low half of %b".
define <8 x float> @shuffle_v8f32_uu67uuab(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_uu67uuab:
; ALL:       # %bb.0: # %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 undef, i32 10, i32 11>
  ret <8 x float> %shuffle
}

; Partially undef mask (u,u,6,7,u,u,14,15) compatible with
; "high half of %a, high half of %b".
define <8 x float> @shuffle_v8f32_uu67uuef(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_uu67uuef:
; ALL:       # %bb.0: # %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 undef, i32 14, i32 15>
  ret <8 x float> %shuffle
}

; Mask u,u,6,7,4,5,6,7: high half of %a in both halves, with the first two
; elements undef.
define <8 x float> @shuffle_v8f32_uu674567(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v8f32_uu674567:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v8f32_uu674567:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %shuffle
}

; Mask u,u,6,7,8,9,10,11: high half of %a (first two elements undef) followed
; by the low half of %b.
define <8 x float> @shuffle_v8f32_uu6789ab(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_uu6789ab:
; ALL:       # %bb.0: # %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  ret <8 x float> %shuffle
}

; Mask 4,5,6,7,u,u,6,7: high half of %a in both halves, with elements 4 and 5
; of the result undef.
define <8 x float> @shuffle_v8f32_4567uu67(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v8f32_4567uu67:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v8f32_4567uu67:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 6, i32 7>
  ret <8 x float> %shuffle
}

; Mask 4,5,6,7,u,u,14,15: high half of %a followed by the high half of %b, with
; elements 4 and 5 of the result undef.
define <8 x float> @shuffle_v8f32_4567uuef(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_4567uuef:
; ALL:       # %bb.0: # %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 14, i32 15>
  ret <8 x float> %shuffle
}

;;;; Cases where vperm2f128 alone is not sufficient (a second shuffle is needed)

; Mask u,u,6,7,u,12,u,15: the upper half takes %b's elements 12 and 15 out of
; their in-lane positions, so the checks expect a lane permute followed by an
; in-lane vpermilps fix-up.
define <8 x float> @shuffle_v8f32_uu67ucuf(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_uu67ucuf:
; ALL:       # %bb.0: # %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,3,4,4,6,7]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 12, i32 undef, i32 15>
  ret <8 x float> %shuffle
}

;; Test zero mask generation.
;; PR22984: https://llvm.org/bugs/show_bug.cgi?id=22984
;; Where a blend suffices (the zz23/zz67 cases below), prefer xor+blend over
;; vperm2f128 because that has better performance, unless building for size
;; (optsize or a cold profile), where we should still use vperm2f128.

; Result is <0, 0, a0, a1>: zeros in the low half, low half of %a moved up.
define <4 x double> @shuffle_v4f64_zz01(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_zz01:
; ALL:       # %bb.0:
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
  ret <4 x double> %s
}
; optsize variant of shuffle_v4f64_zz01; the expected code is the same.
define <4 x double> @shuffle_v4f64_zz01_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_zz01_optsize:
; ALL:       # %bb.0:
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
  ret <4 x double> %s
}

; Result is <0, 0, a2, a3>: the high half of %a stays in place, so the checks
; expect xor+blend instead of vperm2f128.
define <4 x double> @shuffle_v4f64_zz23(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_zz23:
; ALL:       # %bb.0:
; ALL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; ALL-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x double> %s
}
; optsize variant of shuffle_v4f64_zz23; the checks expect the single
; vperm2f128 instead of xor+blend.
define <4 x double> @shuffle_v4f64_zz23_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_zz23_optsize:
; ALL:       # %bb.0:
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[2,3]
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x double> %s
}
; Profile-driven variant of shuffle_v4f64_zz23: the function carries !prof !14
; (entry count 0) and the checks expect the same code as the optsize variant.
define <4 x double> @shuffle_v4f64_zz23_pgso(<4 x double> %a) !prof !14 {
; ALL-LABEL: shuffle_v4f64_zz23_pgso:
; ALL:       # %bb.0:
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[2,3]
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x double> %s
}

; Commuted-operand form of zz01 (zero vector first, %a second): <0, 0, a0, a1>.
define <4 x double> @shuffle_v4f64_zz45(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_zz45:
; ALL:       # %bb.0:
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x double> %s
}
; optsize variant of shuffle_v4f64_zz45; the expected code is the same.
define <4 x double> @shuffle_v4f64_zz45_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_zz45_optsize:
; ALL:       # %bb.0:
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x double> %s
}

; Commuted-operand form of zz23 (zero vector first, %a second): <0, 0, a2, a3>;
; the checks expect xor+blend.
define <4 x double> @shuffle_v4f64_zz67(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_zz67:
; ALL:       # %bb.0:
; ALL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; ALL-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
  ret <4 x double> %s
}
; optsize variant of shuffle_v4f64_zz67; the checks expect the single
; vperm2f128 instead of xor+blend.
define <4 x double> @shuffle_v4f64_zz67_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_zz67_optsize:
; ALL:       # %bb.0:
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[2,3]
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
  ret <4 x double> %s
}
; Profile-driven variant of shuffle_v4f64_zz67: the function carries !prof !14
; (entry count 0) and the checks expect the same code as the optsize variant.
define <4 x double> @shuffle_v4f64_zz67_pgso(<4 x double> %a) !prof !14 {
; ALL-LABEL: shuffle_v4f64_zz67_pgso:
; ALL:       # %bb.0:
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[2,3]
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
  ret <4 x double> %s
}

; Result is <a0, a1, 0, 0>: keep the low half of %a and zero the high half; the
; checks expect a 128-bit register move.
define <4 x double> @shuffle_v4f64_01zz(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_01zz:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovaps %xmm0, %xmm0
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x double> %s
}
; optsize variant of shuffle_v4f64_01zz; the expected code is the same.
define <4 x double> @shuffle_v4f64_01zz_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_01zz_optsize:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovaps %xmm0, %xmm0
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x double> %s
}

; Result is <a2, a3, 0, 0>: high half of %a moved down, high half zeroed; the
; checks expect a 128-bit extract.
define <4 x double> @shuffle_v4f64_23zz(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_23zz:
; ALL:       # %bb.0:
; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm0
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  ret <4 x double> %s
}
; optsize variant of shuffle_v4f64_23zz; the expected code is the same.
define <4 x double> @shuffle_v4f64_23zz_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_23zz_optsize:
; ALL:       # %bb.0:
; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm0
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  ret <4 x double> %s
}

; Commuted-operand form of 01zz (zero vector first, %a second): <a0, a1, 0, 0>.
define <4 x double> @shuffle_v4f64_45zz(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_45zz:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovaps %xmm0, %xmm0
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
  ret <4 x double> %s
}
; optsize variant of shuffle_v4f64_45zz; the expected code is the same.
define <4 x double> @shuffle_v4f64_45zz_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_45zz_optsize:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovaps %xmm0, %xmm0
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
  ret <4 x double> %s
}

; Commuted-operand form of 23zz (zero vector first, %a second): <a2, a3, 0, 0>.
define <4 x double> @shuffle_v4f64_67zz(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_67zz:
; ALL:       # %bb.0:
; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm0
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x double> %s
}
; optsize variant of shuffle_v4f64_67zz; the expected code is the same.
define <4 x double> @shuffle_v4f64_67zz_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_67zz_optsize:
; ALL:       # %bb.0:
; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm0
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x double> %s
}

;; With AVX2 select the integer version of the instruction. Use an add to force the domain selection.

; Integer form of 67zz: <a2, a3, 0, 0> is added to %b; the AVX2 checks expect
; the integer-domain vextracti128.
define <4 x i64> @shuffle_v4i64_67zz(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_67zz:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v4i64_67zz:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
  %s = shufflevector <4 x i64> <i64 0, i64 0, i64 undef, i64 undef>, <4 x i64> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  %c = add <4 x i64> %b, %s
  ret <4 x i64> %c
}

;;; Memory folding cases

; First operand loaded from memory: result is <high half of load, low half of
; %b> plus 1.0; the checks expect the load folded into vperm2f128.
define <4 x double> @ld0_hi0_lo1_4f64(<4 x double> * %pa, <4 x double> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: ld0_hi0_lo1_4f64:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vperm2f128 $3, (%rdi), %ymm0, %ymm0 # ymm0 = mem[2,3],ymm0[0,1]
; AVX1-NEXT:    vaddpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ld0_hi0_lo1_4f64:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vperm2f128 $3, (%rdi), %ymm0, %ymm0 # ymm0 = mem[2,3],ymm0[0,1]
; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
entry:
  %a = load <4 x double>, <4 x double> * %pa
  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  %res = fadd <4 x double> %shuffle, <double 1.0, double 1.0, double 1.0, double 1.0>
  ret <4 x double> %res
}

; Second operand loaded from memory: result is <high half of %a, high half of
; load> plus 1.0; the checks expect the load folded into vperm2f128.
define <4 x double> @ld1_hi0_hi1_4f64(<4 x double> %a, <4 x double> * %pb) nounwind uwtable readnone ssp {
; AVX1-LABEL: ld1_hi0_hi1_4f64:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vperm2f128 $49, (%rdi), %ymm0, %ymm0 # ymm0 = ymm0[2,3],mem[2,3]
; AVX1-NEXT:    vaddpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ld1_hi0_hi1_4f64:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vperm2f128 $49, (%rdi), %ymm0, %ymm0 # ymm0 = ymm0[2,3],mem[2,3]
; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
entry:
  %b = load <4 x double>, <4 x double> * %pb
  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %res = fadd <4 x double> %shuffle, <double 1.0, double 1.0, double 1.0, double 1.0>
  ret <4 x double> %res
}

; <8 x float> form of ld0_hi0_lo1_4f64: <high half of load, low half of %b>
; plus 1.0.
define <8 x float> @ld0_hi0_lo1_8f32(<8 x float> * %pa, <8 x float> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: ld0_hi0_lo1_8f32:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vperm2f128 $3, (%rdi), %ymm0, %ymm0 # ymm0 = mem[2,3],ymm0[0,1]
; AVX1-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ld0_hi0_lo1_8f32:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vperm2f128 $3, (%rdi), %ymm0, %ymm0 # ymm0 = mem[2,3],ymm0[0,1]
; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX2-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
entry:
  %a = load <8 x float>, <8 x float> * %pa
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  %res = fadd <8 x float> %shuffle, <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>
  ret <8 x float> %res
}

; <8 x float> form of ld1_hi0_hi1_4f64: <high half of %a, high half of load>
; plus 1.0.
define <8 x float> @ld1_hi0_hi1_8f32(<8 x float> %a, <8 x float> * %pb) nounwind uwtable readnone ssp {
; AVX1-LABEL: ld1_hi0_hi1_8f32:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vperm2f128 $49, (%rdi), %ymm0, %ymm0 # ymm0 = ymm0[2,3],mem[2,3]
; AVX1-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ld1_hi0_hi1_8f32:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vperm2f128 $49, (%rdi), %ymm0, %ymm0 # ymm0 = ymm0[2,3],mem[2,3]
; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX2-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
entry:
  %b = load <8 x float>, <8 x float> * %pb
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  %res = fadd <8 x float> %shuffle, <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>
  ret <8 x float> %res
}

; Integer form of ld0_hi0_lo1: <high half of load, low half of %b> plus
; <1,2,3,4>. The AVX1 checks split the add into two 128-bit halves; the AVX2
; checks fold the load into vperm2i128.
define <4 x i64> @ld0_hi0_lo1_4i64(<4 x i64> * %pa, <4 x i64> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: ld0_hi0_lo1_4i64:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ld0_hi0_lo1_4i64:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vperm2i128 $3, (%rdi), %ymm0, %ymm0 # ymm0 = mem[2,3],ymm0[0,1]
; AVX2-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    retq
entry:
  %a = load <4 x i64>, <4 x i64> * %pa
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  %res = add <4 x i64> %shuffle, <i64 1, i64 2, i64 3, i64 4>
  ret <4 x i64> %res
}

; Integer form of ld1_hi0_hi1: <high half of %a, high half of load> plus
; <1,2,3,4>. The AVX1 checks split the add into two 128-bit halves; the AVX2
; checks fold the load into vperm2i128.
define <4 x i64> @ld1_hi0_hi1_4i64(<4 x i64> %a, <4 x i64> * %pb) nounwind uwtable readnone ssp {
; AVX1-LABEL: ld1_hi0_hi1_4i64:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ld1_hi0_hi1_4i64:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vperm2i128 $49, (%rdi), %ymm0, %ymm0 # ymm0 = ymm0[2,3],mem[2,3]
; AVX2-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    retq
entry:
  %b = load <4 x i64>, <4 x i64> * %pb
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %res = add <4 x i64> %shuffle, <i64 1, i64 2, i64 3, i64 4>
  ret <4 x i64> %res
}

; <8 x i32> form of ld0_hi0_lo1_4i64: <high half of load, low half of %b> plus
; <1,2,3,4,1,2,3,4>.
define <8 x i32> @ld0_hi0_lo1_8i32(<8 x i32> * %pa, <8 x i32> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: ld0_hi0_lo1_8i32:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,2,3,4]
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpaddd 16(%rdi), %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ld0_hi0_lo1_8i32:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vperm2i128 $3, (%rdi), %ymm0, %ymm0 # ymm0 = mem[2,3],ymm0[0,1]
; AVX2-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    retq
entry:
  %a = load <8 x i32>, <8 x i32> * %pa
  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  %res = add <8 x i32> %shuffle, <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
  ret <8 x i32> %res
}

; <8 x i32> form of ld1_hi0_hi1_4i64: <high half of %a, high half of load> plus
; <1,2,3,4,1,2,3,4>.
define <8 x i32> @ld1_hi0_hi1_8i32(<8 x i32> %a, <8 x i32> * %pb) nounwind uwtable readnone ssp {
; AVX1-LABEL: ld1_hi0_hi1_8i32:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,2,3,4]
; AVX1-NEXT:    vpaddd 16(%rdi), %xmm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ld1_hi0_hi1_8i32:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vperm2i128 $49, (%rdi), %ymm0, %ymm0 # ymm0 = ymm0[2,3],mem[2,3]
; AVX2-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    retq
entry:
  %b = load <8 x i32>, <8 x i32> * %pb
  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  %res = add <8 x i32> %shuffle, <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
  ret <8 x i32> %res
}

; Regression test named after PR50053. Loads one 256-bit vector and two
; adjacent 128-bit vectors from %1, then stores two shuffles to %0:
;   %10 = <low half of %3, %6>   and   %12 = <%8, high half of %3>.
define void @PR50053(<4 x i64>* nocapture %0, <4 x i64>* nocapture readonly %1) {
; ALL-LABEL: PR50053:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovaps (%rsi), %ymm0
; ALL-NEXT:    vmovaps 32(%rsi), %xmm1
; ALL-NEXT:    vmovaps 48(%rsi), %xmm2
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[0,1],ymm1[0,1]
; ALL-NEXT:    vmovaps %ymm1, (%rdi)
; ALL-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; ALL-NEXT:    vmovaps %ymm0, 32(%rdi)
; ALL-NEXT:    vzeroupper
; ALL-NEXT:    retq
  %3 = load <4 x i64>, <4 x i64>* %1, align 32
  %4 = getelementptr inbounds <4 x i64>, <4 x i64>* %1, i64 1
  %5 = bitcast <4 x i64>* %4 to <2 x i64>*
  %6 = load <2 x i64>, <2 x i64>* %5, align 16
  %7 = getelementptr inbounds <2 x i64>, <2 x i64>* %5, i64 1
  %8 = load <2 x i64>, <2 x i64>* %7, align 16
  %9 = shufflevector <2 x i64> %6, <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %10 = shufflevector <4 x i64> %3, <4 x i64> %9, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  store <4 x i64> %10, <4 x i64>* %0, align 32
  %11 = shufflevector <2 x i64> %8, <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %12 = shufflevector <4 x i64> %11, <4 x i64> %3, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
  %13 = getelementptr inbounds <4 x i64>, <4 x i64>* %0, i64 1
  store <4 x i64> %12, <4 x i64>* %13, align 32
  ret void
}

; Module-level profile summary (!0-!13) plus the function entry count !14 that
; the *_pgso functions attach via !prof. !14 records an entry count of 0,
; presumably so those functions are treated as cold and optimized for size.
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"ProfileSummary", !1}
!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
!2 = !{!"ProfileFormat", !"InstrProf"}
!3 = !{!"TotalCount", i64 10000}
!4 = !{!"MaxCount", i64 10}
!5 = !{!"MaxInternalCount", i64 1}
!6 = !{!"MaxFunctionCount", i64 1000}
!7 = !{!"NumCounts", i64 3}
!8 = !{!"NumFunctions", i64 3}
!9 = !{!"DetailedSummary", !10}
!10 = !{!11, !12, !13}
!11 = !{i32 10000, i64 100, i32 1}
!12 = !{i32 999000, i64 100, i32 1}
!13 = !{i32 999999, i64 1, i32 2}
!14 = !{!"function_entry_count", i64 0}
