1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_mem_shuffle
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx -disable-peephole | FileCheck %s --check-prefixes=ALL,AVX1
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 -disable-peephole | FileCheck %s --check-prefixes=ALL,AVX2
; Swap the two 128-bit halves of %a (mask 4,5,6,7,0,1,2,3); %b is not referenced by the mask.
; The AVX1 run expects a single vperm2f128, the AVX2 run a single vpermpd.
5define <8 x float> @shuffle_v8f32_45670123(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
6; AVX1-LABEL: shuffle_v8f32_45670123:
7; AVX1:       # %bb.0: # %entry
8; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
9; AVX1-NEXT:    retq
11; AVX2-LABEL: shuffle_v8f32_45670123:
12; AVX2:       # %bb.0: # %entry
13; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
14; AVX2-NEXT:    retq
16  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
17  ret <8 x float> %shuffle
; Half-swap (mask 4,5,6,7,0,1,2,3) of a vector loaded from %pa; the value loaded from %pb
; is not referenced by the mask. Both runs expect the load to be folded into the permute
; as a memory operand (vperm2f128 for AVX1, vpermpd for AVX2).
20define <8 x float> @shuffle_v8f32_45670123_mem(<8 x float>* %pa, <8 x float>* %pb) nounwind uwtable readnone ssp {
21; AVX1-LABEL: shuffle_v8f32_45670123_mem:
22; AVX1:       # %bb.0: # %entry
23; AVX1-NEXT:    vperm2f128 $35, (%rdi), %ymm0, %ymm0 # ymm0 = mem[2,3,0,1]
24; AVX1-NEXT:    retq
26; AVX2-LABEL: shuffle_v8f32_45670123_mem:
27; AVX2:       # %bb.0: # %entry
28; AVX2-NEXT:    vpermpd $78, (%rdi), %ymm0 # ymm0 = mem[2,3,0,1]
29; AVX2-NEXT:    retq
31  %a = load <8 x float>, <8 x float>* %pa
32  %b = load <8 x float>, <8 x float>* %pb
33  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
34  ret <8 x float> %shuffle
; Low 128 bits from %a, high 128 bits from %b (mask 0,1,2,3,12,13,14,15).
; Both runs expect a single vblendps rather than a cross-lane permute.
37define <8 x float> @shuffle_v8f32_0123cdef(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
38; ALL-LABEL: shuffle_v8f32_0123cdef:
39; ALL:       # %bb.0: # %entry
40; ALL-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
41; ALL-NEXT:    retq
43  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
44  ret <8 x float> %shuffle
; Duplicate the low 128-bit half of %a into both halves (mask 0,1,2,3,0,1,2,3); %b is unused
; by the mask. The AVX1 run expects vinsertf128, the AVX2 run expects vpermpd.
47define <8 x float> @shuffle_v8f32_01230123(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
48; AVX1-LABEL: shuffle_v8f32_01230123:
49; AVX1:       # %bb.0: # %entry
50; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
51; AVX1-NEXT:    retq
53; AVX2-LABEL: shuffle_v8f32_01230123:
54; AVX2:       # %bb.0: # %entry
55; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
56; AVX2-NEXT:    retq
58  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
59  ret <8 x float> %shuffle
; Duplicate the low 128-bit half of the vector loaded from %pa (mask 0,1,2,3,0,1,2,3); the
; value loaded from %pb is unused by the mask. Both runs expect a vbroadcastf128 from (%rdi).
62define <8 x float> @shuffle_v8f32_01230123_mem(<8 x float>* %pa, <8 x float>* %pb) nounwind uwtable readnone ssp {
63; ALL-LABEL: shuffle_v8f32_01230123_mem:
64; ALL:       # %bb.0: # %entry
65; ALL-NEXT:    vbroadcastf128 (%rdi), %ymm0 # ymm0 = mem[0,1,0,1]
66; ALL-NEXT:    retq
68  %a = load <8 x float>, <8 x float>* %pa
69  %b = load <8 x float>, <8 x float>* %pb
70  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
71  ret <8 x float> %shuffle
; Duplicate the high 128-bit half of %a into both halves (mask 4,5,6,7,4,5,6,7); %b is unused
; by the mask. The AVX1 run expects vperm2f128, the AVX2 run expects vpermpd.
74define <8 x float> @shuffle_v8f32_45674567(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
75; AVX1-LABEL: shuffle_v8f32_45674567:
76; AVX1:       # %bb.0: # %entry
77; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
78; AVX1-NEXT:    retq
80; AVX2-LABEL: shuffle_v8f32_45674567:
81; AVX2:       # %bb.0: # %entry
82; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3]
83; AVX2-NEXT:    retq
85  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
86  ret <8 x float> %shuffle
; Duplicate the high 128-bit half of the vector loaded from %pa (mask 4,5,6,7,4,5,6,7); the
; value loaded from %pb is unused by the mask. Both runs expect a vbroadcastf128 that reads
; at offset 16 from %rdi.
89define <8 x float> @shuffle_v8f32_45674567_mem(<8 x float>* %pa, <8 x float>* %pb) nounwind uwtable readnone ssp {
90; ALL-LABEL: shuffle_v8f32_45674567_mem:
91; ALL:       # %bb.0: # %entry
92; ALL-NEXT:    vbroadcastf128 16(%rdi), %ymm0 # ymm0 = mem[0,1,0,1]
93; ALL-NEXT:    retq
95  %a = load <8 x float>, <8 x float>* %pa
96  %b = load <8 x float>, <8 x float>* %pb
97  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
98  ret <8 x float> %shuffle
; Byte-vector variant: duplicate the high 16 bytes of %a into both halves (mask 16..31 twice);
; %b is unused by the mask. With no integer arithmetic in the function, the checks expect the
; floating-point forms: vperm2f128 for the AVX1 run and vpermpd for the AVX2 run.
101define <32 x i8> @shuffle_v32i8_2323(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp {
102; AVX1-LABEL: shuffle_v32i8_2323:
103; AVX1:       # %bb.0: # %entry
104; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
105; AVX1-NEXT:    retq
107; AVX2-LABEL: shuffle_v32i8_2323:
108; AVX2:       # %bb.0: # %entry
109; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3]
110; AVX2-NEXT:    retq
112  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
113  ret <32 x i8> %shuffle
; Same high-half duplication as the plain 2323 mask, but applied to %a + 1 so that an integer
; instruction feeds the shuffle. The AVX1 run expects the add to be done on the extracted
; upper 128 bits and re-inserted with vinsertf128; the AVX2 run expects a 256-bit vpsubb of
; all-ones followed by the integer permute vpermq.
116define <32 x i8> @shuffle_v32i8_2323_domain(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp {
117; AVX1-LABEL: shuffle_v32i8_2323_domain:
118; AVX1:       # %bb.0: # %entry
119; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
120; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
121; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
122; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
123; AVX1-NEXT:    retq
125; AVX2-LABEL: shuffle_v32i8_2323_domain:
126; AVX2:       # %bb.0: # %entry
127; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
128; AVX2-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
129; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
130; AVX2-NEXT:    retq
132  ; add forces execution domain
133  %a2 = add <32 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
134  %shuffle = shufflevector <32 x i8> %a2, <32 x i8> %b, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
135  ret <32 x i8> %shuffle
; High 128-bit half of %b followed by the low 128-bit half of %a (mask 6,7,0,1).
; Both runs expect a single two-source vperm2f128.
138define <4 x i64> @shuffle_v4i64_6701(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
139; ALL-LABEL: shuffle_v4i64_6701:
140; ALL:       # %bb.0: # %entry
141; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
142; ALL-NEXT:    retq
144  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
145  ret <4 x i64> %shuffle
; Mask 6,7,0,1 applied to (%a + 1, %b): high half of %b followed by the low half of the sum.
; The integer add precedes the shuffle; the AVX1 run still expects vperm2f128 (with the add
; done on the low 128 bits only), while the AVX2 run expects the integer form vperm2i128.
148define <4 x i64> @shuffle_v4i64_6701_domain(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
149; AVX1-LABEL: shuffle_v4i64_6701_domain:
150; AVX1:       # %bb.0: # %entry
151; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
152; AVX1-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
153; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
154; AVX1-NEXT:    retq
156; AVX2-LABEL: shuffle_v4i64_6701_domain:
157; AVX2:       # %bb.0: # %entry
158; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
159; AVX2-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
160; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
161; AVX2-NEXT:    retq
163  ; add forces execution domain
164  %a2 = add <4 x i64> %a, <i64 1, i64 1, i64 1, i64 1>
165  %shuffle = shufflevector <4 x i64> %a2, <4 x i64> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
166  ret <4 x i64> %shuffle
; Mask u,5,u,7,12,13,14,15 applied to (%a + 1, %b): the high half of the sum (with two lanes
; undef) followed by the high half of %b. The AVX1 run expects the add on the extracted upper
; 128 bits followed by vperm2f128; the AVX2 run expects a 256-bit add followed by vperm2i128.
169define <8 x i32> @shuffle_v8i32_u5u7cdef(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp {
170; AVX1-LABEL: shuffle_v8i32_u5u7cdef:
171; AVX1:       # %bb.0: # %entry
172; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
173; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
174; AVX1-NEXT:    vpsubd %xmm2, %xmm0, %xmm0
175; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
176; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
177; AVX1-NEXT:    retq
179; AVX2-LABEL: shuffle_v8i32_u5u7cdef:
180; AVX2:       # %bb.0: # %entry
181; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
182; AVX2-NEXT:    vpsubd %ymm2, %ymm0, %ymm0
183; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
184; AVX2-NEXT:    retq
186  ; add forces execution domain
187  %a2 = add <8 x i32> %a, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
188  %shuffle = shufflevector <8 x i32> %a2, <8 x i32> %b, <8 x i32> <i32 undef, i32 5, i32 undef, i32 7, i32 12, i32 13, i32 14, i32 15>
189  ret <8 x i32> %shuffle
; Mask 16..23,0..7 applied to (%a + 1, %b): low 128-bit half of %b followed by the low half
; of the sum. Both runs expect a 128-bit add and then an insert of that result into the upper
; half of ymm1 (vinsertf128 for AVX1, vinserti128 for AVX2).
192define <16 x i16> @shuffle_v16i16_4501(<16 x i16> %a, <16 x i16> %b) nounwind uwtable readnone ssp {
193; AVX1-LABEL: shuffle_v16i16_4501:
194; AVX1:       # %bb.0: # %entry
195; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
196; AVX1-NEXT:    vpsubw %xmm2, %xmm0, %xmm0
197; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
198; AVX1-NEXT:    retq
200; AVX2-LABEL: shuffle_v16i16_4501:
201; AVX2:       # %bb.0: # %entry
202; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
203; AVX2-NEXT:    vpsubw %xmm2, %xmm0, %xmm0
204; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
205; AVX2-NEXT:    retq
207  ; add forces execution domain
208  %a2 = add <16 x i16> %a, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
209  %shuffle = shufflevector <16 x i16> %a2, <16 x i16> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
210  ret <16 x i16> %shuffle
; Memory variant of the 4501 mask: %c is loaded from %a and incremented, %d is loaded from %b,
; and the result is the low half of %d followed by the low half of the incremented vector.
; Both runs expect the load from (%rsi) to be folded into the two-source permute
; (vperm2f128 for AVX1, vperm2i128 for AVX2).
213define <16 x i16> @shuffle_v16i16_4501_mem(<16 x i16>* %a, <16 x i16>* %b) nounwind uwtable readnone ssp {
214; AVX1-LABEL: shuffle_v16i16_4501_mem:
215; AVX1:       # %bb.0: # %entry
216; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
217; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
218; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
219; AVX1-NEXT:    vperm2f128 $2, (%rsi), %ymm0, %ymm0 # ymm0 = mem[0,1],ymm0[0,1]
220; AVX1-NEXT:    retq
222; AVX2-LABEL: shuffle_v16i16_4501_mem:
223; AVX2:       # %bb.0: # %entry
224; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
225; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
226; AVX2-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
227; AVX2-NEXT:    vperm2i128 $2, (%rsi), %ymm0, %ymm0 # ymm0 = mem[0,1],ymm0[0,1]
228; AVX2-NEXT:    retq
230  %c = load <16 x i16>, <16 x i16>* %a
231  %d = load <16 x i16>, <16 x i16>* %b
232  %c2 = add <16 x i16> %c, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
233  %shuffle = shufflevector <16 x i16> %c2, <16 x i16> %d, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
234  ret <16 x i16> %shuffle
237;;;; Cases with undef indices mixed in the mask
; Mask u,u,6,7,u,9,u,11: the defined lanes are consistent with "high half of %a, low half
; of %b", so both runs expect a single vperm2f128.
239define <8 x float> @shuffle_v8f32_uu67u9ub(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
240; ALL-LABEL: shuffle_v8f32_uu67u9ub:
241; ALL:       # %bb.0: # %entry
242; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
243; ALL-NEXT:    retq
245  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 9, i32 undef, i32 11>
246  ret <8 x float> %shuffle
; Mask u,u,6,7,u,u,6,7: only elements 6 and 7 of %a are required, in the upper 64 bits of
; each 128-bit half. The AVX1 run expects vperm2f128 duplicating the high half; the AVX2 run
; expects vpermpd broadcasting 64-bit element 3.
249define <8 x float> @shuffle_v8f32_uu67uu67(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
250; AVX1-LABEL: shuffle_v8f32_uu67uu67:
251; AVX1:       # %bb.0: # %entry
252; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
253; AVX1-NEXT:    retq
255; AVX2-LABEL: shuffle_v8f32_uu67uu67:
256; AVX2:       # %bb.0: # %entry
257; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3]
258; AVX2-NEXT:    retq
260  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 undef, i32 6, i32 7>
261  ret <8 x float> %shuffle
; Mask u,u,6,7,u,u,10,11: the defined lanes are consistent with "high half of %a, low half
; of %b", so both runs expect a single vperm2f128.
264define <8 x float> @shuffle_v8f32_uu67uuab(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
265; ALL-LABEL: shuffle_v8f32_uu67uuab:
266; ALL:       # %bb.0: # %entry
267; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
268; ALL-NEXT:    retq
270  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 undef, i32 10, i32 11>
271  ret <8 x float> %shuffle
; Mask u,u,6,7,u,u,14,15: the defined lanes are consistent with "high half of %a, high half
; of %b", so both runs expect a single vperm2f128.
274define <8 x float> @shuffle_v8f32_uu67uuef(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
275; ALL-LABEL: shuffle_v8f32_uu67uuef:
276; ALL:       # %bb.0: # %entry
277; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
278; ALL-NEXT:    retq
280  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 undef, i32 14, i32 15>
281  ret <8 x float> %shuffle
; Mask u,u,6,7,4,5,6,7: high half of %a in both halves, with the first two lanes undef.
; The AVX1 run expects vperm2f128 duplicating the high half; the AVX2 run expects vpermpd
; with 64-bit indices 0,3,2,3 (index 0 fills the undef position).
284define <8 x float> @shuffle_v8f32_uu674567(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
285; AVX1-LABEL: shuffle_v8f32_uu674567:
286; AVX1:       # %bb.0: # %entry
287; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
288; AVX1-NEXT:    retq
290; AVX2-LABEL: shuffle_v8f32_uu674567:
291; AVX2:       # %bb.0: # %entry
292; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
293; AVX2-NEXT:    retq
295  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
296  ret <8 x float> %shuffle
; Mask u,u,6,7,8,9,10,11: high half of %a (first two lanes undef) followed by the low half
; of %b. Both runs expect a single vperm2f128.
299define <8 x float> @shuffle_v8f32_uu6789ab(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
300; ALL-LABEL: shuffle_v8f32_uu6789ab:
301; ALL:       # %bb.0: # %entry
302; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
303; ALL-NEXT:    retq
305  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
306  ret <8 x float> %shuffle
; Mask 4,5,6,7,u,u,6,7: high half of %a in both halves, with lanes 4 and 5 of the result
; undef. The AVX1 run expects vperm2f128, the AVX2 run expects vpermpd, both duplicating the
; high half.
309define <8 x float> @shuffle_v8f32_4567uu67(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
310; AVX1-LABEL: shuffle_v8f32_4567uu67:
311; AVX1:       # %bb.0: # %entry
312; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
313; AVX1-NEXT:    retq
315; AVX2-LABEL: shuffle_v8f32_4567uu67:
316; AVX2:       # %bb.0: # %entry
317; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3]
318; AVX2-NEXT:    retq
320  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 6, i32 7>
321  ret <8 x float> %shuffle
; Mask 4,5,6,7,u,u,14,15: high half of %a followed by the high half of %b (two lanes undef).
; Both runs expect a single vperm2f128.
324define <8 x float> @shuffle_v8f32_4567uuef(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
325; ALL-LABEL: shuffle_v8f32_4567uuef:
326; ALL:       # %bb.0: # %entry
327; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
328; ALL-NEXT:    retq
330  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 14, i32 15>
331  ret <8 x float> %shuffle
334;;;; Cases where vperm2f128 alone must not be selected
; Mask u,u,6,7,u,12,u,15: result lane 5 takes source element 12, which sits at lane 0 of its
; 128-bit half, so the mask is not a pure whole-half selection. Both runs expect vperm2f128
; (high half of %a, high half of %b) followed by an in-lane vpermilps fix-up.
336define <8 x float> @shuffle_v8f32_uu67ucuf(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
337; ALL-LABEL: shuffle_v8f32_uu67ucuf:
338; ALL:       # %bb.0: # %entry
339; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
340; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,3,4,4,6,7]
341; ALL-NEXT:    retq
343  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 12, i32 undef, i32 15>
344  ret <8 x float> %shuffle
347;; Test zero mask generation.
348;; PR22984: https://llvm.org/bugs/show_bug.cgi?id=22984
349;; Prefer xor+vblendpd over vperm2f128 because that has better performance,
350;; unless building for optsize where we should still use vperm2f128.
; Zero in the low half, low half of %a in the high half (mask 4,5,0,1 with a zero second
; operand). Both runs expect a single vperm2f128 using its zeroing capability.
352define <4 x double> @shuffle_v4f64_zz01(<4 x double> %a) {
353; ALL-LABEL: shuffle_v4f64_zz01:
354; ALL:       # %bb.0:
355; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
356; ALL-NEXT:    retq
357  %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
358  ret <4 x double> %s
; optsize variant of the zz01 shuffle (mask 4,5,0,1 with a zero second operand). The expected
; output is the same single zeroing vperm2f128 as the non-optsize function.
360define <4 x double> @shuffle_v4f64_zz01_optsize(<4 x double> %a) optsize {
361; ALL-LABEL: shuffle_v4f64_zz01_optsize:
362; ALL:       # %bb.0:
363; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
364; ALL-NEXT:    retq
365  %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
366  ret <4 x double> %s
; Zero in the low half, high half of %a kept in place (mask 4,5,2,3 with a zero second
; operand). Without optsize, both runs expect a register zeroing (vxorps) plus vblendps
; instead of vperm2f128.
369define <4 x double> @shuffle_v4f64_zz23(<4 x double> %a) {
370; ALL-LABEL: shuffle_v4f64_zz23:
371; ALL:       # %bb.0:
372; ALL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
373; ALL-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
374; ALL-NEXT:    retq
375  %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
376  ret <4 x double> %s
; optsize variant of the zz23 shuffle (mask 4,5,2,3 with a zero second operand). With optsize
; both runs expect the single-instruction zeroing vperm2f128 form.
378define <4 x double> @shuffle_v4f64_zz23_optsize(<4 x double> %a) optsize {
379; ALL-LABEL: shuffle_v4f64_zz23_optsize:
380; ALL:       # %bb.0:
381; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[2,3]
382; ALL-NEXT:    retq
383  %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
384  ret <4 x double> %s
; Profile-annotated variant of the zz23 shuffle: the function carries !prof !14, which is a
; function_entry_count of 0 in this module's metadata. The expected output matches the
; optsize variant (single zeroing vperm2f128) -- presumably via profile-guided size
; optimization; confirm against the X86 lowering code.
386define <4 x double> @shuffle_v4f64_zz23_pgso(<4 x double> %a) !prof !14 {
387; ALL-LABEL: shuffle_v4f64_zz23_pgso:
388; ALL:       # %bb.0:
389; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[2,3]
390; ALL-NEXT:    retq
391  %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
392  ret <4 x double> %s
; Same result shape as zz01 but with the operands commuted: the zero vector is the first
; shuffle operand and %a the second (mask 0,1,4,5). Both runs expect a single zeroing
; vperm2f128.
395define <4 x double> @shuffle_v4f64_zz45(<4 x double> %a) {
396; ALL-LABEL: shuffle_v4f64_zz45:
397; ALL:       # %bb.0:
398; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
399; ALL-NEXT:    retq
400  %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
401  ret <4 x double> %s
; optsize variant of the zz45 shuffle (zero first operand, mask 0,1,4,5). The expected output
; is the same single zeroing vperm2f128 as the non-optsize function.
403define <4 x double> @shuffle_v4f64_zz45_optsize(<4 x double> %a) optsize {
404; ALL-LABEL: shuffle_v4f64_zz45_optsize:
405; ALL:       # %bb.0:
406; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
407; ALL-NEXT:    retq
408  %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
409  ret <4 x double> %s
; Commuted-operand form of zz23: zero first operand, %a second, mask 0,1,6,7 (zero low half,
; high half of %a kept in place). Without optsize, both runs expect vxorps plus vblendps.
412define <4 x double> @shuffle_v4f64_zz67(<4 x double> %a) {
413; ALL-LABEL: shuffle_v4f64_zz67:
414; ALL:       # %bb.0:
415; ALL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
416; ALL-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
417; ALL-NEXT:    retq
418  %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
419  ret <4 x double> %s
; optsize variant of the zz67 shuffle (zero first operand, mask 0,1,6,7). With optsize both
; runs expect the single-instruction zeroing vperm2f128 form.
421define <4 x double> @shuffle_v4f64_zz67_optsize(<4 x double> %a) optsize {
422; ALL-LABEL: shuffle_v4f64_zz67_optsize:
423; ALL:       # %bb.0:
424; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[2,3]
425; ALL-NEXT:    retq
426  %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
427  ret <4 x double> %s
; Profile-annotated variant of the zz67 shuffle: the function carries !prof !14, which is a
; function_entry_count of 0 in this module's metadata. The expected output matches the
; optsize variant (single zeroing vperm2f128) -- presumably via profile-guided size
; optimization; confirm against the X86 lowering code.
429define <4 x double> @shuffle_v4f64_zz67_pgso(<4 x double> %a) !prof !14 {
430; ALL-LABEL: shuffle_v4f64_zz67_pgso:
431; ALL:       # %bb.0:
432; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[2,3]
433; ALL-NEXT:    retq
434  %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
435  ret <4 x double> %s
; Low half of %a kept, high half zeroed (mask 0,1,4,5 with a zero second operand).
; Both runs expect a 128-bit vmovaps of xmm0 onto itself rather than any permute.
438define <4 x double> @shuffle_v4f64_01zz(<4 x double> %a) {
439; ALL-LABEL: shuffle_v4f64_01zz:
440; ALL:       # %bb.0:
441; ALL-NEXT:    vmovaps %xmm0, %xmm0
442; ALL-NEXT:    retq
443  %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
444  ret <4 x double> %s
; optsize variant of the 01zz shuffle (mask 0,1,4,5 with a zero second operand); the expected
; output is the same 128-bit vmovaps as the non-optsize function.
446define <4 x double> @shuffle_v4f64_01zz_optsize(<4 x double> %a) optsize {
447; ALL-LABEL: shuffle_v4f64_01zz_optsize:
448; ALL:       # %bb.0:
449; ALL-NEXT:    vmovaps %xmm0, %xmm0
450; ALL-NEXT:    retq
451  %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
452  ret <4 x double> %s
; High half of %a moved to the low half, high half zeroed (mask 2,3,4,5 with a zero second
; operand). Both runs expect a single vextractf128 of the upper 128 bits into xmm0.
455define <4 x double> @shuffle_v4f64_23zz(<4 x double> %a) {
456; ALL-LABEL: shuffle_v4f64_23zz:
457; ALL:       # %bb.0:
458; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm0
459; ALL-NEXT:    retq
460  %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
461  ret <4 x double> %s
; optsize variant of the 23zz shuffle (mask 2,3,4,5 with a zero second operand); the expected
; output is the same single vextractf128 as the non-optsize function.
463define <4 x double> @shuffle_v4f64_23zz_optsize(<4 x double> %a) optsize {
464; ALL-LABEL: shuffle_v4f64_23zz_optsize:
465; ALL:       # %bb.0:
466; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm0
467; ALL-NEXT:    retq
468  %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
469  ret <4 x double> %s
; Commuted-operand form of 01zz: zero first operand, %a second, mask 4,5,0,1 (low half of %a
; kept, high half zeroed). Both runs expect a 128-bit vmovaps of xmm0 onto itself.
472define <4 x double> @shuffle_v4f64_45zz(<4 x double> %a) {
473; ALL-LABEL: shuffle_v4f64_45zz:
474; ALL:       # %bb.0:
475; ALL-NEXT:    vmovaps %xmm0, %xmm0
476; ALL-NEXT:    retq
477  %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
478  ret <4 x double> %s
; optsize variant of the 45zz shuffle (zero first operand, mask 4,5,0,1); the expected output
; is the same 128-bit vmovaps as the non-optsize function.
480define <4 x double> @shuffle_v4f64_45zz_optsize(<4 x double> %a) optsize {
481; ALL-LABEL: shuffle_v4f64_45zz_optsize:
482; ALL:       # %bb.0:
483; ALL-NEXT:    vmovaps %xmm0, %xmm0
484; ALL-NEXT:    retq
485  %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
486  ret <4 x double> %s
; Commuted-operand form of 23zz: zero first operand, %a second, mask 6,7,0,1 (high half of %a
; moved to the low half, high half zeroed). Both runs expect a single vextractf128.
489define <4 x double> @shuffle_v4f64_67zz(<4 x double> %a) {
490; ALL-LABEL: shuffle_v4f64_67zz:
491; ALL:       # %bb.0:
492; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm0
493; ALL-NEXT:    retq
494  %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
495  ret <4 x double> %s
; optsize variant of the 67zz shuffle (zero first operand, mask 6,7,0,1); the expected output
; is the same single vextractf128 as the non-optsize function.
497define <4 x double> @shuffle_v4f64_67zz_optsize(<4 x double> %a) optsize {
498; ALL-LABEL: shuffle_v4f64_67zz_optsize:
499; ALL:       # %bb.0:
500; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm0
501; ALL-NEXT:    retq
502  %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
503  ret <4 x double> %s
506;; With AVX2 select the integer version of the instruction. Use an add to force the domain selection.
; Integer form of the 67zz shuffle (zero first operand, mask 6,7,0,1), with the result added
; to %b so an integer instruction consumes the shuffle. The AVX1 run expects vextractf128,
; a 128-bit vpaddq and a vblendps that restores the upper half of %b; the AVX2 run expects
; the integer extract vextracti128 followed by a 256-bit vpaddq.
508define <4 x i64> @shuffle_v4i64_67zz(<4 x i64> %a, <4 x i64> %b) {
509; AVX1-LABEL: shuffle_v4i64_67zz:
510; AVX1:       # %bb.0:
511; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
512; AVX1-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
513; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
514; AVX1-NEXT:    retq
516; AVX2-LABEL: shuffle_v4i64_67zz:
517; AVX2:       # %bb.0:
518; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
519; AVX2-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
520; AVX2-NEXT:    retq
521  %s = shufflevector <4 x i64> <i64 0, i64 0, i64 undef, i64 undef>, <4 x i64> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
522  %c = add <4 x i64> %b, %s
523  ret <4 x i64> %c
526;;; Memory folding cases
; First operand loaded from %pa: result is the high half of the loaded vector followed by the
; low half of %b (mask 2,3,4,5), then 1.0 is added to every lane. Both runs expect the load to
; be folded into vperm2f128 as a memory operand; they differ only in how the 1.0 constant is
; materialized (constant-pool operand for AVX1, vbroadcastsd for AVX2).
528define <4 x double> @ld0_hi0_lo1_4f64(<4 x double> * %pa, <4 x double> %b) nounwind uwtable readnone ssp {
529; AVX1-LABEL: ld0_hi0_lo1_4f64:
530; AVX1:       # %bb.0: # %entry
531; AVX1-NEXT:    vperm2f128 $3, (%rdi), %ymm0, %ymm0 # ymm0 = mem[2,3],ymm0[0,1]
532; AVX1-NEXT:    vaddpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
533; AVX1-NEXT:    retq
535; AVX2-LABEL: ld0_hi0_lo1_4f64:
536; AVX2:       # %bb.0: # %entry
537; AVX2-NEXT:    vperm2f128 $3, (%rdi), %ymm0, %ymm0 # ymm0 = mem[2,3],ymm0[0,1]
538; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
539; AVX2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
540; AVX2-NEXT:    retq
542  %a = load <4 x double>, <4 x double> * %pa
543  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
544  %res = fadd <4 x double> %shuffle, <double 1.0, double 1.0, double 1.0, double 1.0>
545  ret <4 x double> %res
; Second operand loaded from %pb: result is the high half of %a followed by the high half of
; the loaded vector (mask 2,3,6,7), then 1.0 is added to every lane. Both runs expect the load
; to be folded into vperm2f128 as a memory operand.
548define <4 x double> @ld1_hi0_hi1_4f64(<4 x double> %a, <4 x double> * %pb) nounwind uwtable readnone ssp {
549; AVX1-LABEL: ld1_hi0_hi1_4f64:
550; AVX1:       # %bb.0: # %entry
551; AVX1-NEXT:    vperm2f128 $49, (%rdi), %ymm0, %ymm0 # ymm0 = ymm0[2,3],mem[2,3]
552; AVX1-NEXT:    vaddpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
553; AVX1-NEXT:    retq
555; AVX2-LABEL: ld1_hi0_hi1_4f64:
556; AVX2:       # %bb.0: # %entry
557; AVX2-NEXT:    vperm2f128 $49, (%rdi), %ymm0, %ymm0 # ymm0 = ymm0[2,3],mem[2,3]
558; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
559; AVX2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
560; AVX2-NEXT:    retq
562  %b = load <4 x double>, <4 x double> * %pb
563  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
564  %res = fadd <4 x double> %shuffle, <double 1.0, double 1.0, double 1.0, double 1.0>
565  ret <4 x double> %res
; Single-precision version of the "load first operand" case: high half of the vector loaded
; from %pa followed by the low half of %b (mask 4..7,8..11), then 1.0 added to every lane.
; Both runs expect the load to be folded into vperm2f128 as a memory operand.
568define <8 x float> @ld0_hi0_lo1_8f32(<8 x float> * %pa, <8 x float> %b) nounwind uwtable readnone ssp {
569; AVX1-LABEL: ld0_hi0_lo1_8f32:
570; AVX1:       # %bb.0: # %entry
571; AVX1-NEXT:    vperm2f128 $3, (%rdi), %ymm0, %ymm0 # ymm0 = mem[2,3],ymm0[0,1]
572; AVX1-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
573; AVX1-NEXT:    retq
575; AVX2-LABEL: ld0_hi0_lo1_8f32:
576; AVX2:       # %bb.0: # %entry
577; AVX2-NEXT:    vperm2f128 $3, (%rdi), %ymm0, %ymm0 # ymm0 = mem[2,3],ymm0[0,1]
578; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
579; AVX2-NEXT:    vaddps %ymm1, %ymm0, %ymm0
580; AVX2-NEXT:    retq
582  %a = load <8 x float>, <8 x float> * %pa
583  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
584  %res = fadd <8 x float> %shuffle, <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>
585  ret <8 x float> %res
; Single-precision version of the "load second operand" case: high half of %a followed by the
; high half of the vector loaded from %pb (mask 4..7,12..15), then 1.0 added to every lane.
; Both runs expect the load to be folded into vperm2f128 as a memory operand.
588define <8 x float> @ld1_hi0_hi1_8f32(<8 x float> %a, <8 x float> * %pb) nounwind uwtable readnone ssp {
589; AVX1-LABEL: ld1_hi0_hi1_8f32:
590; AVX1:       # %bb.0: # %entry
591; AVX1-NEXT:    vperm2f128 $49, (%rdi), %ymm0, %ymm0 # ymm0 = ymm0[2,3],mem[2,3]
592; AVX1-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
593; AVX1-NEXT:    retq
595; AVX2-LABEL: ld1_hi0_hi1_8f32:
596; AVX2:       # %bb.0: # %entry
597; AVX2-NEXT:    vperm2f128 $49, (%rdi), %ymm0, %ymm0 # ymm0 = ymm0[2,3],mem[2,3]
598; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
599; AVX2-NEXT:    vaddps %ymm1, %ymm0, %ymm0
600; AVX2-NEXT:    retq
602  %b = load <8 x float>, <8 x float> * %pb
603  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
604  %res = fadd <8 x float> %shuffle, <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>
605  ret <8 x float> %res
; Integer (i64) version of the "load first operand" case: high half of the vector loaded from
; %pa followed by the low half of %b (mask 2,3,4,5), then the constant <1,2,3,4> is added.
; The AVX1 run expects no 256-bit permute: each 128-bit half is added separately and the
; halves are combined with vinsertf128. The AVX2 run expects the load folded into vperm2i128
; followed by a 256-bit vpaddq.
608define <4 x i64> @ld0_hi0_lo1_4i64(<4 x i64> * %pa, <4 x i64> %b) nounwind uwtable readnone ssp {
609; AVX1-LABEL: ld0_hi0_lo1_4i64:
610; AVX1:       # %bb.0: # %entry
611; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
612; AVX1-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
613; AVX1-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
614; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
615; AVX1-NEXT:    retq
617; AVX2-LABEL: ld0_hi0_lo1_4i64:
618; AVX2:       # %bb.0: # %entry
619; AVX2-NEXT:    vperm2i128 $3, (%rdi), %ymm0, %ymm0 # ymm0 = mem[2,3],ymm0[0,1]
620; AVX2-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
621; AVX2-NEXT:    retq
623  %a = load <4 x i64>, <4 x i64> * %pa
624  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
625  %res = add <4 x i64> %shuffle, <i64 1, i64 2, i64 3, i64 4>
626  ret <4 x i64> %res
; Integer (i64) version of the "load second operand" case: high half of %a followed by the
; high half of the vector loaded from %pb (mask 2,3,6,7), then the constant <1,2,3,4> is
; added. The AVX1 run expects per-half 128-bit adds combined with vinsertf128; the AVX2 run
; expects the load folded into vperm2i128 followed by a 256-bit vpaddq.
629define <4 x i64> @ld1_hi0_hi1_4i64(<4 x i64> %a, <4 x i64> * %pb) nounwind uwtable readnone ssp {
630; AVX1-LABEL: ld1_hi0_hi1_4i64:
631; AVX1:       # %bb.0: # %entry
632; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
633; AVX1-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
634; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
635; AVX1-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
636; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
637; AVX1-NEXT:    retq
639; AVX2-LABEL: ld1_hi0_hi1_4i64:
640; AVX2:       # %bb.0: # %entry
641; AVX2-NEXT:    vperm2i128 $49, (%rdi), %ymm0, %ymm0 # ymm0 = ymm0[2,3],mem[2,3]
642; AVX2-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
643; AVX2-NEXT:    retq
645  %b = load <4 x i64>, <4 x i64> * %pb
646  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
647  %res = add <4 x i64> %shuffle, <i64 1, i64 2, i64 3, i64 4>
648  ret <4 x i64> %res
; Integer (i32) version of the "load first operand" case: high half of the vector loaded from
; %pa followed by the low half of %b (mask 4..7,8..11), then <1,2,3,4,1,2,3,4> is added.
; Because the added constant repeats per 128-bit half, the AVX1 run expects one xmm constant
; reused for both 128-bit adds before vinsertf128. The AVX2 run expects the load folded into
; vperm2i128 followed by a 256-bit vpaddd.
651define <8 x i32> @ld0_hi0_lo1_8i32(<8 x i32> * %pa, <8 x i32> %b) nounwind uwtable readnone ssp {
652; AVX1-LABEL: ld0_hi0_lo1_8i32:
653; AVX1:       # %bb.0: # %entry
654; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,2,3,4]
655; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
656; AVX1-NEXT:    vpaddd 16(%rdi), %xmm1, %xmm1
657; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
658; AVX1-NEXT:    retq
660; AVX2-LABEL: ld0_hi0_lo1_8i32:
661; AVX2:       # %bb.0: # %entry
662; AVX2-NEXT:    vperm2i128 $3, (%rdi), %ymm0, %ymm0 # ymm0 = mem[2,3],ymm0[0,1]
663; AVX2-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
664; AVX2-NEXT:    retq
666  %a = load <8 x i32>, <8 x i32> * %pa
667  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
668  %res = add <8 x i32> %shuffle, <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
669  ret <8 x i32> %res
; Integer (i32) version of the "load second operand" case: high half of %a followed by the
; high half of the vector loaded from %pb (mask 4..7,12..15), then <1,2,3,4,1,2,3,4> is added.
; The AVX1 run expects two 128-bit vpaddd (one with the load folded at offset 16) combined
; with vinsertf128; the AVX2 run expects the load folded into vperm2i128 followed by a
; 256-bit vpaddd.
672define <8 x i32> @ld1_hi0_hi1_8i32(<8 x i32> %a, <8 x i32> * %pb) nounwind uwtable readnone ssp {
673; AVX1-LABEL: ld1_hi0_hi1_8i32:
674; AVX1:       # %bb.0: # %entry
675; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,2,3,4]
676; AVX1-NEXT:    vpaddd 16(%rdi), %xmm1, %xmm2
677; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
678; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
679; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
680; AVX1-NEXT:    retq
682; AVX2-LABEL: ld1_hi0_hi1_8i32:
683; AVX2:       # %bb.0: # %entry
684; AVX2-NEXT:    vperm2i128 $49, (%rdi), %ymm0, %ymm0 # ymm0 = ymm0[2,3],mem[2,3]
685; AVX2-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
686; AVX2-NEXT:    retq
688  %b = load <8 x i32>, <8 x i32> * %pb
689  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
690  %res = add <8 x i32> %shuffle, <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
691  ret <8 x i32> %res
; Test named after PR50053. Loads one <4 x i64> (%3) from %1 and the two <2 x i64> halves
; (%6, %8) of the vector that follows it, then stores two results to %0:
;   %0[0] = low half of %3 followed by %6   (mask 0,1,4,5)
;   %0[1] = %8 followed by the high half of %3 (mask 0,1,6,7)
; The 128-bit loads are first widened to 256 bits with undef upper lanes so they can be used
; as shufflevector operands. Both runs expect a vperm2f128 for the first store and a vblendps
; for the second.
694define void @PR50053(<4 x i64>* nocapture %0, <4 x i64>* nocapture readonly %1) {
695; ALL-LABEL: PR50053:
696; ALL:       # %bb.0:
697; ALL-NEXT:    vmovaps (%rsi), %ymm0
698; ALL-NEXT:    vmovaps 32(%rsi), %xmm1
699; ALL-NEXT:    vmovaps 48(%rsi), %xmm2
700; ALL-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[0,1],ymm1[0,1]
701; ALL-NEXT:    vmovaps %ymm1, (%rdi)
702; ALL-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
703; ALL-NEXT:    vmovaps %ymm0, 32(%rdi)
704; ALL-NEXT:    vzeroupper
705; ALL-NEXT:    retq
706  %3 = load <4 x i64>, <4 x i64>* %1, align 32
707  %4 = getelementptr inbounds <4 x i64>, <4 x i64>* %1, i64 1
708  %5 = bitcast <4 x i64>* %4 to <2 x i64>*
709  %6 = load <2 x i64>, <2 x i64>* %5, align 16
710  %7 = getelementptr inbounds <2 x i64>, <2 x i64>* %5, i64 1
711  %8 = load <2 x i64>, <2 x i64>* %7, align 16
712  %9 = shufflevector <2 x i64> %6, <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
713  %10 = shufflevector <4 x i64> %3, <4 x i64> %9, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
714  store <4 x i64> %10, <4 x i64>* %0, align 32
715  %11 = shufflevector <2 x i64> %8, <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
716  %12 = shufflevector <4 x i64> %11, <4 x i64> %3, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
717  %13 = getelementptr inbounds <4 x i64>, <4 x i64>* %0, i64 1
718  store <4 x i64> %12, <4 x i64>* %13, align 32
719  ret void
; Module-level profile summary (module flag "ProfileSummary", InstrProf format) plus the
; function-entry-count node !14. !14 records an entry count of 0 and is the node attached
; via !prof to the two *_pgso functions in this file.
722!llvm.module.flags = !{!0}
723!0 = !{i32 1, !"ProfileSummary", !1}
724!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
725!2 = !{!"ProfileFormat", !"InstrProf"}
726!3 = !{!"TotalCount", i64 10000}
727!4 = !{!"MaxCount", i64 10}
728!5 = !{!"MaxInternalCount", i64 1}
729!6 = !{!"MaxFunctionCount", i64 1000}
730!7 = !{!"NumCounts", i64 3}
731!8 = !{!"NumFunctions", i64 3}
732!9 = !{!"DetailedSummary", !10}
733!10 = !{!11, !12, !13}
734!11 = !{i32 10000, i64 100, i32 1}
735!12 = !{i32 999000, i64 100, i32 1}
736!13 = !{i32 999999, i64 1, i32 2}
737!14 = !{!"function_entry_count", i64 0}