; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL

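; These tests exercise the lowering of strided extractions that start at a
; non-zero offset: each function loads a 256-bit vector, keeps every 2nd, 4th,
; or 8th element beginning at the offset named in the function, and stores the
; narrowed result.

; Extract the odd bytes (stride 2, offset 1) of a <32 x i8>: the load is split
; into two 128-bit halves, each half is compacted with a shared vpshufb mask,
; and the two 8-byte results are joined with vpunpcklqdq.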
define void @shuffle_v32i8_to_v16i8_1(<32 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v16i8_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v32i8_to_v16i8_1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
  store <16 x i8> %strided.vec, <16 x i8>* %S
  ret void
}

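; Extract the odd words (stride 2, offset 1) of a <16 x i16>. AVX512BW lowers
; this to a single vpermt2w whose index vector addresses the concatenation of
; both sources: entries 33,35,37,39 are 32+1, 32+3, 32+5, 32+7, i.e. the odd
; words of the upper 128-bit half.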
define void @shuffle_v16i16_to_v8i16_1(<16 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm0 = [1,3,5,7,33,35,37,39]
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm1
; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm2
; AVX512BW-NEXT:    vpermt2w %zmm2, %zmm0, %zmm1
; AVX512BW-NEXT:    vmovdqa %xmm1, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,3,5,7,9,11,13,15]
; AVX512BWVL-NEXT:    vpermi2w 16(%rdi), %xmm0, %xmm1
; AVX512BWVL-NEXT:    vmovdqa %xmm1, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i16>, <16 x i16>* %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  store <8 x i16> %strided.vec, <8 x i16>* %S
  ret void
}

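; Extract the odd dwords (stride 2, offset 1) of a <8 x i32>: a single
; vshufps, with the upper half folded as a memory operand, handles every
; subtarget.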
define void @shuffle_v8i32_to_v4i32_1(<8 x i32>* %L, <4 x i32>* %S) nounwind {
; AVX-LABEL: shuffle_v8i32_to_v4i32_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps (%rdi), %xmm0
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],mem[1,3]
; AVX-NEXT:    vmovaps %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v8i32_to_v4i32_1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovaps (%rdi), %xmm0
; AVX512-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],mem[1,3]
; AVX512-NEXT:    vmovaps %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <8 x i32>, <8 x i32>* %L
  %strided.vec = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  store <4 x i32> %strided.vec, <4 x i32>* %S
  ret void
}

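; Extract every 4th byte of a <32 x i8> starting at offset 1 (bytes
; 1,5,9,13,17,21,25,29): each 16-byte half is compacted to 4 bytes with a
; shared vpshufb mask, the halves are joined with vpunpckldq, and the 8-byte
; result is stored with vmovq.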
define void @shuffle_v32i8_to_v8i8_1(<32 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT:    vmovq %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512-NEXT:    vmovq %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

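; As above at offset 2: bytes 2,6,10,14,18,22,26,30.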
define void @shuffle_v32i8_to_v8i8_2(<32 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT:    vmovq %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512-NEXT:    vmovq %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

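; As above at offset 3: bytes 3,7,11,15,19,23,27,31.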
define void @shuffle_v32i8_to_v8i8_3(<32 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT:    vmovq %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512-NEXT:    vmovq %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

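; Extract every 4th word of a <16 x i16> starting at offset 1 (words 1,5,9,13).
; Subtargets without fast variable shuffles use fixed vpshufd/vpshuflw
; permutes; those with them share one vpshufb mask; AVX512BW folds everything
; into a single vpermt2w/vpermi2w.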
define void @shuffle_v16i16_to_v4i16_1(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX1-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX1-NEXT:    vmovq %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX2-SLOW-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-FAST-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-FAST-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-FAST-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-FAST-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm0 = [1,5,33,37,4,5,36,37]
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm1
; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm2
; AVX512BW-NEXT:    vpermt2w %zmm2, %zmm0, %zmm1
; AVX512BW-NEXT:    vmovq %xmm1, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm1 = <1,5,9,13,u,u,u,u>
; AVX512BWVL-NEXT:    vpermi2w 16(%rdi), %xmm0, %xmm1
; AVX512BWVL-NEXT:    vmovq %xmm1, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i16>, <16 x i16>* %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
  store <4 x i16> %strided.vec, <4 x i16>* %S
  ret void
}

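; As above at offset 2: words 2,6,10,14.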
define void @shuffle_v16i16_to_v4i16_2(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX1-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX1-NEXT:    vmovq %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX2-SLOW-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-FAST-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = [4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-FAST-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-FAST-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-FAST-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm0 = [2,6,34,38,2,3,34,35]
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm1
; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm2
; AVX512BW-NEXT:    vpermt2w %zmm2, %zmm0, %zmm1
; AVX512BW-NEXT:    vmovq %xmm1, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm1 = <2,6,10,14,u,u,u,u>
; AVX512BWVL-NEXT:    vpermi2w 16(%rdi), %xmm0, %xmm1
; AVX512BWVL-NEXT:    vmovq %xmm1, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i16>, <16 x i16>* %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
  store <4 x i16> %strided.vec, <4 x i16>* %S
  ret void
}

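; As above at offset 3: words 3,7,11,15.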
define void @shuffle_v16i16_to_v4i16_3(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX1-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX1-NEXT:    vmovq %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX2-SLOW-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-FAST-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = [6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-FAST-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-FAST-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-FAST-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm0 = [3,7,35,39,2,3,34,35]
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm1
; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm2
; AVX512BW-NEXT:    vpermt2w %zmm2, %zmm0, %zmm1
; AVX512BW-NEXT:    vmovq %xmm1, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm1 = <3,7,11,15,u,u,u,u>
; AVX512BWVL-NEXT:    vpermi2w 16(%rdi), %xmm0, %xmm1
; AVX512BWVL-NEXT:    vmovq %xmm1, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i16>, <16 x i16>* %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
  store <4 x i16> %strided.vec, <4 x i16>* %S
  ret void
}

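; Extract every 8th byte of a <32 x i8>, offsets 1 through 7: vpshufb pulls
; two bytes from each half, vpunpcklwd joins them, and vmovd stores the
; 4-byte result.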
define void @shuffle_v32i8_to_v4i8_1(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v4i8_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT:    vmovd %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v32i8_to_v4i8_1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-NEXT:    vmovd %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 1, i32 9, i32 17, i32 25>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

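; As above at offset 2: bytes 2,10,18,26.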
define void @shuffle_v32i8_to_v4i8_2(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v4i8_2:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT:    vmovd %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v32i8_to_v4i8_2:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-NEXT:    vmovd %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 2, i32 10, i32 18, i32 26>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

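; As above at offset 3: bytes 3,11,19,27.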
define void @shuffle_v32i8_to_v4i8_3(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v4i8_3:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT:    vmovd %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v32i8_to_v4i8_3:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-NEXT:    vmovd %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 3, i32 11, i32 19, i32 27>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

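; As above at offset 4: bytes 4,12,20,28.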
define void @shuffle_v32i8_to_v4i8_4(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v4i8_4:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT:    vmovd %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v32i8_to_v4i8_4:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-NEXT:    vmovd %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 4, i32 12, i32 20, i32 28>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

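; As above at offset 5: bytes 5,13,21,29.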
define void @shuffle_v32i8_to_v4i8_5(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v4i8_5:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT:    vmovd %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v32i8_to_v4i8_5:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-NEXT:    vmovd %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 5, i32 13, i32 21, i32 29>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

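; As above at offset 6: bytes 6,14,22,30.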
define void @shuffle_v32i8_to_v4i8_6(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v4i8_6:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT:    vmovd %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v32i8_to_v4i8_6:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-NEXT:    vmovd %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 6, i32 14, i32 22, i32 30>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

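; As above at offset 7: bytes 7,15,23,31.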
define void @shuffle_v32i8_to_v4i8_7(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v4i8_7:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT:    vmovd %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v32i8_to_v4i8_7:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-NEXT:    vmovd %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 7, i32 15, i32 23, i32 31>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}