; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefix=AVX512VL --check-prefix=AVX512VL-FAST-ALL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefix=AVX512VL --check-prefix=AVX512VL-FAST-PERLANE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefix=VL_BW_DQ --check-prefix=VL_BW_DQ-FAST-ALL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+avx512dq,+fast-variable-perlane-shuffle | FileCheck %s --check-prefix=VL_BW_DQ --check-prefix=VL_BW_DQ-FAST-PERLANE

; Swap the two lanes of a <2 x i1> vector (shuffle mask <1, 0>).
define <2 x i1> @shuf2i1_1_0(<2 x i1> %a) {
; AVX512F-LABEL: shuf2i1_1_0:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsllq $63, %xmm0, %xmm0
; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf2i1_1_0:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsllq $63, %xmm0, %xmm0
; AVX512VL-NEXT:    vptestmq %xmm0, %xmm0, %k1
; AVX512VL-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT:    vmovdqa64 %xmm0, %xmm1 {%k1} {z}
; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX512VL-NEXT:    vptestmq %xmm1, %xmm1, %k1
; AVX512VL-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf2i1_1_0:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    vpsllq $63, %xmm0, %xmm0
; VL_BW_DQ-NEXT:    vpmovq2m %xmm0, %k0
; VL_BW_DQ-NEXT:    vpmovm2q %k0, %xmm0
; VL_BW_DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; VL_BW_DQ-NEXT:    vpmovq2m %xmm0, %k0
; VL_BW_DQ-NEXT:    vpmovm2q %k0, %xmm0
; VL_BW_DQ-NEXT:    retq
  %b = shufflevector <2 x i1> %a, <2 x i1> undef, <2 x i32> <i32 1, i32 0>
  ret <2 x i1> %b
}

; Two-input <2 x i1> shuffle: lane 1 of %a followed by lane 0 of the constant
; vector <1, 0> (shuffle mask <1, 2>).
define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) {
; AVX512F-LABEL: shuf2i1_1_2:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsllq $63, %xmm0, %xmm0
; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm1 = [18446744073709551615,0]
; AVX512F-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf2i1_1_2:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsllq $63, %xmm0, %xmm0
; AVX512VL-NEXT:    vptestmq %xmm0, %xmm0, %k1
; AVX512VL-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT:    vmovdqa64 %xmm0, %xmm1 {%k1} {z}
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [18446744073709551615,0]
; AVX512VL-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
; AVX512VL-NEXT:    vptestmq %xmm1, %xmm1, %k1
; AVX512VL-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf2i1_1_2:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    vpsllq $63, %xmm0, %xmm0
; VL_BW_DQ-NEXT:    vpmovq2m %xmm0, %k0
; VL_BW_DQ-NEXT:    vpmovm2q %k0, %xmm0
; VL_BW_DQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [18446744073709551615,0]
; VL_BW_DQ-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
; VL_BW_DQ-NEXT:    vpmovq2m %xmm0, %k0
; VL_BW_DQ-NEXT:    vpmovm2q %k0, %xmm0
; VL_BW_DQ-NEXT:    retq
  %b = shufflevector <2 x i1> %a, <2 x i1> <i1 1, i1 0>, <2 x i32> <i32 1, i32 2>
  ret <2 x i1> %b
}


; Reverse the four lanes of a <4 x i1> vector (shuffle mask <3, 2, 1, 0>).
define <4 x i1> @shuf4i1_3_2_10(<4 x i1> %a) {
; AVX512F-LABEL: shuf4i1_3_2_10:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf4i1_3_2_10:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX512VL-NEXT:    vptestmd %xmm0, %xmm0, %k1
; AVX512VL-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT:    vmovdqa32 %xmm0, %xmm1 {%k1} {z}
; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,2,1,0]
; AVX512VL-NEXT:    vptestmd %xmm1, %xmm1, %k1
; AVX512VL-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf4i1_3_2_10:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    vpslld $31, %xmm0, %xmm0
; VL_BW_DQ-NEXT:    vpmovd2m %xmm0, %k0
; VL_BW_DQ-NEXT:    vpmovm2d %k0, %xmm0
; VL_BW_DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
; VL_BW_DQ-NEXT:    vpmovd2m %xmm0, %k0
; VL_BW_DQ-NEXT:    vpmovm2d %k0, %xmm0
; VL_BW_DQ-NEXT:    retq
  %b = shufflevector <4 x i1> %a, <4 x i1> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ret <4 x i1> %b
}

; Shuffle of two <8 x i1> compare results. Every mask index is below 8, so
; only lanes of the first compare (%a2) are selected.
define <8 x i1> @shuf8i1_3_6_1_0_3_7_7_0(<8 x i64> %a, <8 x i64> %b, <8 x i64> %a1, <8 x i64> %b1) {
; AVX512F-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1
; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0]
; AVX512F-NEXT:    vpermq %zmm0, %zmm1, %zmm0
; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1
; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k1} {z}
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,6,1,0,3,7,7,0]
; AVX512VL-NEXT:    vpermd %ymm1, %ymm2, %ymm1
; AVX512VL-NEXT:    vptestmd %ymm1, %ymm1, %k1
; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512VL-NEXT:    vpmovdw %ymm0, %xmm0
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    vpcmpeqq %zmm2, %zmm0, %k0
; VL_BW_DQ-NEXT:    vpmovm2d %k0, %ymm0
; VL_BW_DQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,6,1,0,3,7,7,0]
; VL_BW_DQ-NEXT:    vpermd %ymm0, %ymm1, %ymm0
; VL_BW_DQ-NEXT:    vpmovd2m %ymm0, %k0
; VL_BW_DQ-NEXT:    vpmovm2w %k0, %xmm0
; VL_BW_DQ-NEXT:    vzeroupper
; VL_BW_DQ-NEXT:    retq
  %a2 = icmp eq <8 x i64> %a, %a1
  %b2 = icmp eq <8 x i64> %b, %b1
  %c = shufflevector <8 x i1> %a2, <8 x i1> %b2, <8 x i32> <i32 3, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 0>
  ret <8 x i1> %c
}

; Two-input shuffle of <16 x i1> compare results. Mask indices 22 and 21
; select lanes of the second compare (%b2); all others select from %a2.
define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<16 x i32> %a, <16 x i32> %b, <16 x i32> %a1, <16 x i32> %b1) {
; AVX512F-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1
; AVX512F-NEXT:    vpcmpeqd %zmm3, %zmm1, %k2
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512F-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
; AVX512F-NEXT:    vptestmd %zmm2, %zmm2, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1
; AVX512VL-NEXT:    vpcmpeqd %zmm3, %zmm1, %k2
; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512VL-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512VL-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
; AVX512VL-NEXT:    vptestmd %zmm2, %zmm2, %k1
; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    vpcmpeqd %zmm2, %zmm0, %k0
; VL_BW_DQ-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1
; VL_BW_DQ-NEXT:    vpmovm2d %k1, %zmm0
; VL_BW_DQ-NEXT:    vpmovm2d %k0, %zmm1
; VL_BW_DQ-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; VL_BW_DQ-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
; VL_BW_DQ-NEXT:    vpmovd2m %zmm2, %k0
; VL_BW_DQ-NEXT:    vpmovm2b %k0, %xmm0
; VL_BW_DQ-NEXT:    vzeroupper
; VL_BW_DQ-NEXT:    retq
  %a2 = icmp eq <16 x i32> %a, %a1
  %b2 = icmp eq <16 x i32> %b, %b1
  %c = shufflevector <16 x i1> %a2, <16 x i1> %b2, <16 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
  ret <16 x i1> %c
}

; Single-input <32 x i1> shuffle whose mask repeats the same 16 indices in
; both halves of the result.
define <32 x i1> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<32 x i1> %a) {
; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm1
; AVX512F-NEXT:    vpslld $31, %zmm1, %zmm1
; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k1
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT:    vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k2
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512F-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
; AVX512F-NEXT:    vptestmd %zmm2, %zmm2, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpmovsxbd %xmm0, %zmm1
; AVX512VL-NEXT:    vpslld $31, %zmm1, %zmm1
; AVX512VL-NEXT:    vptestmd %zmm1, %zmm1, %k1
; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm0
; AVX512VL-NEXT:    vpmovsxbd %xmm0, %zmm0
; AVX512VL-NEXT:    vpslld $31, %zmm0, %zmm0
; AVX512VL-NEXT:    vptestmd %zmm0, %zmm0, %k2
; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512VL-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512VL-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
; AVX512VL-NEXT:    vptestmd %zmm2, %zmm2, %k1
; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    vpsllw $7, %ymm0, %ymm0
; VL_BW_DQ-NEXT:    vpmovb2m %ymm0, %k0
; VL_BW_DQ-NEXT:    vpmovm2w %k0, %zmm0
; VL_BW_DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; VL_BW_DQ-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
; VL_BW_DQ-NEXT:    vpermw %zmm0, %zmm1, %zmm0
; VL_BW_DQ-NEXT:    vpmovw2m %zmm0, %k0
; VL_BW_DQ-NEXT:    vpmovm2b %k0, %ymm0
; VL_BW_DQ-NEXT:    retq
  %b = shufflevector <32 x i1> %a, <32 x i1> undef, <32 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0, i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
  ret <32 x i1> %b
}

; The <32 x i1> result of comparing a <32 x i16> input with zero is shuffled
; and then used as the condition of a <32 x i16> select between %c and %d.
define <32 x i16> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16(<32 x i16> %a, <32 x i16> %c, <32 x i16> %d) {
; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX512F-NEXT:    vpcmpeqw %ymm3, %ymm0, %ymm4
; AVX512F-NEXT:    vpmovsxwd %ymm4, %zmm4
; AVX512F-NEXT:    vptestmd %zmm4, %zmm4, %k1
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; AVX512F-NEXT:    vpcmpeqw %ymm3, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k2
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512F-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512F-NEXT:    vpermi2d %zmm0, %zmm3, %zmm4
; AVX512F-NEXT:    vptestmd %zmm4, %zmm4, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512F-NEXT:    vpternlogq $202, %zmm2, %zmm1, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX512VL-NEXT:    vpcmpeqw %ymm3, %ymm0, %ymm4
; AVX512VL-NEXT:    vpmovsxwd %ymm4, %zmm4
; AVX512VL-NEXT:    vptestmd %zmm4, %zmm4, %k1
; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; AVX512VL-NEXT:    vpcmpeqw %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512VL-NEXT:    vptestmd %zmm0, %zmm0, %k2
; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512VL-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512VL-NEXT:    vpermi2d %zmm0, %zmm3, %zmm4
; AVX512VL-NEXT:    vptestmd %zmm4, %zmm4, %k1
; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512VL-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512VL-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512VL-NEXT:    vpternlogq $202, %zmm2, %zmm1, %zmm0
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    vptestnmw %zmm0, %zmm0, %k0
; VL_BW_DQ-NEXT:    vpmovm2w %k0, %zmm0
; VL_BW_DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; VL_BW_DQ-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
; VL_BW_DQ-NEXT:    vpermw %zmm0, %zmm3, %zmm0
; VL_BW_DQ-NEXT:    vpmovw2m %zmm0, %k1
; VL_BW_DQ-NEXT:    vpblendmw %zmm1, %zmm2, %zmm0 {%k1}
; VL_BW_DQ-NEXT:    retq
  %cmp = icmp eq <32 x i16> %a, zeroinitializer
  %shuf = shufflevector <32 x i1> %cmp, <32 x i1> undef, <32 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0, i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
  %sel = select <32 x i1> %shuf, <32 x i16> %c, <32 x i16> %d
  ret <32 x i16> %sel
}

; The <32 x i1> result of comparing a <32 x i8> input with zero is shuffled
; and then used as the condition of a <32 x i8> select between %c and %d.
define <32 x i8> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8(<32 x i8> %a, <32 x i8> %c, <32 x i8> %d) {
; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX512F-NEXT:    vpcmpeqb %ymm3, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm3
; AVX512F-NEXT:    vptestmd %zmm3, %zmm3, %k1
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k2
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512F-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512F-NEXT:    vpermi2d %zmm0, %zmm3, %zmm4
; AVX512F-NEXT:    vptestmd %zmm4, %zmm4, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512F-NEXT:    vpblendvb %ymm0, %ymm1, %ymm2, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX512VL-NEXT:    vpcmpeqb %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT:    vpmovsxbd %xmm0, %zmm3
; AVX512VL-NEXT:    vptestmd %zmm3, %zmm3, %k1
; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm0
; AVX512VL-NEXT:    vpmovsxbd %xmm0, %zmm0
; AVX512VL-NEXT:    vptestmd %zmm0, %zmm0, %k2
; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512VL-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512VL-NEXT:    vpermi2d %zmm0, %zmm3, %zmm4
; AVX512VL-NEXT:    vptestmd %zmm4, %zmm4, %k1
; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512VL-NEXT:    vpblendvb %ymm0, %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    vptestnmb %ymm0, %ymm0, %k0
; VL_BW_DQ-NEXT:    vpmovm2w %k0, %zmm0
; VL_BW_DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; VL_BW_DQ-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
; VL_BW_DQ-NEXT:    vpermw %zmm0, %zmm3, %zmm0
; VL_BW_DQ-NEXT:    vpmovw2m %zmm0, %k1
; VL_BW_DQ-NEXT:    vpblendmb %ymm1, %ymm2, %ymm0 {%k1}
; VL_BW_DQ-NEXT:    retq
  %cmp = icmp eq <32 x i8> %a, zeroinitializer
  %shuf = shufflevector <32 x i1> %cmp, <32 x i1> undef, <32 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0, i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
  %sel = select <32 x i1> %shuf, <32 x i8> %c, <32 x i8> %d
  ret <32 x i8> %sel
}

; Two <16 x i1> compare-with-zero results of <16 x i32> inputs are
; concatenated into a <32 x i1>, shuffled, and used as the condition of a
; <32 x i16> select between %c and %d.
define <32 x i16> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split(<16 x i32> %a, <16 x i32> %b, <32 x i16> %c, <32 x i16> %d) {
; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k1
; AVX512F-NEXT:    vptestnmd %zmm1, %zmm1, %k2
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512F-NEXT:    vpermi2d %zmm0, %zmm1, %zmm4
; AVX512F-NEXT:    vptestmd %zmm4, %zmm4, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512F-NEXT:    vpternlogq $202, %zmm3, %zmm2, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vptestnmd %zmm0, %zmm0, %k1
; AVX512VL-NEXT:    vptestnmd %zmm1, %zmm1, %k2
; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512VL-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512VL-NEXT:    vpermi2d %zmm0, %zmm1, %zmm4
; AVX512VL-NEXT:    vptestmd %zmm4, %zmm4, %k1
; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512VL-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512VL-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512VL-NEXT:    vpternlogq $202, %zmm3, %zmm2, %zmm0
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    vptestnmd %zmm0, %zmm0, %k0
; VL_BW_DQ-NEXT:    vptestnmd %zmm1, %zmm1, %k1
; VL_BW_DQ-NEXT:    kunpckwd %k0, %k1, %k0
; VL_BW_DQ-NEXT:    vpmovm2w %k0, %zmm0
; VL_BW_DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; VL_BW_DQ-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
; VL_BW_DQ-NEXT:    vpermw %zmm0, %zmm1, %zmm0
; VL_BW_DQ-NEXT:    vpmovw2m %zmm0, %k1
; VL_BW_DQ-NEXT:    vpblendmw %zmm2, %zmm3, %zmm0 {%k1}
; VL_BW_DQ-NEXT:    retq
  %cmp1 = icmp eq <16 x i32> %a, zeroinitializer
  %cmp2 = icmp eq <16 x i32> %b, zeroinitializer
  %concat = shufflevector <16 x i1> %cmp1, <16 x i1> %cmp2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %shuf = shufflevector <32 x i1> %concat, <32 x i1> undef, <32 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0, i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
  %sel = select <32 x i1> %shuf, <32 x i16> %c, <32 x i16> %d
  ret <32 x i16> %sel
}

; Two <16 x i1> compare-with-zero results of <16 x i32> inputs are
; concatenated into a <32 x i1>, shuffled, and used as the condition of a
; <32 x i8> select between %c and %d.
define <32 x i8> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8_split(<16 x i32> %a, <16 x i32> %b, <32 x i8> %c, <32 x i8> %d) {
; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8_split:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k1
; AVX512F-NEXT:    vptestnmd %zmm1, %zmm1, %k2
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512F-NEXT:    vpermi2d %zmm0, %zmm1, %zmm4
; AVX512F-NEXT:    vptestmd %zmm4, %zmm4, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512F-NEXT:    vpblendvb %ymm0, %ymm2, %ymm3, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8_split:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vptestnmd %zmm0, %zmm0, %k1
; AVX512VL-NEXT:    vptestnmd %zmm1, %zmm1, %k2
; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512VL-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512VL-NEXT:    vpermi2d %zmm0, %zmm1, %zmm4
; AVX512VL-NEXT:    vptestmd %zmm4, %zmm4, %k1
; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512VL-NEXT:    vpblendvb %ymm0, %ymm2, %ymm3, %ymm0
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8_split:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    vptestnmd %zmm0, %zmm0, %k0
; VL_BW_DQ-NEXT:    vptestnmd %zmm1, %zmm1, %k1
; VL_BW_DQ-NEXT:    kunpckwd %k0, %k1, %k0
; VL_BW_DQ-NEXT:    vpmovm2w %k0, %zmm0
; VL_BW_DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; VL_BW_DQ-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
; VL_BW_DQ-NEXT:    vpermw %zmm0, %zmm1, %zmm0
; VL_BW_DQ-NEXT:    vpmovw2m %zmm0, %k1
; VL_BW_DQ-NEXT:    vpblendmb %ymm2, %ymm3, %ymm0 {%k1}
; VL_BW_DQ-NEXT:    retq
  %cmp1 = icmp eq <16 x i32> %a, zeroinitializer
  %cmp2 = icmp eq <16 x i32> %b, zeroinitializer
  %concat = shufflevector <16 x i1> %cmp1, <16 x i1> %cmp2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %shuf = shufflevector <32 x i1> %concat, <32 x i1> undef, <32 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0, i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
  %sel = select <32 x i1> %shuf, <32 x i8> %c, <32 x i8> %d
  ret <32 x i8> %sel
}

; Shuffle of an i8 bitcast to <8 x i1>: every defined mask entry selects
; lane 2 and the remaining entries are undef. The fast-crosslane and
; perlane-only configurations have separate expected output here.
define <8 x i1> @shuf8i1_u_2_u_u_2_u_2_u(i8 %a) {
; AVX512F-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
; AVX512F-NEXT:    vpbroadcastq %xmm0, %zmm0
; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-FAST-ALL-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
; AVX512VL-FAST-ALL:       # %bb.0:
; AVX512VL-FAST-ALL-NEXT:    kmovw %edi, %k1
; AVX512VL-FAST-ALL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512VL-FAST-ALL-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k1} {z}
; AVX512VL-FAST-ALL-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [2,2,2,2,2,2,2,2]
; AVX512VL-FAST-ALL-NEXT:    vpermd %ymm1, %ymm2, %ymm1
; AVX512VL-FAST-ALL-NEXT:    vpslld $31, %ymm1, %ymm1
; AVX512VL-FAST-ALL-NEXT:    vptestmd %ymm1, %ymm1, %k1
; AVX512VL-FAST-ALL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512VL-FAST-ALL-NEXT:    vpmovdw %ymm0, %xmm0
; AVX512VL-FAST-ALL-NEXT:    vzeroupper
; AVX512VL-FAST-ALL-NEXT:    retq
;
; AVX512VL-FAST-PERLANE-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
; AVX512VL-FAST-PERLANE:       # %bb.0:
; AVX512VL-FAST-PERLANE-NEXT:    kmovw %edi, %k1
; AVX512VL-FAST-PERLANE-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512VL-FAST-PERLANE-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k1} {z}
; AVX512VL-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
; AVX512VL-FAST-PERLANE-NEXT:    vpbroadcastq %xmm1, %ymm1
; AVX512VL-FAST-PERLANE-NEXT:    vpslld $31, %ymm1, %ymm1
; AVX512VL-FAST-PERLANE-NEXT:    vptestmd %ymm1, %ymm1, %k1
; AVX512VL-FAST-PERLANE-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512VL-FAST-PERLANE-NEXT:    vpmovdw %ymm0, %xmm0
; AVX512VL-FAST-PERLANE-NEXT:    vzeroupper
; AVX512VL-FAST-PERLANE-NEXT:    retq
;
; VL_BW_DQ-FAST-ALL-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
; VL_BW_DQ-FAST-ALL:       # %bb.0:
; VL_BW_DQ-FAST-ALL-NEXT:    kmovd %edi, %k0
; VL_BW_DQ-FAST-ALL-NEXT:    vpmovm2d %k0, %ymm0
; VL_BW_DQ-FAST-ALL-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2]
; VL_BW_DQ-FAST-ALL-NEXT:    vpermd %ymm0, %ymm1, %ymm0
; VL_BW_DQ-FAST-ALL-NEXT:    vpmovd2m %ymm0, %k0
; VL_BW_DQ-FAST-ALL-NEXT:    vpmovm2w %k0, %xmm0
; VL_BW_DQ-FAST-ALL-NEXT:    vzeroupper
; VL_BW_DQ-FAST-ALL-NEXT:    retq
;
; VL_BW_DQ-FAST-PERLANE-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
; VL_BW_DQ-FAST-PERLANE:       # %bb.0:
; VL_BW_DQ-FAST-PERLANE-NEXT:    kmovd %edi, %k0
; VL_BW_DQ-FAST-PERLANE-NEXT:    vpmovm2d %k0, %ymm0
; VL_BW_DQ-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; VL_BW_DQ-FAST-PERLANE-NEXT:    vpbroadcastq %xmm0, %ymm0
; VL_BW_DQ-FAST-PERLANE-NEXT:    vpmovd2m %ymm0, %k0
; VL_BW_DQ-FAST-PERLANE-NEXT:    vpmovm2w %k0, %xmm0
; VL_BW_DQ-FAST-PERLANE-NEXT:    vzeroupper
; VL_BW_DQ-FAST-PERLANE-NEXT:    retq
  %b = bitcast i8 %a to <8 x i1>
  %c = shufflevector < 8 x i1> %b, <8 x i1>undef, <8 x i32> <i32 undef, i32 2, i32 undef, i32 undef, i32 2, i32 undef, i32 2, i32 undef>
  ret <8 x i1> %c
}

; Shuffle of an i8 bitcast to <8 x i1> with an all-zero second operand: mask
; indices 10 and 9 select zero lanes. The result is bitcast back to i8.
define i8 @shuf8i1_10_2_9_u_3_u_2_u(i8 %a) {
; AVX512F-LABEL: shuf8i1_10_2_9_u_3_u_2_u:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <8,2,10,u,3,u,2,u>
; AVX512F-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
; AVX512F-NEXT:    vpsllq $63, %zmm2, %zmm0
; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT:    kmovw %k0, %eax
; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf8i1_10_2_9_u_3_u_2_u:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    kmovw %edi, %k1
; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,2,10,3,3,2,2,3]
; AVX512VL-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT:    vpslld $31, %ymm2, %ymm0
; AVX512VL-NEXT:    vptestmd %ymm0, %ymm0, %k0
; AVX512VL-NEXT:    kmovw %k0, %eax
; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf8i1_10_2_9_u_3_u_2_u:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    kmovd %edi, %k0
; VL_BW_DQ-NEXT:    vpmovm2d %k0, %ymm0
; VL_BW_DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; VL_BW_DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,2,10,3,3,2,2,3]
; VL_BW_DQ-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
; VL_BW_DQ-NEXT:    vpmovd2m %ymm2, %k0
; VL_BW_DQ-NEXT:    kmovd %k0, %eax
; VL_BW_DQ-NEXT:    # kill: def $al killed $al killed $eax
; VL_BW_DQ-NEXT:    vzeroupper
; VL_BW_DQ-NEXT:    retq
  %b = bitcast i8 %a to <8 x i1>
  %c = shufflevector < 8 x i1> %b, <8 x i1> zeroinitializer, <8 x i32> <i32 10, i32 2, i32 9, i32 undef, i32 3, i32 undef, i32 2, i32 undef>
  %d = bitcast <8 x i1> %c to i8
  ret i8 %d
}

; Single-input shuffle of an i8 viewed as <8 x i1>: elements 0, 1, 4 and 5 are
; gathered into the low four lanes, and the upper four lanes are left undef.
; The second shuffle operand is undef; the result is bitcast back to i8.
define i8 @shuf8i1_0_1_4_5_u_u_u_u(i8 %a) {
; AVX512F-LABEL: shuf8i1_0_1_4_5_u_u_u_u:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5,0,1,0,1]
; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT:    kmovw %k0, %eax
; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf8i1_0_1_4_5_u_u_u_u:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    kmovw %edi, %k1
; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512VL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX512VL-NEXT:    vpslld $31, %ymm0, %ymm0
; AVX512VL-NEXT:    vptestmd %ymm0, %ymm0, %k0
; AVX512VL-NEXT:    kmovw %k0, %eax
; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf8i1_0_1_4_5_u_u_u_u:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    kmovd %edi, %k0
; VL_BW_DQ-NEXT:    vpmovm2d %k0, %ymm0
; VL_BW_DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; VL_BW_DQ-NEXT:    vpmovd2m %ymm0, %k0
; VL_BW_DQ-NEXT:    kmovd %k0, %eax
; VL_BW_DQ-NEXT:    # kill: def $al killed $al killed $eax
; VL_BW_DQ-NEXT:    vzeroupper
; VL_BW_DQ-NEXT:    retq
  %bits = bitcast i8 %a to <8 x i1>
  %shuf = shufflevector <8 x i1> %bits, <8 x i1> undef, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 undef, i32 undef, i32 undef, i32 undef>
  %packed = bitcast <8 x i1> %shuf to i8
  ret i8 %packed
}
643
; Fully-defined two-operand shuffle: the i8 argument (as <8 x i1>) is the first
; operand and an all-zero vector is the second. With mask
; <9, 6, 1, 0, 3, 7, 7, 0>, only lane 0 (index 9) reads the zero operand; every
; other lane reads an element of the argument. The result is returned as i8.
define i8 @shuf8i1_9_6_1_0_3_7_7_0(i8 %a) {
; AVX512F-LABEL: shuf8i1_9_6_1_0_3_7_7_0:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [8,6,1,0,3,7,7,0]
; AVX512F-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
; AVX512F-NEXT:    vptestmq %zmm2, %zmm2, %k0
; AVX512F-NEXT:    kmovw %k0, %eax
; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf8i1_9_6_1_0_3_7_7_0:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    kmovw %edi, %k1
; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,6,1,0,3,7,7,0]
; AVX512VL-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT:    vptestmd %ymm2, %ymm2, %k0
; AVX512VL-NEXT:    kmovw %k0, %eax
; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf8i1_9_6_1_0_3_7_7_0:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    kmovd %edi, %k0
; VL_BW_DQ-NEXT:    vpmovm2d %k0, %ymm0
; VL_BW_DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; VL_BW_DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,6,1,0,3,7,7,0]
; VL_BW_DQ-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
; VL_BW_DQ-NEXT:    vpmovd2m %ymm2, %k0
; VL_BW_DQ-NEXT:    kmovd %k0, %eax
; VL_BW_DQ-NEXT:    # kill: def $al killed $al killed $eax
; VL_BW_DQ-NEXT:    vzeroupper
; VL_BW_DQ-NEXT:    retq
  %bits = bitcast i8 %a to <8 x i1>
  %shuf = shufflevector <8 x i1> %bits, <8 x i1> zeroinitializer, <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 0>
  %packed = bitcast <8 x i1> %shuf to i8
  ret i8 %packed
}
689
; Operand order is swapped relative to shuf8i1_9_6_1_0_3_7_7_0: the all-zero
; vector is the FIRST shuffle operand and the i8 argument (as <8 x i1>) is the
; second. With mask <9, 6, 1, 10, 3, 7, 7, 0>, only lanes 0 and 3 (indices 9
; and 10) read the argument, taking its elements 1 and 2; all other lanes read
; the zero operand. The result is bitcast back to i8.
define i8 @shuf8i1_9_6_1_10_3_7_7_0(i8 %a) {
; AVX512F-LABEL: shuf8i1_9_6_1_10_3_7_7_0:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [9,1,2,10,4,5,6,7]
; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
; AVX512F-NEXT:    vptestmq %zmm2, %zmm2, %k0
; AVX512F-NEXT:    kmovw %k0, %eax
; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf8i1_9_6_1_10_3_7_7_0:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    kmovw %edi, %k1
; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512VL-NEXT:    vptestmd %ymm0, %ymm0, %k0
; AVX512VL-NEXT:    kmovw %k0, %eax
; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf8i1_9_6_1_10_3_7_7_0:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    kmovd %edi, %k0
; VL_BW_DQ-NEXT:    vpmovm2d %k0, %ymm0
; VL_BW_DQ-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; VL_BW_DQ-NEXT:    vpmovd2m %ymm0, %k0
; VL_BW_DQ-NEXT:    kmovd %k0, %eax
; VL_BW_DQ-NEXT:    # kill: def $al killed $al killed $eax
; VL_BW_DQ-NEXT:    vzeroupper
; VL_BW_DQ-NEXT:    retq
  %bits = bitcast i8 %a to <8 x i1>
  %shuf = shufflevector <8 x i1> zeroinitializer, <8 x i1> %bits, <8 x i32> <i32 9, i32 6, i32 1, i32 10, i32 3, i32 7, i32 7, i32 0>
  %packed = bitcast <8 x i1> %shuf to i8
  ret i8 %packed
}
731
; Shuffle whose first operand is the constant mask <1,1,0,0,1,1,0,0> and whose
; second operand is the i8 argument viewed as <8 x i1>. With mask
; <9, 6, 1, 0, 3, 7, 7, 1>, only lane 0 (index 9) reads the argument, taking
; its element 1; the remaining lanes read the constant. Result returned as i8.
; NOTE(review): the function name spells lane 3 as "10", but the shuffle mask
; below uses index 0 for that lane. The name is kept because the generated
; check lines are keyed on it.
define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8 %a) {
; AVX512F-LABEL: shuf8i1__9_6_1_10_3_7_7_1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [9,6,1,0,3,7,7,1]
; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [18446744073709551615,18446744073709551615,0,0,0,0,0,0]
; AVX512F-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
; AVX512F-NEXT:    vptestmq %zmm2, %zmm2, %k0
; AVX512F-NEXT:    kmovw %k0, %eax
; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf8i1__9_6_1_10_3_7_7_1:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    kmovw %edi, %k1
; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],mem[1,2,3,4,5,6,7]
; AVX512VL-NEXT:    vptestmd %ymm0, %ymm0, %k0
; AVX512VL-NEXT:    kmovw %k0, %eax
; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf8i1__9_6_1_10_3_7_7_1:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    kmovd %edi, %k0
; VL_BW_DQ-NEXT:    vpmovm2d %k0, %ymm0
; VL_BW_DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; VL_BW_DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],mem[1,2,3,4,5,6,7]
; VL_BW_DQ-NEXT:    vpmovd2m %ymm0, %k0
; VL_BW_DQ-NEXT:    kmovd %k0, %eax
; VL_BW_DQ-NEXT:    # kill: def $al killed $al killed $eax
; VL_BW_DQ-NEXT:    vzeroupper
; VL_BW_DQ-NEXT:    retq
  %bits = bitcast i8 %a to <8 x i1>
  %shuf = shufflevector <8 x i1> <i1 true, i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false>, <8 x i1> %bits, <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 1>
  %packed = bitcast <8 x i1> %shuf to i8
  ret i8 %packed
}
775
; Takes the <8 x i1> argument directly (no bitcast on input) and uses it as the
; SECOND operand of a shuffle whose first operand is an all-ones constant.
; With mask <9, 6, 1, 0, 3, 7, 7, 0>, only lane 0 (index 9) reads the argument,
; taking its element 1; every other lane reads a constant 1. The shuffled
; vector is bitcast to i8 and returned.
; NOTE(review): the function name spells lane 3 as "10", but the shuffle mask
; below uses index 0 for that lane. The name is kept because the generated
; check lines are keyed on it.
define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_ones(<8 x i1> %a) {
; AVX512F-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmovsxwq %xmm0, %zmm0
; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [9,1,2,3,4,5,6,7]
; AVX512F-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2
; AVX512F-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
; AVX512F-NEXT:    vptestmq %zmm2, %zmm2, %k0
; AVX512F-NEXT:    kmovw %k0, %eax
; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX512VL-NEXT:    vpslld $31, %ymm0, %ymm0
; AVX512VL-NEXT:    vptestmd %ymm0, %ymm0, %k1
; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k1} {z}
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [9,1,2,3,4,5,6,7]
; AVX512VL-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT:    vptestmd %ymm2, %ymm2, %k0
; AVX512VL-NEXT:    kmovw %k0, %eax
; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    vpsllw $15, %xmm0, %xmm0
; VL_BW_DQ-NEXT:    vpmovw2m %xmm0, %k0
; VL_BW_DQ-NEXT:    vpmovm2d %k0, %ymm0
; VL_BW_DQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [9,1,2,3,4,5,6,7]
; VL_BW_DQ-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; VL_BW_DQ-NEXT:    vpermt2d %ymm0, %ymm1, %ymm2
; VL_BW_DQ-NEXT:    vpmovd2m %ymm2, %k0
; VL_BW_DQ-NEXT:    kmovd %k0, %eax
; VL_BW_DQ-NEXT:    # kill: def $al killed $al killed $eax
; VL_BW_DQ-NEXT:    vzeroupper
; VL_BW_DQ-NEXT:    retq
  %shuf = shufflevector <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i1> %a, <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 0>
  %packed = bitcast <8 x i1> %shuf to i8
  ret i8 %packed
}
824
; Splat of element 0 across a 16-lane i1 vector: the i16 argument is bitcast to
; <16 x i1>, shuffled with an all-zero index mask (every lane selects element
; 0 of the first operand; the second operand is undef), and bitcast back to
; i16.
define i16 @shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0(i16 %a) {
; AVX512F-LABEL: shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vpbroadcastd %xmm0, %zmm0
; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT:    kmovw %k0, %eax
; AVX512F-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    kmovw %edi, %k1
; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512VL-NEXT:    vpbroadcastd %xmm0, %zmm0
; AVX512VL-NEXT:    vptestmd %zmm0, %zmm0, %k0
; AVX512VL-NEXT:    kmovw %k0, %eax
; AVX512VL-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    kmovd %edi, %k0
; VL_BW_DQ-NEXT:    vpmovm2d %k0, %zmm0
; VL_BW_DQ-NEXT:    vpbroadcastd %xmm0, %zmm0
; VL_BW_DQ-NEXT:    vpmovd2m %zmm0, %k0
; VL_BW_DQ-NEXT:    kmovd %k0, %eax
; VL_BW_DQ-NEXT:    # kill: def $ax killed $ax killed $eax
; VL_BW_DQ-NEXT:    vzeroupper
; VL_BW_DQ-NEXT:    retq
  %bits = bitcast i16 %a to <16 x i1>
  %splat = shufflevector <16 x i1> %bits, <16 x i1> undef, <16 x i32> zeroinitializer
  %packed = bitcast <16 x i1> %splat to i16
  ret i16 %packed
}
863
; 64-lane variant of the element-0 splat: the i64 argument is bitcast to
; <64 x i1>, shuffled with an all-zero index mask (every lane selects element
; 0 of the first operand; the second operand is undef), and bitcast back to
; i64.
define i64 @shuf64i1_zero(i64 %a) {
; AVX512F-LABEL: shuf64i1_zero:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vpbroadcastd %xmm0, %zmm0
; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT:    kmovw %k0, %eax
; AVX512F-NEXT:    kmovw %k0, %ecx
; AVX512F-NEXT:    shll $16, %ecx
; AVX512F-NEXT:    orl %eax, %ecx
; AVX512F-NEXT:    movq %rcx, %rax
; AVX512F-NEXT:    shlq $32, %rax
; AVX512F-NEXT:    orq %rcx, %rax
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf64i1_zero:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    kmovw %edi, %k1
; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512VL-NEXT:    vpbroadcastd %xmm0, %zmm0
; AVX512VL-NEXT:    vptestmd %zmm0, %zmm0, %k0
; AVX512VL-NEXT:    kmovw %k0, %eax
; AVX512VL-NEXT:    kmovw %k0, %ecx
; AVX512VL-NEXT:    shll $16, %ecx
; AVX512VL-NEXT:    orl %eax, %ecx
; AVX512VL-NEXT:    movq %rcx, %rax
; AVX512VL-NEXT:    shlq $32, %rax
; AVX512VL-NEXT:    orq %rcx, %rax
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf64i1_zero:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    kmovq %rdi, %k0
; VL_BW_DQ-NEXT:    vpmovm2b %k0, %zmm0
; VL_BW_DQ-NEXT:    vpbroadcastb %xmm0, %zmm0
; VL_BW_DQ-NEXT:    vpmovb2m %zmm0, %k0
; VL_BW_DQ-NEXT:    kmovq %k0, %rax
; VL_BW_DQ-NEXT:    vzeroupper
; VL_BW_DQ-NEXT:    retq
  %bits = bitcast i64 %a to <64 x i1>
  %splat = shufflevector <64 x i1> %bits, <64 x i1> undef, <64 x i32> zeroinitializer
  %packed = bitcast <64 x i1> %splat to i64
  ret i64 %packed
}
911