1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle -O2 | FileCheck %s --check-prefixes=AVX512,AVX512-FAST
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq,+fast-variable-perlane-shuffle -O2 | FileCheck %s --check-prefixes=AVX512,AVX512-FAST-PERLANE
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle -O2 | FileCheck %s --check-prefixes=AVX512NOTDQ,AVX512NOTDQ-FAST
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+fast-variable-perlane-shuffle -O2 | FileCheck %s --check-prefixes=AVX512NOTDQ,AVX512NOTDQ-FAST-PERLANE
6
; Broadcast bit 4 of the <8 x i1> mask loaded from %a0 into a <2 x i1>
; predicate, select between %a1/%a2 with it, and store the result to %a3.
; CHECK lines below are autogenerated; regenerate rather than hand-editing.
7define void @load_v8i1_broadcast_4_v2i1(<8 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
8; AVX512-LABEL: load_v8i1_broadcast_4_v2i1:
9; AVX512:       # %bb.0:
10; AVX512-NEXT:    kmovb (%rdi), %k0
11; AVX512-NEXT:    kshiftrb $4, %k0, %k0
12; AVX512-NEXT:    vpmovm2q %k0, %xmm2
13; AVX512-NEXT:    vpbroadcastq %xmm2, %xmm2
14; AVX512-NEXT:    vpmovq2m %xmm2, %k1
15; AVX512-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
16; AVX512-NEXT:    vmovapd %xmm1, (%rsi)
17; AVX512-NEXT:    retq
18;
19; AVX512NOTDQ-LABEL: load_v8i1_broadcast_4_v2i1:
20; AVX512NOTDQ:       # %bb.0:
21; AVX512NOTDQ-NEXT:    kmovw (%rdi), %k0
22; AVX512NOTDQ-NEXT:    kshiftrw $4, %k0, %k1
23; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
24; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm2, %xmm2 {%k1} {z}
25; AVX512NOTDQ-NEXT:    vpbroadcastq %xmm2, %xmm2
26; AVX512NOTDQ-NEXT:    vptestmq %xmm2, %xmm2, %k1
27; AVX512NOTDQ-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
28; AVX512NOTDQ-NEXT:    vmovapd %xmm1, (%rsi)
29; AVX512NOTDQ-NEXT:    retq
30    %d0 = load <8 x i1>, <8 x i1>* %a0
31    %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<2 x i32><i32 4,i32 4>
32    %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
33    store <2 x double> %d2, <2 x double>* %a3
34    ret void
35}
; Broadcast bit 7 (the last element) of the loaded <8 x i1> mask into a
; <2 x i1> predicate used to select between %a1 and %a2; store to %a3.
36define void @load_v8i1_broadcast_7_v2i1(<8 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
37; AVX512-LABEL: load_v8i1_broadcast_7_v2i1:
38; AVX512:       # %bb.0:
39; AVX512-NEXT:    kmovb (%rdi), %k0
40; AVX512-NEXT:    kshiftrb $6, %k0, %k0
41; AVX512-NEXT:    vpmovm2q %k0, %xmm2
42; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
43; AVX512-NEXT:    vpmovq2m %xmm2, %k1
44; AVX512-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
45; AVX512-NEXT:    vmovapd %xmm1, (%rsi)
46; AVX512-NEXT:    retq
47;
48; AVX512NOTDQ-LABEL: load_v8i1_broadcast_7_v2i1:
49; AVX512NOTDQ:       # %bb.0:
50; AVX512NOTDQ-NEXT:    kmovw (%rdi), %k0
51; AVX512NOTDQ-NEXT:    kshiftrw $6, %k0, %k1
52; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
53; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm2, %xmm2 {%k1} {z}
54; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
55; AVX512NOTDQ-NEXT:    vptestmq %xmm2, %xmm2, %k1
56; AVX512NOTDQ-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
57; AVX512NOTDQ-NEXT:    vmovapd %xmm1, (%rsi)
58; AVX512NOTDQ-NEXT:    retq
59    %d0 = load <8 x i1>, <8 x i1>* %a0
60    %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<2 x i32><i32 7,i32 7>
61    %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
62    store <2 x double> %d2, <2 x double>* %a3
63    ret void
64}
; Broadcast bit 8 of the loaded <16 x i1> mask into a <2 x i1> predicate
; used to select between %a1 and %a2; store the result to %a3.
65define void @load_v16i1_broadcast_8_v2i1(<16 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
66; AVX512-LABEL: load_v16i1_broadcast_8_v2i1:
67; AVX512:       # %bb.0:
68; AVX512-NEXT:    kmovw (%rdi), %k0
69; AVX512-NEXT:    kshiftrw $8, %k0, %k0
70; AVX512-NEXT:    vpmovm2q %k0, %xmm2
71; AVX512-NEXT:    vpbroadcastq %xmm2, %xmm2
72; AVX512-NEXT:    vpmovq2m %xmm2, %k1
73; AVX512-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
74; AVX512-NEXT:    vmovapd %xmm1, (%rsi)
75; AVX512-NEXT:    retq
76;
77; AVX512NOTDQ-LABEL: load_v16i1_broadcast_8_v2i1:
78; AVX512NOTDQ:       # %bb.0:
79; AVX512NOTDQ-NEXT:    kmovw (%rdi), %k0
80; AVX512NOTDQ-NEXT:    kshiftrw $8, %k0, %k1
81; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
82; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm2, %xmm2 {%k1} {z}
83; AVX512NOTDQ-NEXT:    vpbroadcastq %xmm2, %xmm2
84; AVX512NOTDQ-NEXT:    vptestmq %xmm2, %xmm2, %k1
85; AVX512NOTDQ-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
86; AVX512NOTDQ-NEXT:    vmovapd %xmm1, (%rsi)
87; AVX512NOTDQ-NEXT:    retq
88    %d0 = load <16 x i1>, <16 x i1>* %a0
89    %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<2 x i32><i32 8,i32 8>
90    %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
91    store <2 x double> %d2, <2 x double>* %a3
92    ret void
93}
; Broadcast bit 8 of the loaded <16 x i1> mask into a <4 x i1> predicate
; used to select between the two <4 x float> inputs; store the result to %a3.
94define void @load_v16i1_broadcast_8_v4i1(<16 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
95; AVX512-LABEL: load_v16i1_broadcast_8_v4i1:
96; AVX512:       # %bb.0:
97; AVX512-NEXT:    kmovw (%rdi), %k0
98; AVX512-NEXT:    kshiftrw $8, %k0, %k0
99; AVX512-NEXT:    vpmovm2d %k0, %xmm2
100; AVX512-NEXT:    vpbroadcastd %xmm2, %xmm2
101; AVX512-NEXT:    vpmovd2m %xmm2, %k1
102; AVX512-NEXT:    vmovaps %xmm0, %xmm1 {%k1}
103; AVX512-NEXT:    vmovaps %xmm1, (%rsi)
104; AVX512-NEXT:    retq
105;
106; AVX512NOTDQ-LABEL: load_v16i1_broadcast_8_v4i1:
107; AVX512NOTDQ:       # %bb.0:
108; AVX512NOTDQ-NEXT:    kmovw (%rdi), %k0
109; AVX512NOTDQ-NEXT:    kshiftrw $8, %k0, %k1
110; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
111; AVX512NOTDQ-NEXT:    vmovdqa32 %xmm2, %xmm2 {%k1} {z}
112; AVX512NOTDQ-NEXT:    vpbroadcastd %xmm2, %xmm2
113; AVX512NOTDQ-NEXT:    vptestmd %xmm2, %xmm2, %k1
114; AVX512NOTDQ-NEXT:    vmovaps %xmm0, %xmm1 {%k1}
115; AVX512NOTDQ-NEXT:    vmovaps %xmm1, (%rsi)
116; AVX512NOTDQ-NEXT:    retq
117    %d0 = load <16 x i1>, <16 x i1>* %a0
118    %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<4 x i32><i32 8,i32 8,i32 8,i32 8>
119    %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
120    store <4 x float> %d2, <4 x float>* %a3
121    ret void
122}
; Broadcast bit 15 (the last element) of the loaded <16 x i1> mask into a
; <2 x i1> predicate used to select between %a1 and %a2; store to %a3.
123define void @load_v16i1_broadcast_15_v2i1(<16 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
124; AVX512-LABEL: load_v16i1_broadcast_15_v2i1:
125; AVX512:       # %bb.0:
126; AVX512-NEXT:    kmovw (%rdi), %k0
127; AVX512-NEXT:    kshiftrw $14, %k0, %k0
128; AVX512-NEXT:    vpmovm2q %k0, %xmm2
129; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
130; AVX512-NEXT:    vpmovq2m %xmm2, %k1
131; AVX512-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
132; AVX512-NEXT:    vmovapd %xmm1, (%rsi)
133; AVX512-NEXT:    retq
134;
135; AVX512NOTDQ-LABEL: load_v16i1_broadcast_15_v2i1:
136; AVX512NOTDQ:       # %bb.0:
137; AVX512NOTDQ-NEXT:    kmovw (%rdi), %k0
138; AVX512NOTDQ-NEXT:    kshiftrw $14, %k0, %k1
139; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
140; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm2, %xmm2 {%k1} {z}
141; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
142; AVX512NOTDQ-NEXT:    vptestmq %xmm2, %xmm2, %k1
143; AVX512NOTDQ-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
144; AVX512NOTDQ-NEXT:    vmovapd %xmm1, (%rsi)
145; AVX512NOTDQ-NEXT:    retq
146    %d0 = load <16 x i1>, <16 x i1>* %a0
147    %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<2 x i32><i32 15,i32 15>
148    %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
149    store <2 x double> %d2, <2 x double>* %a3
150    ret void
151}
; Broadcast bit 15 of the loaded <16 x i1> mask into a <4 x i1> predicate
; used to select between the two <4 x float> inputs; store the result to %a3.
152define void @load_v16i1_broadcast_15_v4i1(<16 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
153; AVX512-LABEL: load_v16i1_broadcast_15_v4i1:
154; AVX512:       # %bb.0:
155; AVX512-NEXT:    kmovw (%rdi), %k0
156; AVX512-NEXT:    kshiftrw $12, %k0, %k0
157; AVX512-NEXT:    vpmovm2d %k0, %xmm2
158; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
159; AVX512-NEXT:    vpmovd2m %xmm2, %k1
160; AVX512-NEXT:    vmovaps %xmm0, %xmm1 {%k1}
161; AVX512-NEXT:    vmovaps %xmm1, (%rsi)
162; AVX512-NEXT:    retq
163;
164; AVX512NOTDQ-LABEL: load_v16i1_broadcast_15_v4i1:
165; AVX512NOTDQ:       # %bb.0:
166; AVX512NOTDQ-NEXT:    kmovw (%rdi), %k0
167; AVX512NOTDQ-NEXT:    kshiftrw $12, %k0, %k1
168; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
169; AVX512NOTDQ-NEXT:    vmovdqa32 %xmm2, %xmm2 {%k1} {z}
170; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
171; AVX512NOTDQ-NEXT:    vptestmd %xmm2, %xmm2, %k1
172; AVX512NOTDQ-NEXT:    vmovaps %xmm0, %xmm1 {%k1}
173; AVX512NOTDQ-NEXT:    vmovaps %xmm1, (%rsi)
174; AVX512NOTDQ-NEXT:    retq
175    %d0 = load <16 x i1>, <16 x i1>* %a0
176    %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<4 x i32><i32 15,i32 15,i32 15,i32 15>
177    %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
178    store <4 x float> %d2, <4 x float>* %a3
179    ret void
180}
; Broadcast bit 16 of the loaded <32 x i1> mask into a <2 x i1> predicate
; used to select between %a1 and %a2; store the result to %a3.
181define void @load_v32i1_broadcast_16_v2i1(<32 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
182; AVX512-LABEL: load_v32i1_broadcast_16_v2i1:
183; AVX512:       # %bb.0:
184; AVX512-NEXT:    kmovd (%rdi), %k0
185; AVX512-NEXT:    kshiftrd $16, %k0, %k0
186; AVX512-NEXT:    vpmovm2q %k0, %xmm2
187; AVX512-NEXT:    vpbroadcastq %xmm2, %xmm2
188; AVX512-NEXT:    vpmovq2m %xmm2, %k1
189; AVX512-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
190; AVX512-NEXT:    vmovapd %xmm1, (%rsi)
191; AVX512-NEXT:    retq
192;
193; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v2i1:
194; AVX512NOTDQ:       # %bb.0:
195; AVX512NOTDQ-NEXT:    kmovd (%rdi), %k0
196; AVX512NOTDQ-NEXT:    kshiftrd $16, %k0, %k1
197; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
198; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm2, %xmm2 {%k1} {z}
199; AVX512NOTDQ-NEXT:    vpbroadcastq %xmm2, %xmm2
200; AVX512NOTDQ-NEXT:    vptestmq %xmm2, %xmm2, %k1
201; AVX512NOTDQ-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
202; AVX512NOTDQ-NEXT:    vmovapd %xmm1, (%rsi)
203; AVX512NOTDQ-NEXT:    retq
204    %d0 = load <32 x i1>, <32 x i1>* %a0
205    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<2 x i32><i32 16,i32 16>
206    %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
207    store <2 x double> %d2, <2 x double>* %a3
208    ret void
209}
; Broadcast bit 16 of the loaded <32 x i1> mask into a <4 x i1> predicate
; used to select between the two <4 x float> inputs; store the result to %a3.
210define void @load_v32i1_broadcast_16_v4i1(<32 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
211; AVX512-LABEL: load_v32i1_broadcast_16_v4i1:
212; AVX512:       # %bb.0:
213; AVX512-NEXT:    kmovd (%rdi), %k0
214; AVX512-NEXT:    kshiftrd $16, %k0, %k0
215; AVX512-NEXT:    vpmovm2d %k0, %xmm2
216; AVX512-NEXT:    vpbroadcastd %xmm2, %xmm2
217; AVX512-NEXT:    vpmovd2m %xmm2, %k1
218; AVX512-NEXT:    vmovaps %xmm0, %xmm1 {%k1}
219; AVX512-NEXT:    vmovaps %xmm1, (%rsi)
220; AVX512-NEXT:    retq
221;
222; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v4i1:
223; AVX512NOTDQ:       # %bb.0:
224; AVX512NOTDQ-NEXT:    kmovd (%rdi), %k0
225; AVX512NOTDQ-NEXT:    kshiftrd $16, %k0, %k1
226; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
227; AVX512NOTDQ-NEXT:    vmovdqa32 %xmm2, %xmm2 {%k1} {z}
228; AVX512NOTDQ-NEXT:    vpbroadcastd %xmm2, %xmm2
229; AVX512NOTDQ-NEXT:    vptestmd %xmm2, %xmm2, %k1
230; AVX512NOTDQ-NEXT:    vmovaps %xmm0, %xmm1 {%k1}
231; AVX512NOTDQ-NEXT:    vmovaps %xmm1, (%rsi)
232; AVX512NOTDQ-NEXT:    retq
233    %d0 = load <32 x i1>, <32 x i1>* %a0
234    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<4 x i32><i32 16,i32 16,i32 16,i32 16>
235    %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
236    store <4 x float> %d2, <4 x float>* %a3
237    ret void
238}
; Broadcast bit 16 of the loaded <32 x i1> mask into an <8 x i1> predicate;
; the expected codegen loads the mask byte directly at offset 2 from %a0.
239define void @load_v32i1_broadcast_16_v8i1(<32 x i1>* %a0,<8 x float> %a1,<8 x float> %a2,<8 x float>* %a3) {
240; AVX512-LABEL: load_v32i1_broadcast_16_v8i1:
241; AVX512:       # %bb.0:
242; AVX512-NEXT:    kmovb 2(%rdi), %k0
243; AVX512-NEXT:    vpmovm2d %k0, %ymm2
244; AVX512-NEXT:    vpbroadcastd %xmm2, %ymm2
245; AVX512-NEXT:    vpmovd2m %ymm2, %k1
246; AVX512-NEXT:    vmovaps %ymm0, %ymm1 {%k1}
247; AVX512-NEXT:    vmovaps %ymm1, (%rsi)
248; AVX512-NEXT:    vzeroupper
249; AVX512-NEXT:    retq
250;
251; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v8i1:
252; AVX512NOTDQ:       # %bb.0:
253; AVX512NOTDQ-NEXT:    kmovw 2(%rdi), %k1
254; AVX512NOTDQ-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
255; AVX512NOTDQ-NEXT:    vmovdqa32 %ymm2, %ymm2 {%k1} {z}
256; AVX512NOTDQ-NEXT:    vpbroadcastd %xmm2, %ymm2
257; AVX512NOTDQ-NEXT:    vptestmd %ymm2, %ymm2, %k1
258; AVX512NOTDQ-NEXT:    vmovaps %ymm0, %ymm1 {%k1}
259; AVX512NOTDQ-NEXT:    vmovaps %ymm1, (%rsi)
260; AVX512NOTDQ-NEXT:    vzeroupper
261; AVX512NOTDQ-NEXT:    retq
262    %d0 = load <32 x i1>, <32 x i1>* %a0
263    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<8 x i32><i32 16,i32 16,i32 16,i32 16,i32 16,i32 16,i32 16,i32 16>
264    %d2 = select <8 x i1> %d1, <8 x float> %a1, <8 x float> %a2
265    store <8 x float> %d2, <8 x float>* %a3
266    ret void
267}
; Broadcast bit 31 (the last element) of the loaded <32 x i1> mask into a
; <2 x i1> predicate used to select between %a1 and %a2; store to %a3.
268define void @load_v32i1_broadcast_31_v2i1(<32 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
269; AVX512-LABEL: load_v32i1_broadcast_31_v2i1:
270; AVX512:       # %bb.0:
271; AVX512-NEXT:    kmovd (%rdi), %k0
272; AVX512-NEXT:    kshiftrd $30, %k0, %k0
273; AVX512-NEXT:    vpmovm2q %k0, %xmm2
274; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
275; AVX512-NEXT:    vpmovq2m %xmm2, %k1
276; AVX512-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
277; AVX512-NEXT:    vmovapd %xmm1, (%rsi)
278; AVX512-NEXT:    retq
279;
280; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v2i1:
281; AVX512NOTDQ:       # %bb.0:
282; AVX512NOTDQ-NEXT:    kmovd (%rdi), %k0
283; AVX512NOTDQ-NEXT:    kshiftrd $30, %k0, %k1
284; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
285; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm2, %xmm2 {%k1} {z}
286; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
287; AVX512NOTDQ-NEXT:    vptestmq %xmm2, %xmm2, %k1
288; AVX512NOTDQ-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
289; AVX512NOTDQ-NEXT:    vmovapd %xmm1, (%rsi)
290; AVX512NOTDQ-NEXT:    retq
291    %d0 = load <32 x i1>, <32 x i1>* %a0
292    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<2 x i32><i32 31,i32 31>
293    %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
294    store <2 x double> %d2, <2 x double>* %a3
295    ret void
296}
; Broadcast bit 31 of the loaded <32 x i1> mask into a <4 x i1> predicate
; used to select between the two <4 x float> inputs; store the result to %a3.
297define void @load_v32i1_broadcast_31_v4i1(<32 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
298; AVX512-LABEL: load_v32i1_broadcast_31_v4i1:
299; AVX512:       # %bb.0:
300; AVX512-NEXT:    kmovd (%rdi), %k0
301; AVX512-NEXT:    kshiftrd $28, %k0, %k0
302; AVX512-NEXT:    vpmovm2d %k0, %xmm2
303; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
304; AVX512-NEXT:    vpmovd2m %xmm2, %k1
305; AVX512-NEXT:    vmovaps %xmm0, %xmm1 {%k1}
306; AVX512-NEXT:    vmovaps %xmm1, (%rsi)
307; AVX512-NEXT:    retq
308;
309; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v4i1:
310; AVX512NOTDQ:       # %bb.0:
311; AVX512NOTDQ-NEXT:    kmovd (%rdi), %k0
312; AVX512NOTDQ-NEXT:    kshiftrd $28, %k0, %k1
313; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
314; AVX512NOTDQ-NEXT:    vmovdqa32 %xmm2, %xmm2 {%k1} {z}
315; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
316; AVX512NOTDQ-NEXT:    vptestmd %xmm2, %xmm2, %k1
317; AVX512NOTDQ-NEXT:    vmovaps %xmm0, %xmm1 {%k1}
318; AVX512NOTDQ-NEXT:    vmovaps %xmm1, (%rsi)
319; AVX512NOTDQ-NEXT:    retq
320    %d0 = load <32 x i1>, <32 x i1>* %a0
321    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<4 x i32><i32 31,i32 31,i32 31,i32 31>
322    %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
323    store <4 x float> %d2, <4 x float>* %a3
324    ret void
325}
; Broadcast bit 31 of the loaded <32 x i1> mask into an <8 x i1> predicate.
; This test has four expected-output variants: with/without DQ instructions,
; crossed with the fast cross-lane vs per-lane-only shuffle tunings (the
; cross-lane variants use vpermd; the per-lane variants use vpshufd+vpermq).
326define void @load_v32i1_broadcast_31_v8i1(<32 x i1>* %a0,<8 x float> %a1,<8 x float> %a2,<8 x float>* %a3) {
327; AVX512-FAST-LABEL: load_v32i1_broadcast_31_v8i1:
328; AVX512-FAST:       # %bb.0:
329; AVX512-FAST-NEXT:    kmovb 3(%rdi), %k0
330; AVX512-FAST-NEXT:    vpmovm2d %k0, %ymm2
331; AVX512-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7]
332; AVX512-FAST-NEXT:    vpermd %ymm2, %ymm3, %ymm2
333; AVX512-FAST-NEXT:    vpmovd2m %ymm2, %k1
334; AVX512-FAST-NEXT:    vmovaps %ymm0, %ymm1 {%k1}
335; AVX512-FAST-NEXT:    vmovaps %ymm1, (%rsi)
336; AVX512-FAST-NEXT:    vzeroupper
337; AVX512-FAST-NEXT:    retq
338;
339; AVX512-FAST-PERLANE-LABEL: load_v32i1_broadcast_31_v8i1:
340; AVX512-FAST-PERLANE:       # %bb.0:
341; AVX512-FAST-PERLANE-NEXT:    kmovb 3(%rdi), %k0
342; AVX512-FAST-PERLANE-NEXT:    vpmovm2d %k0, %ymm2
343; AVX512-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7]
344; AVX512-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
345; AVX512-FAST-PERLANE-NEXT:    vpmovd2m %ymm2, %k1
346; AVX512-FAST-PERLANE-NEXT:    vmovaps %ymm0, %ymm1 {%k1}
347; AVX512-FAST-PERLANE-NEXT:    vmovaps %ymm1, (%rsi)
348; AVX512-FAST-PERLANE-NEXT:    vzeroupper
349; AVX512-FAST-PERLANE-NEXT:    retq
350;
351; AVX512NOTDQ-FAST-LABEL: load_v32i1_broadcast_31_v8i1:
352; AVX512NOTDQ-FAST:       # %bb.0:
353; AVX512NOTDQ-FAST-NEXT:    movzbl 3(%rdi), %eax
354; AVX512NOTDQ-FAST-NEXT:    kmovd %eax, %k1
355; AVX512NOTDQ-FAST-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
356; AVX512NOTDQ-FAST-NEXT:    vmovdqa32 %ymm2, %ymm2 {%k1} {z}
357; AVX512NOTDQ-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7]
358; AVX512NOTDQ-FAST-NEXT:    vpermd %ymm2, %ymm3, %ymm2
359; AVX512NOTDQ-FAST-NEXT:    vptestmd %ymm2, %ymm2, %k1
360; AVX512NOTDQ-FAST-NEXT:    vmovaps %ymm0, %ymm1 {%k1}
361; AVX512NOTDQ-FAST-NEXT:    vmovaps %ymm1, (%rsi)
362; AVX512NOTDQ-FAST-NEXT:    vzeroupper
363; AVX512NOTDQ-FAST-NEXT:    retq
364;
365; AVX512NOTDQ-FAST-PERLANE-LABEL: load_v32i1_broadcast_31_v8i1:
366; AVX512NOTDQ-FAST-PERLANE:       # %bb.0:
367; AVX512NOTDQ-FAST-PERLANE-NEXT:    movzbl 3(%rdi), %eax
368; AVX512NOTDQ-FAST-PERLANE-NEXT:    kmovd %eax, %k1
369; AVX512NOTDQ-FAST-PERLANE-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
370; AVX512NOTDQ-FAST-PERLANE-NEXT:    vmovdqa32 %ymm2, %ymm2 {%k1} {z}
371; AVX512NOTDQ-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7]
372; AVX512NOTDQ-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
373; AVX512NOTDQ-FAST-PERLANE-NEXT:    vptestmd %ymm2, %ymm2, %k1
374; AVX512NOTDQ-FAST-PERLANE-NEXT:    vmovaps %ymm0, %ymm1 {%k1}
375; AVX512NOTDQ-FAST-PERLANE-NEXT:    vmovaps %ymm1, (%rsi)
376; AVX512NOTDQ-FAST-PERLANE-NEXT:    vzeroupper
377; AVX512NOTDQ-FAST-PERLANE-NEXT:    retq
378    %d0 = load <32 x i1>, <32 x i1>* %a0
379    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<8 x i32><i32 31,i32 31,i32 31,i32 31,i32 31,i32 31,i32 31,i32 31>
380    %d2 = select <8 x i1> %d1, <8 x float> %a1, <8 x float> %a2
381    store <8 x float> %d2, <8 x float>* %a3
382    ret void
383}
; Broadcast bit 32 of the loaded <64 x i1> mask into a <2 x i1> predicate
; used to select between %a1 and %a2; store the result to %a3.
384define void @load_v64i1_broadcast_32_v2i1(<64 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
385; AVX512-LABEL: load_v64i1_broadcast_32_v2i1:
386; AVX512:       # %bb.0:
387; AVX512-NEXT:    kmovq (%rdi), %k0
388; AVX512-NEXT:    kshiftrq $32, %k0, %k0
389; AVX512-NEXT:    vpmovm2q %k0, %xmm2
390; AVX512-NEXT:    vpbroadcastq %xmm2, %xmm2
391; AVX512-NEXT:    vpmovq2m %xmm2, %k1
392; AVX512-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
393; AVX512-NEXT:    vmovapd %xmm1, (%rsi)
394; AVX512-NEXT:    retq
395;
396; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v2i1:
397; AVX512NOTDQ:       # %bb.0:
398; AVX512NOTDQ-NEXT:    kmovq (%rdi), %k0
399; AVX512NOTDQ-NEXT:    kshiftrq $32, %k0, %k1
400; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
401; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm2, %xmm2 {%k1} {z}
402; AVX512NOTDQ-NEXT:    vpbroadcastq %xmm2, %xmm2
403; AVX512NOTDQ-NEXT:    vptestmq %xmm2, %xmm2, %k1
404; AVX512NOTDQ-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
405; AVX512NOTDQ-NEXT:    vmovapd %xmm1, (%rsi)
406; AVX512NOTDQ-NEXT:    retq
407    %d0 = load <64 x i1>, <64 x i1>* %a0
408    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<2 x i32><i32 32,i32 32>
409    %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
410    store <2 x double> %d2, <2 x double>* %a3
411    ret void
412}
; Broadcast bit 32 of the loaded <64 x i1> mask into a <4 x i1> predicate
; used to select between the two <4 x float> inputs; store the result to %a3.
413define void @load_v64i1_broadcast_32_v4i1(<64 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
414; AVX512-LABEL: load_v64i1_broadcast_32_v4i1:
415; AVX512:       # %bb.0:
416; AVX512-NEXT:    kmovq (%rdi), %k0
417; AVX512-NEXT:    kshiftrq $32, %k0, %k0
418; AVX512-NEXT:    vpmovm2d %k0, %xmm2
419; AVX512-NEXT:    vpbroadcastd %xmm2, %xmm2
420; AVX512-NEXT:    vpmovd2m %xmm2, %k1
421; AVX512-NEXT:    vmovaps %xmm0, %xmm1 {%k1}
422; AVX512-NEXT:    vmovaps %xmm1, (%rsi)
423; AVX512-NEXT:    retq
424;
425; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v4i1:
426; AVX512NOTDQ:       # %bb.0:
427; AVX512NOTDQ-NEXT:    kmovq (%rdi), %k0
428; AVX512NOTDQ-NEXT:    kshiftrq $32, %k0, %k1
429; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
430; AVX512NOTDQ-NEXT:    vmovdqa32 %xmm2, %xmm2 {%k1} {z}
431; AVX512NOTDQ-NEXT:    vpbroadcastd %xmm2, %xmm2
432; AVX512NOTDQ-NEXT:    vptestmd %xmm2, %xmm2, %k1
433; AVX512NOTDQ-NEXT:    vmovaps %xmm0, %xmm1 {%k1}
434; AVX512NOTDQ-NEXT:    vmovaps %xmm1, (%rsi)
435; AVX512NOTDQ-NEXT:    retq
436    %d0 = load <64 x i1>, <64 x i1>* %a0
437    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<4 x i32><i32 32,i32 32,i32 32,i32 32>
438    %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
439    store <4 x float> %d2, <4 x float>* %a3
440    ret void
441}
; Broadcast bit 32 of the loaded <64 x i1> mask into an <8 x i1> predicate;
; the expected codegen loads the mask byte directly at offset 4 from %a0.
442define void @load_v64i1_broadcast_32_v8i1(<64 x i1>* %a0,<8 x float> %a1,<8 x float> %a2,<8 x float>* %a3) {
443; AVX512-LABEL: load_v64i1_broadcast_32_v8i1:
444; AVX512:       # %bb.0:
445; AVX512-NEXT:    kmovb 4(%rdi), %k0
446; AVX512-NEXT:    vpmovm2d %k0, %ymm2
447; AVX512-NEXT:    vpbroadcastd %xmm2, %ymm2
448; AVX512-NEXT:    vpmovd2m %ymm2, %k1
449; AVX512-NEXT:    vmovaps %ymm0, %ymm1 {%k1}
450; AVX512-NEXT:    vmovaps %ymm1, (%rsi)
451; AVX512-NEXT:    vzeroupper
452; AVX512-NEXT:    retq
453;
454; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v8i1:
455; AVX512NOTDQ:       # %bb.0:
456; AVX512NOTDQ-NEXT:    kmovw 4(%rdi), %k1
457; AVX512NOTDQ-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
458; AVX512NOTDQ-NEXT:    vmovdqa32 %ymm2, %ymm2 {%k1} {z}
459; AVX512NOTDQ-NEXT:    vpbroadcastd %xmm2, %ymm2
460; AVX512NOTDQ-NEXT:    vptestmd %ymm2, %ymm2, %k1
461; AVX512NOTDQ-NEXT:    vmovaps %ymm0, %ymm1 {%k1}
462; AVX512NOTDQ-NEXT:    vmovaps %ymm1, (%rsi)
463; AVX512NOTDQ-NEXT:    vzeroupper
464; AVX512NOTDQ-NEXT:    retq
465    %d0 = load <64 x i1>, <64 x i1>* %a0
466    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<8 x i32><i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32>
467    %d2 = select <8 x i1> %d1, <8 x float> %a1, <8 x float> %a2
468    store <8 x float> %d2, <8 x float>* %a3
469    ret void
470}
; Broadcast bit 32 of the loaded <64 x i1> mask into a <16 x i1> predicate;
; the expected codegen loads the mask word directly at offset 4 from %a0.
471define void @load_v64i1_broadcast_32_v16i1(<64 x i1>* %a0,<16 x float> %a1,<16 x float> %a2,<16 x float>* %a3) {
472; AVX512-LABEL: load_v64i1_broadcast_32_v16i1:
473; AVX512:       # %bb.0:
474; AVX512-NEXT:    kmovw 4(%rdi), %k0
475; AVX512-NEXT:    vpmovm2d %k0, %zmm2
476; AVX512-NEXT:    vpbroadcastd %xmm2, %zmm2
477; AVX512-NEXT:    vpmovd2m %zmm2, %k1
478; AVX512-NEXT:    vmovaps %zmm0, %zmm1 {%k1}
479; AVX512-NEXT:    vmovaps %zmm1, (%rsi)
480; AVX512-NEXT:    vzeroupper
481; AVX512-NEXT:    retq
482;
483; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v16i1:
484; AVX512NOTDQ:       # %bb.0:
485; AVX512NOTDQ-NEXT:    kmovw 4(%rdi), %k1
486; AVX512NOTDQ-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
487; AVX512NOTDQ-NEXT:    vpbroadcastd %xmm2, %zmm2
488; AVX512NOTDQ-NEXT:    vptestmd %zmm2, %zmm2, %k1
489; AVX512NOTDQ-NEXT:    vmovaps %zmm0, %zmm1 {%k1}
490; AVX512NOTDQ-NEXT:    vmovaps %zmm1, (%rsi)
491; AVX512NOTDQ-NEXT:    vzeroupper
492; AVX512NOTDQ-NEXT:    retq
493    %d0 = load <64 x i1>, <64 x i1>* %a0
494    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<16 x i32><i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32>
495    %d2 = select <16 x i1> %d1, <16 x float> %a1, <16 x float> %a2
496    store <16 x float> %d2, <16 x float>* %a3
497    ret void
498}
; Broadcast bit 63 (the last element) of the loaded <64 x i1> mask into a
; <2 x i1> predicate used to select between %a1 and %a2; store to %a3.
499define void @load_v64i1_broadcast_63_v2i1(<64 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
500; AVX512-LABEL: load_v64i1_broadcast_63_v2i1:
501; AVX512:       # %bb.0:
502; AVX512-NEXT:    kmovq (%rdi), %k0
503; AVX512-NEXT:    kshiftrq $62, %k0, %k0
504; AVX512-NEXT:    vpmovm2q %k0, %xmm2
505; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
506; AVX512-NEXT:    vpmovq2m %xmm2, %k1
507; AVX512-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
508; AVX512-NEXT:    vmovapd %xmm1, (%rsi)
509; AVX512-NEXT:    retq
510;
511; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v2i1:
512; AVX512NOTDQ:       # %bb.0:
513; AVX512NOTDQ-NEXT:    kmovq (%rdi), %k0
514; AVX512NOTDQ-NEXT:    kshiftrq $62, %k0, %k1
515; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
516; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm2, %xmm2 {%k1} {z}
517; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
518; AVX512NOTDQ-NEXT:    vptestmq %xmm2, %xmm2, %k1
519; AVX512NOTDQ-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
520; AVX512NOTDQ-NEXT:    vmovapd %xmm1, (%rsi)
521; AVX512NOTDQ-NEXT:    retq
522    %d0 = load <64 x i1>, <64 x i1>* %a0
523    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<2 x i32><i32 63,i32 63>
524    %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
525    store <2 x double> %d2, <2 x double>* %a3
526    ret void
527}
; Broadcast bit 63 of the loaded <64 x i1> mask into a <4 x i1> predicate
; used to select between the two <4 x float> inputs; store the result to %a3.
528define void @load_v64i1_broadcast_63_v4i1(<64 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
529; AVX512-LABEL: load_v64i1_broadcast_63_v4i1:
530; AVX512:       # %bb.0:
531; AVX512-NEXT:    kmovq (%rdi), %k0
532; AVX512-NEXT:    kshiftrq $60, %k0, %k0
533; AVX512-NEXT:    vpmovm2d %k0, %xmm2
534; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
535; AVX512-NEXT:    vpmovd2m %xmm2, %k1
536; AVX512-NEXT:    vmovaps %xmm0, %xmm1 {%k1}
537; AVX512-NEXT:    vmovaps %xmm1, (%rsi)
538; AVX512-NEXT:    retq
539;
540; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v4i1:
541; AVX512NOTDQ:       # %bb.0:
542; AVX512NOTDQ-NEXT:    kmovq (%rdi), %k0
543; AVX512NOTDQ-NEXT:    kshiftrq $60, %k0, %k1
544; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
545; AVX512NOTDQ-NEXT:    vmovdqa32 %xmm2, %xmm2 {%k1} {z}
546; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
547; AVX512NOTDQ-NEXT:    vptestmd %xmm2, %xmm2, %k1
548; AVX512NOTDQ-NEXT:    vmovaps %xmm0, %xmm1 {%k1}
549; AVX512NOTDQ-NEXT:    vmovaps %xmm1, (%rsi)
550; AVX512NOTDQ-NEXT:    retq
551    %d0 = load <64 x i1>, <64 x i1>* %a0
552    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<4 x i32><i32 63,i32 63,i32 63,i32 63>
553    %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
554    store <4 x float> %d2, <4 x float>* %a3
555    ret void
556}
; Broadcast bit 63 of the loaded <64 x i1> mask into an <8 x i1> predicate.
; Four expected-output variants: with/without DQ instructions, crossed with
; the fast cross-lane vs per-lane-only shuffle tunings (cross-lane variants
; use vpermd; per-lane variants use vpshufd+vpermq).
557define void @load_v64i1_broadcast_63_v8i1(<64 x i1>* %a0,<8 x float> %a1,<8 x float> %a2,<8 x float>* %a3) {
558; AVX512-FAST-LABEL: load_v64i1_broadcast_63_v8i1:
559; AVX512-FAST:       # %bb.0:
560; AVX512-FAST-NEXT:    kmovb 7(%rdi), %k0
561; AVX512-FAST-NEXT:    vpmovm2d %k0, %ymm2
562; AVX512-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7]
563; AVX512-FAST-NEXT:    vpermd %ymm2, %ymm3, %ymm2
564; AVX512-FAST-NEXT:    vpmovd2m %ymm2, %k1
565; AVX512-FAST-NEXT:    vmovaps %ymm0, %ymm1 {%k1}
566; AVX512-FAST-NEXT:    vmovaps %ymm1, (%rsi)
567; AVX512-FAST-NEXT:    vzeroupper
568; AVX512-FAST-NEXT:    retq
569;
570; AVX512-FAST-PERLANE-LABEL: load_v64i1_broadcast_63_v8i1:
571; AVX512-FAST-PERLANE:       # %bb.0:
572; AVX512-FAST-PERLANE-NEXT:    kmovb 7(%rdi), %k0
573; AVX512-FAST-PERLANE-NEXT:    vpmovm2d %k0, %ymm2
574; AVX512-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7]
575; AVX512-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
576; AVX512-FAST-PERLANE-NEXT:    vpmovd2m %ymm2, %k1
577; AVX512-FAST-PERLANE-NEXT:    vmovaps %ymm0, %ymm1 {%k1}
578; AVX512-FAST-PERLANE-NEXT:    vmovaps %ymm1, (%rsi)
579; AVX512-FAST-PERLANE-NEXT:    vzeroupper
580; AVX512-FAST-PERLANE-NEXT:    retq
581;
582; AVX512NOTDQ-FAST-LABEL: load_v64i1_broadcast_63_v8i1:
583; AVX512NOTDQ-FAST:       # %bb.0:
584; AVX512NOTDQ-FAST-NEXT:    movzbl 7(%rdi), %eax
585; AVX512NOTDQ-FAST-NEXT:    kmovd %eax, %k1
586; AVX512NOTDQ-FAST-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
587; AVX512NOTDQ-FAST-NEXT:    vmovdqa32 %ymm2, %ymm2 {%k1} {z}
588; AVX512NOTDQ-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7]
589; AVX512NOTDQ-FAST-NEXT:    vpermd %ymm2, %ymm3, %ymm2
590; AVX512NOTDQ-FAST-NEXT:    vptestmd %ymm2, %ymm2, %k1
591; AVX512NOTDQ-FAST-NEXT:    vmovaps %ymm0, %ymm1 {%k1}
592; AVX512NOTDQ-FAST-NEXT:    vmovaps %ymm1, (%rsi)
593; AVX512NOTDQ-FAST-NEXT:    vzeroupper
594; AVX512NOTDQ-FAST-NEXT:    retq
595;
596; AVX512NOTDQ-FAST-PERLANE-LABEL: load_v64i1_broadcast_63_v8i1:
597; AVX512NOTDQ-FAST-PERLANE:       # %bb.0:
598; AVX512NOTDQ-FAST-PERLANE-NEXT:    movzbl 7(%rdi), %eax
599; AVX512NOTDQ-FAST-PERLANE-NEXT:    kmovd %eax, %k1
600; AVX512NOTDQ-FAST-PERLANE-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
601; AVX512NOTDQ-FAST-PERLANE-NEXT:    vmovdqa32 %ymm2, %ymm2 {%k1} {z}
602; AVX512NOTDQ-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7]
603; AVX512NOTDQ-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
604; AVX512NOTDQ-FAST-PERLANE-NEXT:    vptestmd %ymm2, %ymm2, %k1
605; AVX512NOTDQ-FAST-PERLANE-NEXT:    vmovaps %ymm0, %ymm1 {%k1}
606; AVX512NOTDQ-FAST-PERLANE-NEXT:    vmovaps %ymm1, (%rsi)
607; AVX512NOTDQ-FAST-PERLANE-NEXT:    vzeroupper
608; AVX512NOTDQ-FAST-PERLANE-NEXT:    retq
609    %d0 = load <64 x i1>, <64 x i1>* %a0
610    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<8 x i32><i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63>
611    %d2 = select <8 x i1> %d1, <8 x float> %a1, <8 x float> %a2
612    store <8 x float> %d2, <8 x float>* %a3
613    ret void
614}
; Broadcast bit 63 of the loaded <64 x i1> mask into a <16 x i1> predicate.
; Four expected-output variants: with/without DQ instructions, crossed with
; the fast cross-lane vs per-lane-only shuffle tunings (cross-lane variants
; use vpermd; per-lane variants use vpshufd+vshufi64x2 on zmm).
615define void @load_v64i1_broadcast_63_v16i1(<64 x i1>* %a0,<16 x float> %a1,<16 x float> %a2,<16 x float>* %a3) {
616; AVX512-FAST-LABEL: load_v64i1_broadcast_63_v16i1:
617; AVX512-FAST:       # %bb.0:
618; AVX512-FAST-NEXT:    kmovw 6(%rdi), %k0
619; AVX512-FAST-NEXT:    vpmovm2d %k0, %zmm2
620; AVX512-FAST-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
621; AVX512-FAST-NEXT:    vpermd %zmm2, %zmm3, %zmm2
622; AVX512-FAST-NEXT:    vpmovd2m %zmm2, %k1
623; AVX512-FAST-NEXT:    vmovaps %zmm0, %zmm1 {%k1}
624; AVX512-FAST-NEXT:    vmovaps %zmm1, (%rsi)
625; AVX512-FAST-NEXT:    vzeroupper
626; AVX512-FAST-NEXT:    retq
627;
628; AVX512-FAST-PERLANE-LABEL: load_v64i1_broadcast_63_v16i1:
629; AVX512-FAST-PERLANE:       # %bb.0:
630; AVX512-FAST-PERLANE-NEXT:    kmovw 6(%rdi), %k0
631; AVX512-FAST-PERLANE-NEXT:    vpmovm2d %k0, %zmm2
632; AVX512-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} zmm2 = zmm2[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15]
633; AVX512-FAST-PERLANE-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[6,7,6,7,6,7,6,7]
634; AVX512-FAST-PERLANE-NEXT:    vpmovd2m %zmm2, %k1
635; AVX512-FAST-PERLANE-NEXT:    vmovaps %zmm0, %zmm1 {%k1}
636; AVX512-FAST-PERLANE-NEXT:    vmovaps %zmm1, (%rsi)
637; AVX512-FAST-PERLANE-NEXT:    vzeroupper
638; AVX512-FAST-PERLANE-NEXT:    retq
639;
640; AVX512NOTDQ-FAST-LABEL: load_v64i1_broadcast_63_v16i1:
641; AVX512NOTDQ-FAST:       # %bb.0:
642; AVX512NOTDQ-FAST-NEXT:    kmovw 6(%rdi), %k1
643; AVX512NOTDQ-FAST-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
644; AVX512NOTDQ-FAST-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
645; AVX512NOTDQ-FAST-NEXT:    vpermd %zmm2, %zmm3, %zmm2
646; AVX512NOTDQ-FAST-NEXT:    vptestmd %zmm2, %zmm2, %k1
647; AVX512NOTDQ-FAST-NEXT:    vmovaps %zmm0, %zmm1 {%k1}
648; AVX512NOTDQ-FAST-NEXT:    vmovaps %zmm1, (%rsi)
649; AVX512NOTDQ-FAST-NEXT:    vzeroupper
650; AVX512NOTDQ-FAST-NEXT:    retq
651;
652; AVX512NOTDQ-FAST-PERLANE-LABEL: load_v64i1_broadcast_63_v16i1:
653; AVX512NOTDQ-FAST-PERLANE:       # %bb.0:
654; AVX512NOTDQ-FAST-PERLANE-NEXT:    kmovw 6(%rdi), %k1
655; AVX512NOTDQ-FAST-PERLANE-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
656; AVX512NOTDQ-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} zmm2 = zmm2[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15]
657; AVX512NOTDQ-FAST-PERLANE-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[6,7,6,7,6,7,6,7]
658; AVX512NOTDQ-FAST-PERLANE-NEXT:    vptestmd %zmm2, %zmm2, %k1
659; AVX512NOTDQ-FAST-PERLANE-NEXT:    vmovaps %zmm0, %zmm1 {%k1}
660; AVX512NOTDQ-FAST-PERLANE-NEXT:    vmovaps %zmm1, (%rsi)
661; AVX512NOTDQ-FAST-PERLANE-NEXT:    vzeroupper
662; AVX512NOTDQ-FAST-PERLANE-NEXT:    retq
663    %d0 = load <64 x i1>, <64 x i1>* %a0
664    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<16 x i32><i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63>
665    %d2 = select <16 x i1> %d1, <16 x float> %a1, <16 x float> %a2
666    store <16 x float> %d2, <16 x float>* %a3
667    ret void
668}
; Load a <2 x i1> mask, broadcast element 1 to <1 x i1> via shufflevector, store it.
; CHECK lines below are autogenerated and must match llc output exactly.
define void @load_v2i1_broadcast_1_v1i1_store(<2 x i1>* %a0,<1 x i1>* %a1) {
; AVX512-LABEL: load_v2i1_broadcast_1_v1i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovb (%rdi), %k0
; AVX512-NEXT:    kshiftrb $1, %k0, %k0
; AVX512-NEXT:    kshiftlb $7, %k0, %k0
; AVX512-NEXT:    kshiftrb $7, %k0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v2i1_broadcast_1_v1i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovw (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrw $1, %k0, %k0
; AVX512NOTDQ-NEXT:    kshiftlw $15, %k0, %k0
; AVX512NOTDQ-NEXT:    kshiftrw $15, %k0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <2 x i1>, <2 x i1>* %a0
    %d1 = shufflevector <2 x i1> %d0,<2 x i1> undef,<1 x i32><i32 1>
    store <1 x i1> %d1, <1 x i1>* %a1
    ret void
}
; Load a <3 x i1> mask (non-power-of-two width), broadcast element 1 to <1 x i1>, store it.
; Autogenerated CHECK lines; keep byte-identical to llc output.
define void @load_v3i1_broadcast_1_v1i1_store(<3 x i1>* %a0,<1 x i1>* %a1) {
; AVX512-LABEL: load_v3i1_broadcast_1_v1i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    movb (%rdi), %al
; AVX512-NEXT:    shrb %al
; AVX512-NEXT:    xorl %ecx, %ecx
; AVX512-NEXT:    testb $1, %al
; AVX512-NEXT:    movl $255, %eax
; AVX512-NEXT:    cmovel %ecx, %eax
; AVX512-NEXT:    kmovd %eax, %k0
; AVX512-NEXT:    kshiftrb $1, %k0, %k0
; AVX512-NEXT:    kshiftlb $7, %k0, %k0
; AVX512-NEXT:    kshiftrb $7, %k0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v3i1_broadcast_1_v1i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    movb (%rdi), %al
; AVX512NOTDQ-NEXT:    shrb %al
; AVX512NOTDQ-NEXT:    xorl %ecx, %ecx
; AVX512NOTDQ-NEXT:    testb $1, %al
; AVX512NOTDQ-NEXT:    movl $255, %eax
; AVX512NOTDQ-NEXT:    cmovel %ecx, %eax
; AVX512NOTDQ-NEXT:    kmovd %eax, %k0
; AVX512NOTDQ-NEXT:    kshiftrw $1, %k0, %k0
; AVX512NOTDQ-NEXT:    kshiftlw $15, %k0, %k0
; AVX512NOTDQ-NEXT:    kshiftrw $15, %k0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <3 x i1>, <3 x i1>* %a0
    %d1 = shufflevector <3 x i1> %d0,<3 x i1> undef,<1 x i32><i32 1>
    store <1 x i1> %d1, <1 x i1>* %a1
    ret void
}
; Load a <3 x i1> mask, broadcast element 2 (the last valid bit) to <1 x i1>, store it.
; Autogenerated CHECK lines; keep byte-identical to llc output.
define void @load_v3i1_broadcast_2_v1i1_store(<3 x i1>* %a0,<1 x i1>* %a1) {
; AVX512-LABEL: load_v3i1_broadcast_2_v1i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    xorl %eax, %eax
; AVX512-NEXT:    testb $4, (%rdi)
; AVX512-NEXT:    movl $255, %ecx
; AVX512-NEXT:    cmovel %eax, %ecx
; AVX512-NEXT:    kmovd %ecx, %k0
; AVX512-NEXT:    kshiftrb $2, %k0, %k0
; AVX512-NEXT:    kshiftlb $7, %k0, %k0
; AVX512-NEXT:    kshiftrb $7, %k0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v3i1_broadcast_2_v1i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    xorl %eax, %eax
; AVX512NOTDQ-NEXT:    testb $4, (%rdi)
; AVX512NOTDQ-NEXT:    movl $255, %ecx
; AVX512NOTDQ-NEXT:    cmovel %eax, %ecx
; AVX512NOTDQ-NEXT:    kmovd %ecx, %k0
; AVX512NOTDQ-NEXT:    kshiftrw $2, %k0, %k0
; AVX512NOTDQ-NEXT:    kshiftlw $15, %k0, %k0
; AVX512NOTDQ-NEXT:    kshiftrw $15, %k0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <3 x i1>, <3 x i1>* %a0
    %d1 = shufflevector <3 x i1> %d0,<3 x i1> undef,<1 x i32><i32 2>
    store <1 x i1> %d1, <1 x i1>* %a1
    ret void
}
; Load a <4 x i1> mask, broadcast element 2 to <1 x i1>, store it.
; Autogenerated CHECK lines; keep byte-identical to llc output.
define void @load_v4i1_broadcast_2_v1i1_store(<4 x i1>* %a0,<1 x i1>* %a1) {
; AVX512-LABEL: load_v4i1_broadcast_2_v1i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovb (%rdi), %k0
; AVX512-NEXT:    kshiftrb $2, %k0, %k0
; AVX512-NEXT:    kshiftlb $7, %k0, %k0
; AVX512-NEXT:    kshiftrb $7, %k0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v4i1_broadcast_2_v1i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovw (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrw $2, %k0, %k0
; AVX512NOTDQ-NEXT:    kshiftlw $15, %k0, %k0
; AVX512NOTDQ-NEXT:    kshiftrw $15, %k0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <4 x i1>, <4 x i1>* %a0
    %d1 = shufflevector <4 x i1> %d0,<4 x i1> undef,<1 x i32><i32 2>
    store <1 x i1> %d1, <1 x i1>* %a1
    ret void
}
; Load a <4 x i1> mask, broadcast element 3 (last element) to <1 x i1>, store it.
; Autogenerated CHECK lines; keep byte-identical to llc output.
define void @load_v4i1_broadcast_3_v1i1_store(<4 x i1>* %a0,<1 x i1>* %a1) {
; AVX512-LABEL: load_v4i1_broadcast_3_v1i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovb (%rdi), %k0
; AVX512-NEXT:    kshiftrb $3, %k0, %k0
; AVX512-NEXT:    kshiftlb $7, %k0, %k0
; AVX512-NEXT:    kshiftrb $7, %k0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v4i1_broadcast_3_v1i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovw (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrw $3, %k0, %k0
; AVX512NOTDQ-NEXT:    kshiftlw $15, %k0, %k0
; AVX512NOTDQ-NEXT:    kshiftrw $15, %k0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <4 x i1>, <4 x i1>* %a0
    %d1 = shufflevector <4 x i1> %d0,<4 x i1> undef,<1 x i32><i32 3>
    store <1 x i1> %d1, <1 x i1>* %a1
    ret void
}
; Load an <8 x i1> mask, broadcast element 4 to <1 x i1>, store it.
; Autogenerated CHECK lines; keep byte-identical to llc output.
define void @load_v8i1_broadcast_4_v1i1_store(<8 x i1>* %a0,<1 x i1>* %a1) {
; AVX512-LABEL: load_v8i1_broadcast_4_v1i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovb (%rdi), %k0
; AVX512-NEXT:    kshiftrb $4, %k0, %k0
; AVX512-NEXT:    kshiftlb $7, %k0, %k0
; AVX512-NEXT:    kshiftrb $7, %k0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v8i1_broadcast_4_v1i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovw (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrw $4, %k0, %k0
; AVX512NOTDQ-NEXT:    kshiftlw $15, %k0, %k0
; AVX512NOTDQ-NEXT:    kshiftrw $15, %k0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <8 x i1>, <8 x i1>* %a0
    %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<1 x i32><i32 4>
    store <1 x i1> %d1, <1 x i1>* %a1
    ret void
}
; Load an <8 x i1> mask, broadcast element 4 to <2 x i1>, store it.
; Autogenerated CHECK lines; keep byte-identical to llc output.
define void @load_v8i1_broadcast_4_v2i1_store(<8 x i1>* %a0,<2 x i1>* %a1) {
; AVX512-LABEL: load_v8i1_broadcast_4_v2i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovb (%rdi), %k0
; AVX512-NEXT:    kshiftrb $4, %k0, %k0
; AVX512-NEXT:    vpmovm2q %k0, %xmm0
; AVX512-NEXT:    vpbroadcastq %xmm0, %xmm0
; AVX512-NEXT:    vpmovq2m %xmm0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v8i1_broadcast_4_v2i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovw (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrw $4, %k0, %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpbroadcastq %xmm0, %xmm0
; AVX512NOTDQ-NEXT:    vptestmq %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <8 x i1>, <8 x i1>* %a0
    %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<2 x i32><i32 4,i32 4>
    store <2 x i1> %d1, <2 x i1>* %a1
    ret void
}
; Load an <8 x i1> mask, broadcast element 7 (last element) to <1 x i1>, store it.
; Autogenerated CHECK lines; keep byte-identical to llc output.
define void @load_v8i1_broadcast_7_v1i1_store(<8 x i1>* %a0,<1 x i1>* %a1) {
; AVX512-LABEL: load_v8i1_broadcast_7_v1i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovb (%rdi), %k0
; AVX512-NEXT:    kshiftrb $7, %k0, %k0
; AVX512-NEXT:    kshiftlb $7, %k0, %k0
; AVX512-NEXT:    kshiftrb $7, %k0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v8i1_broadcast_7_v1i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovw (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrw $7, %k0, %k0
; AVX512NOTDQ-NEXT:    kshiftlw $15, %k0, %k0
; AVX512NOTDQ-NEXT:    kshiftrw $15, %k0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <8 x i1>, <8 x i1>* %a0
    %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<1 x i32><i32 7>
    store <1 x i1> %d1, <1 x i1>* %a1
    ret void
}
; Load an <8 x i1> mask, broadcast element 7 to <2 x i1>, store it.
; Autogenerated CHECK lines; keep byte-identical to llc output.
define void @load_v8i1_broadcast_7_v2i1_store(<8 x i1>* %a0,<2 x i1>* %a1) {
; AVX512-LABEL: load_v8i1_broadcast_7_v2i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovb (%rdi), %k0
; AVX512-NEXT:    kshiftrb $6, %k0, %k0
; AVX512-NEXT:    vpmovm2q %k0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpmovq2m %xmm0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v8i1_broadcast_7_v2i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovw (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrw $6, %k0, %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX512NOTDQ-NEXT:    vptestmq %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <8 x i1>, <8 x i1>* %a0
    %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<2 x i32><i32 7,i32 7>
    store <2 x i1> %d1, <2 x i1>* %a1
    ret void
}
; Load a <16 x i1> mask, broadcast element 8 to <1 x i1>, store it.
; Autogenerated CHECK lines; keep byte-identical to llc output.
define void @load_v16i1_broadcast_8_v1i1_store(<16 x i1>* %a0,<1 x i1>* %a1) {
; AVX512-LABEL: load_v16i1_broadcast_8_v1i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovw (%rdi), %k0
; AVX512-NEXT:    kshiftrw $8, %k0, %k0
; AVX512-NEXT:    kshiftlb $7, %k0, %k0
; AVX512-NEXT:    kshiftrb $7, %k0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v16i1_broadcast_8_v1i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovw (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrw $8, %k0, %k0
; AVX512NOTDQ-NEXT:    kshiftlw $15, %k0, %k0
; AVX512NOTDQ-NEXT:    kshiftrw $15, %k0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <16 x i1>, <16 x i1>* %a0
    %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<1 x i32><i32 8>
    store <1 x i1> %d1, <1 x i1>* %a1
    ret void
}
; Load a <16 x i1> mask, broadcast element 8 to <2 x i1>, store it.
; Autogenerated CHECK lines; keep byte-identical to llc output.
define void @load_v16i1_broadcast_8_v2i1_store(<16 x i1>* %a0,<2 x i1>* %a1) {
; AVX512-LABEL: load_v16i1_broadcast_8_v2i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovw (%rdi), %k0
; AVX512-NEXT:    kshiftrw $8, %k0, %k0
; AVX512-NEXT:    vpmovm2q %k0, %xmm0
; AVX512-NEXT:    vpbroadcastq %xmm0, %xmm0
; AVX512-NEXT:    vpmovq2m %xmm0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v16i1_broadcast_8_v2i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovw (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrw $8, %k0, %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpbroadcastq %xmm0, %xmm0
; AVX512NOTDQ-NEXT:    vptestmq %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <16 x i1>, <16 x i1>* %a0
    %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<2 x i32><i32 8,i32 8>
    store <2 x i1> %d1, <2 x i1>* %a1
    ret void
}
; Load a <16 x i1> mask, broadcast element 8 to <4 x i1>, store it.
; Autogenerated CHECK lines; keep byte-identical to llc output.
define void @load_v16i1_broadcast_8_v4i1_store(<16 x i1>* %a0,<4 x i1>* %a1) {
; AVX512-LABEL: load_v16i1_broadcast_8_v4i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovw (%rdi), %k0
; AVX512-NEXT:    kshiftrw $8, %k0, %k0
; AVX512-NEXT:    vpmovm2d %k0, %xmm0
; AVX512-NEXT:    vpbroadcastd %xmm0, %xmm0
; AVX512-NEXT:    vpmovd2m %xmm0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v16i1_broadcast_8_v4i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovw (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrw $8, %k0, %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpbroadcastd %xmm0, %xmm0
; AVX512NOTDQ-NEXT:    vptestmd %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <16 x i1>, <16 x i1>* %a0
    %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<4 x i32><i32 8,i32 8,i32 8,i32 8>
    store <4 x i1> %d1, <4 x i1>* %a1
    ret void
}
; Load a <16 x i1> mask, broadcast element 15 (last element) to <1 x i1>, store it.
; Autogenerated CHECK lines; keep byte-identical to llc output.
define void @load_v16i1_broadcast_15_v1i1_store(<16 x i1>* %a0,<1 x i1>* %a1) {
; AVX512-LABEL: load_v16i1_broadcast_15_v1i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovw (%rdi), %k0
; AVX512-NEXT:    kshiftrw $15, %k0, %k0
; AVX512-NEXT:    kshiftlb $7, %k0, %k0
; AVX512-NEXT:    kshiftrb $7, %k0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v16i1_broadcast_15_v1i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovw (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrw $15, %k0, %k0
; AVX512NOTDQ-NEXT:    kshiftlw $15, %k0, %k0
; AVX512NOTDQ-NEXT:    kshiftrw $15, %k0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <16 x i1>, <16 x i1>* %a0
    %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<1 x i32><i32 15>
    store <1 x i1> %d1, <1 x i1>* %a1
    ret void
}
; Load a <16 x i1> mask, broadcast element 15 to <2 x i1>, store it.
; Autogenerated CHECK lines; keep byte-identical to llc output.
define void @load_v16i1_broadcast_15_v2i1_store(<16 x i1>* %a0,<2 x i1>* %a1) {
; AVX512-LABEL: load_v16i1_broadcast_15_v2i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovw (%rdi), %k0
; AVX512-NEXT:    kshiftrw $14, %k0, %k0
; AVX512-NEXT:    vpmovm2q %k0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpmovq2m %xmm0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v16i1_broadcast_15_v2i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovw (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrw $14, %k0, %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX512NOTDQ-NEXT:    vptestmq %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <16 x i1>, <16 x i1>* %a0
    %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<2 x i32><i32 15,i32 15>
    store <2 x i1> %d1, <2 x i1>* %a1
    ret void
}
; Load a <16 x i1> mask, broadcast element 15 to <4 x i1>, store it.
; Autogenerated CHECK lines; keep byte-identical to llc output.
define void @load_v16i1_broadcast_15_v4i1_store(<16 x i1>* %a0,<4 x i1>* %a1) {
; AVX512-LABEL: load_v16i1_broadcast_15_v4i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovw (%rdi), %k0
; AVX512-NEXT:    kshiftrw $12, %k0, %k0
; AVX512-NEXT:    vpmovm2d %k0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-NEXT:    vpmovd2m %xmm0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v16i1_broadcast_15_v4i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovw (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrw $12, %k0, %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512NOTDQ-NEXT:    vptestmd %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <16 x i1>, <16 x i1>* %a0
    %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<4 x i32><i32 15,i32 15,i32 15,i32 15>
    store <4 x i1> %d1, <4 x i1>* %a1
    ret void
}
; Load a <32 x i1> mask, broadcast element 16 to <1 x i1>, store it.
; Autogenerated CHECK lines; keep byte-identical to llc output.
define void @load_v32i1_broadcast_16_v1i1_store(<32 x i1>* %a0,<1 x i1>* %a1) {
; AVX512-LABEL: load_v32i1_broadcast_16_v1i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovd (%rdi), %k0
; AVX512-NEXT:    kshiftrd $16, %k0, %k0
; AVX512-NEXT:    kshiftlb $7, %k0, %k0
; AVX512-NEXT:    kshiftrb $7, %k0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v1i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovd (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrd $16, %k0, %k0
; AVX512NOTDQ-NEXT:    kshiftlw $15, %k0, %k0
; AVX512NOTDQ-NEXT:    kshiftrw $15, %k0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <32 x i1>, <32 x i1>* %a0
    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<1 x i32><i32 16>
    store <1 x i1> %d1, <1 x i1>* %a1
    ret void
}
; Load a <32 x i1> mask, broadcast element 16 to <2 x i1>, store it.
; Autogenerated CHECK lines; keep byte-identical to llc output.
define void @load_v32i1_broadcast_16_v2i1_store(<32 x i1>* %a0,<2 x i1>* %a1) {
; AVX512-LABEL: load_v32i1_broadcast_16_v2i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovd (%rdi), %k0
; AVX512-NEXT:    kshiftrd $16, %k0, %k0
; AVX512-NEXT:    vpmovm2q %k0, %xmm0
; AVX512-NEXT:    vpbroadcastq %xmm0, %xmm0
; AVX512-NEXT:    vpmovq2m %xmm0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v2i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovd (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrd $16, %k0, %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpbroadcastq %xmm0, %xmm0
; AVX512NOTDQ-NEXT:    vptestmq %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <32 x i1>, <32 x i1>* %a0
    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<2 x i32><i32 16,i32 16>
    store <2 x i1> %d1, <2 x i1>* %a1
    ret void
}
; Load a <32 x i1> mask, broadcast element 16 to <4 x i1>, store it.
; Autogenerated CHECK lines; keep byte-identical to llc output.
define void @load_v32i1_broadcast_16_v4i1_store(<32 x i1>* %a0,<4 x i1>* %a1) {
; AVX512-LABEL: load_v32i1_broadcast_16_v4i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovd (%rdi), %k0
; AVX512-NEXT:    kshiftrd $16, %k0, %k0
; AVX512-NEXT:    vpmovm2d %k0, %xmm0
; AVX512-NEXT:    vpbroadcastd %xmm0, %xmm0
; AVX512-NEXT:    vpmovd2m %xmm0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v4i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovd (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrd $16, %k0, %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpbroadcastd %xmm0, %xmm0
; AVX512NOTDQ-NEXT:    vptestmd %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <32 x i1>, <32 x i1>* %a0
    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<4 x i32><i32 16,i32 16,i32 16,i32 16>
    store <4 x i1> %d1, <4 x i1>* %a1
    ret void
}
; Load a <32 x i1> mask, broadcast element 16 to <8 x i1>, store it.
; Element 16 is byte-aligned, so codegen loads the mask directly from offset 2.
; Autogenerated CHECK lines; keep byte-identical to llc output.
define void @load_v32i1_broadcast_16_v8i1_store(<32 x i1>* %a0,<8 x i1>* %a1) {
; AVX512-LABEL: load_v32i1_broadcast_16_v8i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovb 2(%rdi), %k0
; AVX512-NEXT:    vpmovm2d %k0, %ymm0
; AVX512-NEXT:    vpbroadcastd %xmm0, %ymm0
; AVX512-NEXT:    vpmovd2m %ymm0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v8i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovw 2(%rdi), %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512NOTDQ-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpbroadcastd %xmm0, %ymm0
; AVX512NOTDQ-NEXT:    vptestmd %ymm0, %ymm0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    vzeroupper
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <32 x i1>, <32 x i1>* %a0
    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<8 x i32><i32 16,i32 16,i32 16,i32 16,i32 16,i32 16,i32 16,i32 16>
    store <8 x i1> %d1, <8 x i1>* %a1
    ret void
}
; Load a <32 x i1> mask, broadcast element 31 (last element) to <1 x i1>, store it.
; Autogenerated CHECK lines; keep byte-identical to llc output.
define void @load_v32i1_broadcast_31_v1i1_store(<32 x i1>* %a0,<1 x i1>* %a1) {
; AVX512-LABEL: load_v32i1_broadcast_31_v1i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovd (%rdi), %k0
; AVX512-NEXT:    kshiftrd $31, %k0, %k0
; AVX512-NEXT:    kshiftlb $7, %k0, %k0
; AVX512-NEXT:    kshiftrb $7, %k0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v1i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovd (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrd $31, %k0, %k0
; AVX512NOTDQ-NEXT:    kshiftlw $15, %k0, %k0
; AVX512NOTDQ-NEXT:    kshiftrw $15, %k0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <32 x i1>, <32 x i1>* %a0
    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<1 x i32><i32 31>
    store <1 x i1> %d1, <1 x i1>* %a1
    ret void
}
; Load a <32 x i1> mask, broadcast element 31 to <2 x i1>, store it.
; Autogenerated CHECK lines; keep byte-identical to llc output.
define void @load_v32i1_broadcast_31_v2i1_store(<32 x i1>* %a0,<2 x i1>* %a1) {
; AVX512-LABEL: load_v32i1_broadcast_31_v2i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovd (%rdi), %k0
; AVX512-NEXT:    kshiftrd $30, %k0, %k0
; AVX512-NEXT:    vpmovm2q %k0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpmovq2m %xmm0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v2i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovd (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrd $30, %k0, %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX512NOTDQ-NEXT:    vptestmq %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <32 x i1>, <32 x i1>* %a0
    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<2 x i32><i32 31,i32 31>
    store <2 x i1> %d1, <2 x i1>* %a1
    ret void
}
; Load a <32 x i1> mask, broadcast element 31 to <4 x i1>, store it.
; Autogenerated CHECK lines; keep byte-identical to llc output.
define void @load_v32i1_broadcast_31_v4i1_store(<32 x i1>* %a0,<4 x i1>* %a1) {
; AVX512-LABEL: load_v32i1_broadcast_31_v4i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovd (%rdi), %k0
; AVX512-NEXT:    kshiftrd $28, %k0, %k0
; AVX512-NEXT:    vpmovm2d %k0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-NEXT:    vpmovd2m %xmm0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v4i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovd (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrd $28, %k0, %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512NOTDQ-NEXT:    vptestmd %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <32 x i1>, <32 x i1>* %a0
    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<4 x i32><i32 31,i32 31,i32 31,i32 31>
    store <4 x i1> %d1, <4 x i1>* %a1
    ret void
}
; Load a <32 x i1> mask, broadcast element 31 to <8 x i1>, store it.
; This case has separate FAST (cross-lane vpermd) and FAST-PERLANE
; (vpshufd + vpermq) check prefixes for both DQ and non-DQ targets.
; Autogenerated CHECK lines; keep byte-identical to llc output.
define void @load_v32i1_broadcast_31_v8i1_store(<32 x i1>* %a0,<8 x i1>* %a1) {
; AVX512-FAST-LABEL: load_v32i1_broadcast_31_v8i1_store:
; AVX512-FAST:       # %bb.0:
; AVX512-FAST-NEXT:    kmovb 3(%rdi), %k0
; AVX512-FAST-NEXT:    vpmovm2d %k0, %ymm0
; AVX512-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7]
; AVX512-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
; AVX512-FAST-NEXT:    vpmovd2m %ymm0, %k0
; AVX512-FAST-NEXT:    kmovb %k0, (%rsi)
; AVX512-FAST-NEXT:    vzeroupper
; AVX512-FAST-NEXT:    retq
;
; AVX512-FAST-PERLANE-LABEL: load_v32i1_broadcast_31_v8i1_store:
; AVX512-FAST-PERLANE:       # %bb.0:
; AVX512-FAST-PERLANE-NEXT:    kmovb 3(%rdi), %k0
; AVX512-FAST-PERLANE-NEXT:    vpmovm2d %k0, %ymm0
; AVX512-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,3,3,3,7,7,7,7]
; AVX512-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
; AVX512-FAST-PERLANE-NEXT:    vpmovd2m %ymm0, %k0
; AVX512-FAST-PERLANE-NEXT:    kmovb %k0, (%rsi)
; AVX512-FAST-PERLANE-NEXT:    vzeroupper
; AVX512-FAST-PERLANE-NEXT:    retq
;
; AVX512NOTDQ-FAST-LABEL: load_v32i1_broadcast_31_v8i1_store:
; AVX512NOTDQ-FAST:       # %bb.0:
; AVX512NOTDQ-FAST-NEXT:    movzbl 3(%rdi), %eax
; AVX512NOTDQ-FAST-NEXT:    kmovd %eax, %k1
; AVX512NOTDQ-FAST-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512NOTDQ-FAST-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512NOTDQ-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7]
; AVX512NOTDQ-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
; AVX512NOTDQ-FAST-NEXT:    vptestmd %ymm0, %ymm0, %k0
; AVX512NOTDQ-FAST-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-FAST-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-FAST-NEXT:    vzeroupper
; AVX512NOTDQ-FAST-NEXT:    retq
;
; AVX512NOTDQ-FAST-PERLANE-LABEL: load_v32i1_broadcast_31_v8i1_store:
; AVX512NOTDQ-FAST-PERLANE:       # %bb.0:
; AVX512NOTDQ-FAST-PERLANE-NEXT:    movzbl 3(%rdi), %eax
; AVX512NOTDQ-FAST-PERLANE-NEXT:    kmovd %eax, %k1
; AVX512NOTDQ-FAST-PERLANE-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512NOTDQ-FAST-PERLANE-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512NOTDQ-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,3,3,3,7,7,7,7]
; AVX512NOTDQ-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
; AVX512NOTDQ-FAST-PERLANE-NEXT:    vptestmd %ymm0, %ymm0, %k0
; AVX512NOTDQ-FAST-PERLANE-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-FAST-PERLANE-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-FAST-PERLANE-NEXT:    vzeroupper
; AVX512NOTDQ-FAST-PERLANE-NEXT:    retq
    %d0 = load <32 x i1>, <32 x i1>* %a0
    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<8 x i32><i32 31,i32 31,i32 31,i32 31,i32 31,i32 31,i32 31,i32 31>
    store <8 x i1> %d1, <8 x i1>* %a1
    ret void
}
; Load a <64 x i1> mask, broadcast element 32 to <1 x i1>, store it.
; Autogenerated CHECK lines; keep byte-identical to llc output.
define void @load_v64i1_broadcast_32_v1i1_store(<64 x i1>* %a0,<1 x i1>* %a1) {
; AVX512-LABEL: load_v64i1_broadcast_32_v1i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovq (%rdi), %k0
; AVX512-NEXT:    kshiftrq $32, %k0, %k0
; AVX512-NEXT:    kshiftlb $7, %k0, %k0
; AVX512-NEXT:    kshiftrb $7, %k0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v1i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovq (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrq $32, %k0, %k0
; AVX512NOTDQ-NEXT:    kshiftlw $15, %k0, %k0
; AVX512NOTDQ-NEXT:    kshiftrw $15, %k0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <64 x i1>, <64 x i1>* %a0
    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<1 x i32><i32 32>
    store <1 x i1> %d1, <1 x i1>* %a1
    ret void
}
; Load a <64 x i1> mask, broadcast element 32 to <2 x i1>, store it.
; Autogenerated CHECK lines; keep byte-identical to llc output.
define void @load_v64i1_broadcast_32_v2i1_store(<64 x i1>* %a0,<2 x i1>* %a1) {
; AVX512-LABEL: load_v64i1_broadcast_32_v2i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovq (%rdi), %k0
; AVX512-NEXT:    kshiftrq $32, %k0, %k0
; AVX512-NEXT:    vpmovm2q %k0, %xmm0
; AVX512-NEXT:    vpbroadcastq %xmm0, %xmm0
; AVX512-NEXT:    vpmovq2m %xmm0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v2i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovq (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrq $32, %k0, %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpbroadcastq %xmm0, %xmm0
; AVX512NOTDQ-NEXT:    vptestmq %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <64 x i1>, <64 x i1>* %a0
    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<2 x i32><i32 32,i32 32>
    store <2 x i1> %d1, <2 x i1>* %a1
    ret void
}
; Load a <64 x i1> mask, broadcast element 32 to <4 x i1>, store it.
; Autogenerated CHECK lines; keep byte-identical to llc output.
define void @load_v64i1_broadcast_32_v4i1_store(<64 x i1>* %a0,<4 x i1>* %a1) {
; AVX512-LABEL: load_v64i1_broadcast_32_v4i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovq (%rdi), %k0
; AVX512-NEXT:    kshiftrq $32, %k0, %k0
; AVX512-NEXT:    vpmovm2d %k0, %xmm0
; AVX512-NEXT:    vpbroadcastd %xmm0, %xmm0
; AVX512-NEXT:    vpmovd2m %xmm0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v4i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovq (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrq $32, %k0, %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpbroadcastd %xmm0, %xmm0
; AVX512NOTDQ-NEXT:    vptestmd %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <64 x i1>, <64 x i1>* %a0
    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<4 x i32><i32 32,i32 32,i32 32,i32 32>
    store <4 x i1> %d1, <4 x i1>* %a1
    ret void
}
; Load a <64 x i1> mask, broadcast element 32 to <8 x i1>, store it.
; Element 32 is byte-aligned, so codegen loads the mask directly from offset 4.
; Autogenerated CHECK lines; keep byte-identical to llc output.
define void @load_v64i1_broadcast_32_v8i1_store(<64 x i1>* %a0,<8 x i1>* %a1) {
; AVX512-LABEL: load_v64i1_broadcast_32_v8i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovb 4(%rdi), %k0
; AVX512-NEXT:    vpmovm2d %k0, %ymm0
; AVX512-NEXT:    vpbroadcastd %xmm0, %ymm0
; AVX512-NEXT:    vpmovd2m %ymm0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v8i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovw 4(%rdi), %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512NOTDQ-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpbroadcastd %xmm0, %ymm0
; AVX512NOTDQ-NEXT:    vptestmd %ymm0, %ymm0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    vzeroupper
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <64 x i1>, <64 x i1>* %a0
    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<8 x i32><i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32>
    store <8 x i1> %d1, <8 x i1>* %a1
    ret void
}
; Broadcast element 32 of a loaded <64 x i1> mask into a <16 x i1> store.
; The mask load is narrowed to a 16-bit load at offset 4 (bits 32-47); the
; NOTDQ path uses a zero-masked vpternlogd (all-ones) instead of vpmovm2d.
; CHECK lines are autogenerated.
define void @load_v64i1_broadcast_32_v16i1_store(<64 x i1>* %a0,<16 x i1>* %a1) {
; AVX512-LABEL: load_v64i1_broadcast_32_v16i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovw 4(%rdi), %k0
; AVX512-NEXT:    vpmovm2d %k0, %zmm0
; AVX512-NEXT:    vpbroadcastd %xmm0, %zmm0
; AVX512-NEXT:    vpmovd2m %zmm0, %k0
; AVX512-NEXT:    kmovw %k0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v16i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovw 4(%rdi), %k1
; AVX512NOTDQ-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpbroadcastd %xmm0, %zmm0
; AVX512NOTDQ-NEXT:    vptestmd %zmm0, %zmm0, %k0
; AVX512NOTDQ-NEXT:    kmovw %k0, (%rsi)
; AVX512NOTDQ-NEXT:    vzeroupper
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <64 x i1>, <64 x i1>* %a0
    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<16 x i32><i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32>
    store <16 x i1> %d1, <16 x i1>* %a1
    ret void
}
; Extract the top bit (element 63) of a loaded <64 x i1> mask and store it as
; a <1 x i1>. Codegen shifts the bit down with kshiftrq $63 and then isolates
; the low mask bit via a shift-left/shift-right pair before storing one byte.
; CHECK lines are autogenerated.
define void @load_v64i1_broadcast_63_v1i1_store(<64 x i1>* %a0,<1 x i1>* %a1) {
; AVX512-LABEL: load_v64i1_broadcast_63_v1i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovq (%rdi), %k0
; AVX512-NEXT:    kshiftrq $63, %k0, %k0
; AVX512-NEXT:    kshiftlb $7, %k0, %k0
; AVX512-NEXT:    kshiftrb $7, %k0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v1i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovq (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrq $63, %k0, %k0
; AVX512NOTDQ-NEXT:    kshiftlw $15, %k0, %k0
; AVX512NOTDQ-NEXT:    kshiftrw $15, %k0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <64 x i1>, <64 x i1>* %a0
    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<1 x i32><i32 63>
    store <1 x i1> %d1, <1 x i1>* %a1
    ret void
}
; Broadcast element 63 of a loaded <64 x i1> mask into both lanes of a
; <2 x i1> and store it. Codegen shifts elements 62-63 down, widens them to
; qword lanes, and duplicates the high lane with a vpshufd. CHECK lines are
; autogenerated.
define void @load_v64i1_broadcast_63_v2i1_store(<64 x i1>* %a0,<2 x i1>* %a1) {
; AVX512-LABEL: load_v64i1_broadcast_63_v2i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovq (%rdi), %k0
; AVX512-NEXT:    kshiftrq $62, %k0, %k0
; AVX512-NEXT:    vpmovm2q %k0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpmovq2m %xmm0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v2i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovq (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrq $62, %k0, %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX512NOTDQ-NEXT:    vptestmq %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <64 x i1>, <64 x i1>* %a0
    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<2 x i32><i32 63,i32 63>
    store <2 x i1> %d1, <2 x i1>* %a1
    ret void
}
; Broadcast element 63 of a loaded <64 x i1> mask into all four lanes of a
; <4 x i1> and store it. Codegen shifts elements 60-63 down, widens them to
; dword lanes, and splats the top lane with vpshufd [3,3,3,3]. CHECK lines
; are autogenerated.
define void @load_v64i1_broadcast_63_v4i1_store(<64 x i1>* %a0,<4 x i1>* %a1) {
; AVX512-LABEL: load_v64i1_broadcast_63_v4i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovq (%rdi), %k0
; AVX512-NEXT:    kshiftrq $60, %k0, %k0
; AVX512-NEXT:    vpmovm2d %k0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-NEXT:    vpmovd2m %xmm0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v4i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovq (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrq $60, %k0, %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512NOTDQ-NEXT:    vptestmd %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <64 x i1>, <64 x i1>* %a0
    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<4 x i32><i32 63,i32 63,i32 63,i32 63>
    store <4 x i1> %d1, <4 x i1>* %a1
    ret void
}
; Broadcast element 63 of a loaded <64 x i1> mask into a <8 x i1> store.
; The mask load is narrowed to the top byte at offset 7 (bits 56-63). This
; test splits by shuffle-speed attributes: the FAST (cross-lane) variants use
; vpermd with a splat-index vector, while the FAST-PERLANE variants use an
; in-lane vpshufd followed by a cross-lane vpermq. CHECK lines are
; autogenerated.
define void @load_v64i1_broadcast_63_v8i1_store(<64 x i1>* %a0,<8 x i1>* %a1) {
; AVX512-FAST-LABEL: load_v64i1_broadcast_63_v8i1_store:
; AVX512-FAST:       # %bb.0:
; AVX512-FAST-NEXT:    kmovb 7(%rdi), %k0
; AVX512-FAST-NEXT:    vpmovm2d %k0, %ymm0
; AVX512-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7]
; AVX512-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
; AVX512-FAST-NEXT:    vpmovd2m %ymm0, %k0
; AVX512-FAST-NEXT:    kmovb %k0, (%rsi)
; AVX512-FAST-NEXT:    vzeroupper
; AVX512-FAST-NEXT:    retq
;
; AVX512-FAST-PERLANE-LABEL: load_v64i1_broadcast_63_v8i1_store:
; AVX512-FAST-PERLANE:       # %bb.0:
; AVX512-FAST-PERLANE-NEXT:    kmovb 7(%rdi), %k0
; AVX512-FAST-PERLANE-NEXT:    vpmovm2d %k0, %ymm0
; AVX512-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,3,3,3,7,7,7,7]
; AVX512-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
; AVX512-FAST-PERLANE-NEXT:    vpmovd2m %ymm0, %k0
; AVX512-FAST-PERLANE-NEXT:    kmovb %k0, (%rsi)
; AVX512-FAST-PERLANE-NEXT:    vzeroupper
; AVX512-FAST-PERLANE-NEXT:    retq
;
; AVX512NOTDQ-FAST-LABEL: load_v64i1_broadcast_63_v8i1_store:
; AVX512NOTDQ-FAST:       # %bb.0:
; AVX512NOTDQ-FAST-NEXT:    movzbl 7(%rdi), %eax
; AVX512NOTDQ-FAST-NEXT:    kmovd %eax, %k1
; AVX512NOTDQ-FAST-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512NOTDQ-FAST-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512NOTDQ-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7]
; AVX512NOTDQ-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
; AVX512NOTDQ-FAST-NEXT:    vptestmd %ymm0, %ymm0, %k0
; AVX512NOTDQ-FAST-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-FAST-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-FAST-NEXT:    vzeroupper
; AVX512NOTDQ-FAST-NEXT:    retq
;
; AVX512NOTDQ-FAST-PERLANE-LABEL: load_v64i1_broadcast_63_v8i1_store:
; AVX512NOTDQ-FAST-PERLANE:       # %bb.0:
; AVX512NOTDQ-FAST-PERLANE-NEXT:    movzbl 7(%rdi), %eax
; AVX512NOTDQ-FAST-PERLANE-NEXT:    kmovd %eax, %k1
; AVX512NOTDQ-FAST-PERLANE-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512NOTDQ-FAST-PERLANE-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512NOTDQ-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,3,3,3,7,7,7,7]
; AVX512NOTDQ-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
; AVX512NOTDQ-FAST-PERLANE-NEXT:    vptestmd %ymm0, %ymm0, %k0
; AVX512NOTDQ-FAST-PERLANE-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-FAST-PERLANE-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-FAST-PERLANE-NEXT:    vzeroupper
; AVX512NOTDQ-FAST-PERLANE-NEXT:    retq
    %d0 = load <64 x i1>, <64 x i1>* %a0
    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<8 x i32><i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63>
    store <8 x i1> %d1, <8 x i1>* %a1
    ret void
}
; Broadcast element 63 of a loaded <64 x i1> mask into a <16 x i1> store.
; The mask load is narrowed to a 16-bit load at offset 6 (bits 48-63). FAST
; (cross-lane) variants splat lane 15 of a zmm with vpermd; FAST-PERLANE
; variants combine an in-lane vpshufd with a cross-lane vshufi64x2. CHECK
; lines are autogenerated.
define void @load_v64i1_broadcast_63_v16i1_store(<64 x i1>* %a0,<16 x i1>* %a1) {
; AVX512-FAST-LABEL: load_v64i1_broadcast_63_v16i1_store:
; AVX512-FAST:       # %bb.0:
; AVX512-FAST-NEXT:    kmovw 6(%rdi), %k0
; AVX512-FAST-NEXT:    vpmovm2d %k0, %zmm0
; AVX512-FAST-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512-FAST-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512-FAST-NEXT:    vpmovd2m %zmm0, %k0
; AVX512-FAST-NEXT:    kmovw %k0, (%rsi)
; AVX512-FAST-NEXT:    vzeroupper
; AVX512-FAST-NEXT:    retq
;
; AVX512-FAST-PERLANE-LABEL: load_v64i1_broadcast_63_v16i1_store:
; AVX512-FAST-PERLANE:       # %bb.0:
; AVX512-FAST-PERLANE-NEXT:    kmovw 6(%rdi), %k0
; AVX512-FAST-PERLANE-NEXT:    vpmovm2d %k0, %zmm0
; AVX512-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} zmm0 = zmm0[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15]
; AVX512-FAST-PERLANE-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[6,7,6,7,6,7,6,7]
; AVX512-FAST-PERLANE-NEXT:    vpmovd2m %zmm0, %k0
; AVX512-FAST-PERLANE-NEXT:    kmovw %k0, (%rsi)
; AVX512-FAST-PERLANE-NEXT:    vzeroupper
; AVX512-FAST-PERLANE-NEXT:    retq
;
; AVX512NOTDQ-FAST-LABEL: load_v64i1_broadcast_63_v16i1_store:
; AVX512NOTDQ-FAST:       # %bb.0:
; AVX512NOTDQ-FAST-NEXT:    kmovw 6(%rdi), %k1
; AVX512NOTDQ-FAST-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512NOTDQ-FAST-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512NOTDQ-FAST-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512NOTDQ-FAST-NEXT:    vptestmd %zmm0, %zmm0, %k0
; AVX512NOTDQ-FAST-NEXT:    kmovw %k0, (%rsi)
; AVX512NOTDQ-FAST-NEXT:    vzeroupper
; AVX512NOTDQ-FAST-NEXT:    retq
;
; AVX512NOTDQ-FAST-PERLANE-LABEL: load_v64i1_broadcast_63_v16i1_store:
; AVX512NOTDQ-FAST-PERLANE:       # %bb.0:
; AVX512NOTDQ-FAST-PERLANE-NEXT:    kmovw 6(%rdi), %k1
; AVX512NOTDQ-FAST-PERLANE-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512NOTDQ-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} zmm0 = zmm0[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15]
; AVX512NOTDQ-FAST-PERLANE-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[6,7,6,7,6,7,6,7]
; AVX512NOTDQ-FAST-PERLANE-NEXT:    vptestmd %zmm0, %zmm0, %k0
; AVX512NOTDQ-FAST-PERLANE-NEXT:    kmovw %k0, (%rsi)
; AVX512NOTDQ-FAST-PERLANE-NEXT:    vzeroupper
; AVX512NOTDQ-FAST-PERLANE-NEXT:    retq
    %d0 = load <64 x i1>, <64 x i1>* %a0
    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<16 x i32><i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63>
    store <16 x i1> %d1, <16 x i1>* %a1
    ret void
}
1617
1618