1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+prefer-256-bit | FileCheck %s --check-prefix=AVX256
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,-prefer-256-bit | FileCheck %s --check-prefix=AVX512VL
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+prefer-256-bit | FileCheck %s --check-prefix=AVX512F
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,-prefer-256-bit | FileCheck %s --check-prefix=AVX512F
6
; Sign-extend an <8 x i1> compare result to <8 x i16>.
; The IR loads <8 x i32> from %p, compares every lane against zero and
; sign-extends the resulting mask, so each output lane is all-ones or zero.
; Expected code per configuration (see the assertion groups below):
;   * both avx512vl runs: vptestnmd builds a k-mask, an all-ones ymm is
;     zero-masked by it, and vpmovdw narrows the dwords to words.
;   * both avx512f-only runs: the compare is done as a ymm vpcmpeqd against
;     zero and the result is narrowed with a zmm->ymm vpmovdw.
7define <8 x i16> @testv8i1_sext_v8i16(<8 x i32>* %p) {
8; AVX256-LABEL: testv8i1_sext_v8i16:
9; AVX256:       # %bb.0:
10; AVX256-NEXT:    vmovdqa (%rdi), %ymm0
11; AVX256-NEXT:    vptestnmd %ymm0, %ymm0, %k1
12; AVX256-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
13; AVX256-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
14; AVX256-NEXT:    vpmovdw %ymm0, %xmm0
15; AVX256-NEXT:    vzeroupper
16; AVX256-NEXT:    retq
17;
18; AVX512VL-LABEL: testv8i1_sext_v8i16:
19; AVX512VL:       # %bb.0:
20; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
21; AVX512VL-NEXT:    vptestnmd %ymm0, %ymm0, %k1
22; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
23; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
24; AVX512VL-NEXT:    vpmovdw %ymm0, %xmm0
25; AVX512VL-NEXT:    vzeroupper
26; AVX512VL-NEXT:    retq
27;
28; AVX512F-LABEL: testv8i1_sext_v8i16:
29; AVX512F:       # %bb.0:
30; AVX512F-NEXT:    vpxor %xmm0, %xmm0, %xmm0
31; AVX512F-NEXT:    vpcmpeqd (%rdi), %ymm0, %ymm0
32; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
33; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
34; AVX512F-NEXT:    vzeroupper
35; AVX512F-NEXT:    retq
  ; Load the 8 dword lanes that feed the compare.
36  %in = load <8 x i32>, <8 x i32>* %p
  ; Lane-wise "== 0" test producing the <8 x i1> mask under test.
37  %cmp = icmp eq <8 x i32> %in, zeroinitializer
  ; The extension being exercised: i1 -> i16, sign-extended.
38  %ext = sext <8 x i1> %cmp to <8 x i16>
39  ret <8 x i16> %ext
40}
41
; Sign-extend a <16 x i1> mask, built from two <8 x i1> compares, to <16 x i8>.
; The IR loads <8 x i32> from %p and from %q, compares each against zero,
; concatenates the two masks with an identity shufflevector and sign-extends.
; Expected code per configuration (see the assertion groups below):
;   * avx512vl with prefer-256-bit: each half is materialised separately
;     (k-mask -> zero-masked all-ones ymm -> vpmovdw) and the two word vectors
;     are combined with vpacksswb.
;   * remaining runs: the two k-masks are merged with kunpckbw, a single
;     zero-masked 512-bit vpternlogd $255 produces the lanes, and vpmovdb
;     narrows dwords to bytes.
42define <16 x i8> @testv16i1_sext_v16i8(<8 x i32>* %p, <8 x i32>* %q) {
43; AVX256-LABEL: testv16i1_sext_v16i8:
44; AVX256:       # %bb.0:
45; AVX256-NEXT:    vmovdqa (%rdi), %ymm0
46; AVX256-NEXT:    vptestnmd %ymm0, %ymm0, %k1
47; AVX256-NEXT:    vmovdqa (%rsi), %ymm0
48; AVX256-NEXT:    vptestnmd %ymm0, %ymm0, %k2
49; AVX256-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
50; AVX256-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k2} {z}
51; AVX256-NEXT:    vpmovdw %ymm1, %xmm1
52; AVX256-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
53; AVX256-NEXT:    vpmovdw %ymm0, %xmm0
54; AVX256-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
55; AVX256-NEXT:    vzeroupper
56; AVX256-NEXT:    retq
57;
58; AVX512VL-LABEL: testv16i1_sext_v16i8:
59; AVX512VL:       # %bb.0:
60; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
61; AVX512VL-NEXT:    vptestnmd %ymm0, %ymm0, %k0
62; AVX512VL-NEXT:    vmovdqa (%rsi), %ymm0
63; AVX512VL-NEXT:    vptestnmd %ymm0, %ymm0, %k1
64; AVX512VL-NEXT:    kunpckbw %k0, %k1, %k1
65; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
66; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
67; AVX512VL-NEXT:    vzeroupper
68; AVX512VL-NEXT:    retq
69;
70; AVX512F-LABEL: testv16i1_sext_v16i8:
71; AVX512F:       # %bb.0:
72; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
73; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k0
74; AVX512F-NEXT:    vmovdqa (%rsi), %ymm0
75; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k1
76; AVX512F-NEXT:    kunpckbw %k0, %k1, %k1
77; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
78; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
79; AVX512F-NEXT:    vzeroupper
80; AVX512F-NEXT:    retq
  ; First 8-lane compare (from %p).
81  %in = load <8 x i32>, <8 x i32>* %p
82  %cmp = icmp eq <8 x i32> %in, zeroinitializer
  ; Second 8-lane compare (from %q).
83  %in2 = load <8 x i32>, <8 x i32>* %q
84  %cmp2 = icmp eq <8 x i32> %in2, zeroinitializer
  ; Identity mask 0..15: %cmp supplies lanes 0-7, %cmp2 supplies lanes 8-15.
85  %concat = shufflevector <8 x i1> %cmp, <8 x i1> %cmp2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ; The extension being exercised: i1 -> i8, sign-extended.
86  %ext = sext <16 x i1> %concat to <16 x i8>
87  ret <16 x i8> %ext
88}
89
; Sign-extend a <16 x i1> mask, built from two <8 x i1> compares, to <16 x i16>.
; Same IR shape as the v16i8 sign-extension test, but the result element type
; is i16, so the value is returned in a 256-bit register (no vzeroupper is
; expected before the return in any configuration).
; Expected code per configuration (see the assertion groups below):
;   * avx512vl with prefer-256-bit: each half is produced as a word vector in
;     an xmm register and the halves are joined with vinserti128.
;   * remaining runs: kunpckbw merges the two k-masks, a zero-masked 512-bit
;     vpternlogd $255 produces the lanes, and vpmovdw narrows zmm -> ymm.
90define <16 x i16> @testv16i1_sext_v16i16(<8 x i32>* %p, <8 x i32>* %q) {
91; AVX256-LABEL: testv16i1_sext_v16i16:
92; AVX256:       # %bb.0:
93; AVX256-NEXT:    vmovdqa (%rdi), %ymm0
94; AVX256-NEXT:    vptestnmd %ymm0, %ymm0, %k1
95; AVX256-NEXT:    vmovdqa (%rsi), %ymm0
96; AVX256-NEXT:    vptestnmd %ymm0, %ymm0, %k2
97; AVX256-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
98; AVX256-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k1} {z}
99; AVX256-NEXT:    vpmovdw %ymm1, %xmm1
100; AVX256-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k2} {z}
101; AVX256-NEXT:    vpmovdw %ymm0, %xmm0
102; AVX256-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
103; AVX256-NEXT:    retq
104;
105; AVX512VL-LABEL: testv16i1_sext_v16i16:
106; AVX512VL:       # %bb.0:
107; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
108; AVX512VL-NEXT:    vptestnmd %ymm0, %ymm0, %k0
109; AVX512VL-NEXT:    vmovdqa (%rsi), %ymm0
110; AVX512VL-NEXT:    vptestnmd %ymm0, %ymm0, %k1
111; AVX512VL-NEXT:    kunpckbw %k0, %k1, %k1
112; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
113; AVX512VL-NEXT:    vpmovdw %zmm0, %ymm0
114; AVX512VL-NEXT:    retq
115;
116; AVX512F-LABEL: testv16i1_sext_v16i16:
117; AVX512F:       # %bb.0:
118; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
119; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k0
120; AVX512F-NEXT:    vmovdqa (%rsi), %ymm0
121; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k1
122; AVX512F-NEXT:    kunpckbw %k0, %k1, %k1
123; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
124; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
125; AVX512F-NEXT:    retq
  ; First 8-lane compare (from %p).
126  %in = load <8 x i32>, <8 x i32>* %p
127  %cmp = icmp eq <8 x i32> %in, zeroinitializer
  ; Second 8-lane compare (from %q).
128  %in2 = load <8 x i32>, <8 x i32>* %q
129  %cmp2 = icmp eq <8 x i32> %in2, zeroinitializer
  ; Identity mask 0..15: %cmp supplies lanes 0-7, %cmp2 supplies lanes 8-15.
130  %concat = shufflevector <8 x i1> %cmp, <8 x i1> %cmp2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ; The extension being exercised: i1 -> i16, sign-extended.
131  %ext = sext <16 x i1> %concat to <16 x i16>
132  ret <16 x i16> %ext
133}
134
; Zero-extend an <8 x i1> compare result to <8 x i16>.
; The IR matches the 8-lane sign-extension test except that the final
; extension is a zext, so each output lane is 1 or 0.
; In every configuration the expected assembly is the corresponding
; sign-extension sequence followed by vpsrlw $15, which shifts each all-ones
; word down to the value 1.
135define <8 x i16> @testv8i1_zext_v8i16(<8 x i32>* %p) {
136; AVX256-LABEL: testv8i1_zext_v8i16:
137; AVX256:       # %bb.0:
138; AVX256-NEXT:    vmovdqa (%rdi), %ymm0
139; AVX256-NEXT:    vptestnmd %ymm0, %ymm0, %k1
140; AVX256-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
141; AVX256-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
142; AVX256-NEXT:    vpmovdw %ymm0, %xmm0
143; AVX256-NEXT:    vpsrlw $15, %xmm0, %xmm0
144; AVX256-NEXT:    vzeroupper
145; AVX256-NEXT:    retq
146;
147; AVX512VL-LABEL: testv8i1_zext_v8i16:
148; AVX512VL:       # %bb.0:
149; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
150; AVX512VL-NEXT:    vptestnmd %ymm0, %ymm0, %k1
151; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
152; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
153; AVX512VL-NEXT:    vpmovdw %ymm0, %xmm0
154; AVX512VL-NEXT:    vpsrlw $15, %xmm0, %xmm0
155; AVX512VL-NEXT:    vzeroupper
156; AVX512VL-NEXT:    retq
157;
158; AVX512F-LABEL: testv8i1_zext_v8i16:
159; AVX512F:       # %bb.0:
160; AVX512F-NEXT:    vpxor %xmm0, %xmm0, %xmm0
161; AVX512F-NEXT:    vpcmpeqd (%rdi), %ymm0, %ymm0
162; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
163; AVX512F-NEXT:    vpsrlw $15, %xmm0, %xmm0
164; AVX512F-NEXT:    vzeroupper
165; AVX512F-NEXT:    retq
  ; Load the 8 dword lanes that feed the compare.
166  %in = load <8 x i32>, <8 x i32>* %p
  ; Lane-wise "== 0" test producing the <8 x i1> mask under test.
167  %cmp = icmp eq <8 x i32> %in, zeroinitializer
  ; The extension being exercised: i1 -> i16, zero-extended.
168  %ext = zext <8 x i1> %cmp to <8 x i16>
169  ret <8 x i16> %ext
170}
171
; Zero-extend a <16 x i1> mask, built from two <8 x i1> compares, to <16 x i8>.
; The IR loads <8 x i32> from %p and from %q, compares each against zero,
; concatenates the two masks with an identity shufflevector and zero-extends.
; Expected code per configuration (see the assertion groups below):
;   * avx512vl with prefer-256-bit: each half is produced as a word vector,
;     shifted right by 15 to leave 0/1, and the halves are combined with
;     vpackuswb.
;   * remaining runs: kunpckbw merges the two k-masks, then a zero-masked
;     vpbroadcastd from a constant-pool entry (matched by regex, since the
;     label number is not fixed) fills the selected lanes, and vpmovdb narrows
;     dwords to bytes. The constant's value is not visible in this file.
172define <16 x i8> @testv16i1_zext_v16i8(<8 x i32>* %p, <8 x i32>* %q) {
173; AVX256-LABEL: testv16i1_zext_v16i8:
174; AVX256:       # %bb.0:
175; AVX256-NEXT:    vmovdqa (%rdi), %ymm0
176; AVX256-NEXT:    vptestnmd %ymm0, %ymm0, %k1
177; AVX256-NEXT:    vmovdqa (%rsi), %ymm0
178; AVX256-NEXT:    vptestnmd %ymm0, %ymm0, %k2
179; AVX256-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
180; AVX256-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k2} {z}
181; AVX256-NEXT:    vpmovdw %ymm1, %xmm1
182; AVX256-NEXT:    vpsrlw $15, %xmm1, %xmm1
183; AVX256-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
184; AVX256-NEXT:    vpmovdw %ymm0, %xmm0
185; AVX256-NEXT:    vpsrlw $15, %xmm0, %xmm0
186; AVX256-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
187; AVX256-NEXT:    vzeroupper
188; AVX256-NEXT:    retq
189;
190; AVX512VL-LABEL: testv16i1_zext_v16i8:
191; AVX512VL:       # %bb.0:
192; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
193; AVX512VL-NEXT:    vptestnmd %ymm0, %ymm0, %k0
194; AVX512VL-NEXT:    vmovdqa (%rsi), %ymm0
195; AVX512VL-NEXT:    vptestnmd %ymm0, %ymm0, %k1
196; AVX512VL-NEXT:    kunpckbw %k0, %k1, %k1
197; AVX512VL-NEXT:    vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0 {%k1} {z}
198; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
199; AVX512VL-NEXT:    vzeroupper
200; AVX512VL-NEXT:    retq
201;
202; AVX512F-LABEL: testv16i1_zext_v16i8:
203; AVX512F:       # %bb.0:
204; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
205; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k0
206; AVX512F-NEXT:    vmovdqa (%rsi), %ymm0
207; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k1
208; AVX512F-NEXT:    kunpckbw %k0, %k1, %k1
209; AVX512F-NEXT:    vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0 {%k1} {z}
210; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
211; AVX512F-NEXT:    vzeroupper
212; AVX512F-NEXT:    retq
  ; First 8-lane compare (from %p).
213  %in = load <8 x i32>, <8 x i32>* %p
214  %cmp = icmp eq <8 x i32> %in, zeroinitializer
  ; Second 8-lane compare (from %q).
215  %in2 = load <8 x i32>, <8 x i32>* %q
216  %cmp2 = icmp eq <8 x i32> %in2, zeroinitializer
  ; Identity mask 0..15: %cmp supplies lanes 0-7, %cmp2 supplies lanes 8-15.
217  %concat = shufflevector <8 x i1> %cmp, <8 x i1> %cmp2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ; The extension being exercised: i1 -> i8, zero-extended.
218  %ext = zext <16 x i1> %concat to <16 x i8>
219  ret <16 x i8> %ext
220}
221
; Zero-extend a <16 x i1> mask, built from two <8 x i1> compares, to <16 x i16>.
; Same IR shape as the v16i16 sign-extension test, but ending in a zext.
; In every configuration the expected assembly is the corresponding
; sign-extension sequence followed by a single 256-bit vpsrlw $15 that turns
; each all-ones word into the value 1. The result is returned in a 256-bit
; register, so no vzeroupper is expected before the return.
222define <16 x i16> @testv16i1_zext_v16i16(<8 x i32>* %p, <8 x i32>* %q) {
223; AVX256-LABEL: testv16i1_zext_v16i16:
224; AVX256:       # %bb.0:
225; AVX256-NEXT:    vmovdqa (%rdi), %ymm0
226; AVX256-NEXT:    vptestnmd %ymm0, %ymm0, %k1
227; AVX256-NEXT:    vmovdqa (%rsi), %ymm0
228; AVX256-NEXT:    vptestnmd %ymm0, %ymm0, %k2
229; AVX256-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
230; AVX256-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k1} {z}
231; AVX256-NEXT:    vpmovdw %ymm1, %xmm1
232; AVX256-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k2} {z}
233; AVX256-NEXT:    vpmovdw %ymm0, %xmm0
234; AVX256-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
235; AVX256-NEXT:    vpsrlw $15, %ymm0, %ymm0
236; AVX256-NEXT:    retq
237;
238; AVX512VL-LABEL: testv16i1_zext_v16i16:
239; AVX512VL:       # %bb.0:
240; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
241; AVX512VL-NEXT:    vptestnmd %ymm0, %ymm0, %k0
242; AVX512VL-NEXT:    vmovdqa (%rsi), %ymm0
243; AVX512VL-NEXT:    vptestnmd %ymm0, %ymm0, %k1
244; AVX512VL-NEXT:    kunpckbw %k0, %k1, %k1
245; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
246; AVX512VL-NEXT:    vpmovdw %zmm0, %ymm0
247; AVX512VL-NEXT:    vpsrlw $15, %ymm0, %ymm0
248; AVX512VL-NEXT:    retq
249;
250; AVX512F-LABEL: testv16i1_zext_v16i16:
251; AVX512F:       # %bb.0:
252; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
253; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k0
254; AVX512F-NEXT:    vmovdqa (%rsi), %ymm0
255; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k1
256; AVX512F-NEXT:    kunpckbw %k0, %k1, %k1
257; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
258; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
259; AVX512F-NEXT:    vpsrlw $15, %ymm0, %ymm0
260; AVX512F-NEXT:    retq
  ; First 8-lane compare (from %p).
261  %in = load <8 x i32>, <8 x i32>* %p
262  %cmp = icmp eq <8 x i32> %in, zeroinitializer
  ; Second 8-lane compare (from %q).
263  %in2 = load <8 x i32>, <8 x i32>* %q
264  %cmp2 = icmp eq <8 x i32> %in2, zeroinitializer
  ; Identity mask 0..15: %cmp supplies lanes 0-7, %cmp2 supplies lanes 8-15.
265  %concat = shufflevector <8 x i1> %cmp, <8 x i1> %cmp2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ; The extension being exercised: i1 -> i16, zero-extended.
266  %ext = zext <16 x i1> %concat to <16 x i16>
267  ret <16 x i16> %ext
268}
269