; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefixes=SSE,SSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq | FileCheck %s --check-prefixes=AVX,AVX512VPOPCNTDQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512VPOPCNTDQVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bitalg | FileCheck %s --check-prefix=BITALG_NOVLX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bitalg,+avx512vl | FileCheck %s --check-prefix=BITALG

13define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
14; SSE2-LABEL: testv2i64:
15; SSE2:       # %bb.0:
16; SSE2-NEXT:    movdqa %xmm0, %xmm1
17; SSE2-NEXT:    psrlw $1, %xmm1
18; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
19; SSE2-NEXT:    psubb %xmm1, %xmm0
20; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
21; SSE2-NEXT:    movdqa %xmm0, %xmm2
22; SSE2-NEXT:    pand %xmm1, %xmm2
23; SSE2-NEXT:    psrlw $2, %xmm0
24; SSE2-NEXT:    pand %xmm1, %xmm0
25; SSE2-NEXT:    paddb %xmm2, %xmm0
26; SSE2-NEXT:    movdqa %xmm0, %xmm1
27; SSE2-NEXT:    psrlw $4, %xmm1
28; SSE2-NEXT:    paddb %xmm0, %xmm1
29; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
30; SSE2-NEXT:    pxor %xmm0, %xmm0
31; SSE2-NEXT:    psadbw %xmm0, %xmm1
32; SSE2-NEXT:    movdqa %xmm1, %xmm0
33; SSE2-NEXT:    retq
34;
35; SSE3-LABEL: testv2i64:
36; SSE3:       # %bb.0:
37; SSE3-NEXT:    movdqa %xmm0, %xmm1
38; SSE3-NEXT:    psrlw $1, %xmm1
39; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
40; SSE3-NEXT:    psubb %xmm1, %xmm0
41; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
42; SSE3-NEXT:    movdqa %xmm0, %xmm2
43; SSE3-NEXT:    pand %xmm1, %xmm2
44; SSE3-NEXT:    psrlw $2, %xmm0
45; SSE3-NEXT:    pand %xmm1, %xmm0
46; SSE3-NEXT:    paddb %xmm2, %xmm0
47; SSE3-NEXT:    movdqa %xmm0, %xmm1
48; SSE3-NEXT:    psrlw $4, %xmm1
49; SSE3-NEXT:    paddb %xmm0, %xmm1
50; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
51; SSE3-NEXT:    pxor %xmm0, %xmm0
52; SSE3-NEXT:    psadbw %xmm0, %xmm1
53; SSE3-NEXT:    movdqa %xmm1, %xmm0
54; SSE3-NEXT:    retq
55;
56; SSSE3-LABEL: testv2i64:
57; SSSE3:       # %bb.0:
58; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
59; SSSE3-NEXT:    movdqa %xmm0, %xmm2
60; SSSE3-NEXT:    pand %xmm1, %xmm2
61; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
62; SSSE3-NEXT:    movdqa %xmm3, %xmm4
63; SSSE3-NEXT:    pshufb %xmm2, %xmm4
64; SSSE3-NEXT:    psrlw $4, %xmm0
65; SSSE3-NEXT:    pand %xmm1, %xmm0
66; SSSE3-NEXT:    pshufb %xmm0, %xmm3
67; SSSE3-NEXT:    paddb %xmm4, %xmm3
68; SSSE3-NEXT:    pxor %xmm0, %xmm0
69; SSSE3-NEXT:    psadbw %xmm3, %xmm0
70; SSSE3-NEXT:    retq
71;
72; SSE41-LABEL: testv2i64:
73; SSE41:       # %bb.0:
74; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
75; SSE41-NEXT:    movdqa %xmm0, %xmm2
76; SSE41-NEXT:    pand %xmm1, %xmm2
77; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
78; SSE41-NEXT:    movdqa %xmm3, %xmm4
79; SSE41-NEXT:    pshufb %xmm2, %xmm4
80; SSE41-NEXT:    psrlw $4, %xmm0
81; SSE41-NEXT:    pand %xmm1, %xmm0
82; SSE41-NEXT:    pshufb %xmm0, %xmm3
83; SSE41-NEXT:    paddb %xmm4, %xmm3
84; SSE41-NEXT:    pxor %xmm0, %xmm0
85; SSE41-NEXT:    psadbw %xmm3, %xmm0
86; SSE41-NEXT:    retq
87;
88; AVX1-LABEL: testv2i64:
89; AVX1:       # %bb.0:
90; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
91; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm2
92; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
93; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
94; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
95; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
96; AVX1-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
97; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
98; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
99; AVX1-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
100; AVX1-NEXT:    retq
101;
102; AVX2-LABEL: testv2i64:
103; AVX2:       # %bb.0:
104; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
105; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
106; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
107; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
108; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
109; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
110; AVX2-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
111; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
112; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
113; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
114; AVX2-NEXT:    retq
115;
116; AVX512VPOPCNTDQ-LABEL: testv2i64:
117; AVX512VPOPCNTDQ:       # %bb.0:
118; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
119; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
120; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
121; AVX512VPOPCNTDQ-NEXT:    vzeroupper
122; AVX512VPOPCNTDQ-NEXT:    retq
123;
124; AVX512VPOPCNTDQVL-LABEL: testv2i64:
125; AVX512VPOPCNTDQVL:       # %bb.0:
126; AVX512VPOPCNTDQVL-NEXT:    vpopcntq %xmm0, %xmm0
127; AVX512VPOPCNTDQVL-NEXT:    retq
128;
129; BITALG_NOVLX-LABEL: testv2i64:
130; BITALG_NOVLX:       # %bb.0:
131; BITALG_NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
132; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
133; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
134; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
135; BITALG_NOVLX-NEXT:    vzeroupper
136; BITALG_NOVLX-NEXT:    retq
137;
138; BITALG-LABEL: testv2i64:
139; BITALG:       # %bb.0:
140; BITALG-NEXT:    vpopcntb %xmm0, %xmm0
141; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
142; BITALG-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
143; BITALG-NEXT:    retq
144  %out = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %in)
145  ret <2 x i64> %out
146}
147
148define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
149; SSE2-LABEL: testv4i32:
150; SSE2:       # %bb.0:
151; SSE2-NEXT:    movdqa %xmm0, %xmm1
152; SSE2-NEXT:    psrlw $1, %xmm1
153; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
154; SSE2-NEXT:    psubb %xmm1, %xmm0
155; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
156; SSE2-NEXT:    movdqa %xmm0, %xmm2
157; SSE2-NEXT:    pand %xmm1, %xmm2
158; SSE2-NEXT:    psrlw $2, %xmm0
159; SSE2-NEXT:    pand %xmm1, %xmm0
160; SSE2-NEXT:    paddb %xmm2, %xmm0
161; SSE2-NEXT:    movdqa %xmm0, %xmm1
162; SSE2-NEXT:    psrlw $4, %xmm1
163; SSE2-NEXT:    paddb %xmm0, %xmm1
164; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
165; SSE2-NEXT:    pxor %xmm0, %xmm0
166; SSE2-NEXT:    movdqa %xmm1, %xmm2
167; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
168; SSE2-NEXT:    psadbw %xmm0, %xmm2
169; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
170; SSE2-NEXT:    psadbw %xmm0, %xmm1
171; SSE2-NEXT:    packuswb %xmm2, %xmm1
172; SSE2-NEXT:    movdqa %xmm1, %xmm0
173; SSE2-NEXT:    retq
174;
175; SSE3-LABEL: testv4i32:
176; SSE3:       # %bb.0:
177; SSE3-NEXT:    movdqa %xmm0, %xmm1
178; SSE3-NEXT:    psrlw $1, %xmm1
179; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
180; SSE3-NEXT:    psubb %xmm1, %xmm0
181; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
182; SSE3-NEXT:    movdqa %xmm0, %xmm2
183; SSE3-NEXT:    pand %xmm1, %xmm2
184; SSE3-NEXT:    psrlw $2, %xmm0
185; SSE3-NEXT:    pand %xmm1, %xmm0
186; SSE3-NEXT:    paddb %xmm2, %xmm0
187; SSE3-NEXT:    movdqa %xmm0, %xmm1
188; SSE3-NEXT:    psrlw $4, %xmm1
189; SSE3-NEXT:    paddb %xmm0, %xmm1
190; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
191; SSE3-NEXT:    pxor %xmm0, %xmm0
192; SSE3-NEXT:    movdqa %xmm1, %xmm2
193; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
194; SSE3-NEXT:    psadbw %xmm0, %xmm2
195; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
196; SSE3-NEXT:    psadbw %xmm0, %xmm1
197; SSE3-NEXT:    packuswb %xmm2, %xmm1
198; SSE3-NEXT:    movdqa %xmm1, %xmm0
199; SSE3-NEXT:    retq
200;
201; SSSE3-LABEL: testv4i32:
202; SSSE3:       # %bb.0:
203; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
204; SSSE3-NEXT:    movdqa %xmm0, %xmm3
205; SSSE3-NEXT:    pand %xmm2, %xmm3
206; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
207; SSSE3-NEXT:    movdqa %xmm1, %xmm4
208; SSSE3-NEXT:    pshufb %xmm3, %xmm4
209; SSSE3-NEXT:    psrlw $4, %xmm0
210; SSSE3-NEXT:    pand %xmm2, %xmm0
211; SSSE3-NEXT:    pshufb %xmm0, %xmm1
212; SSSE3-NEXT:    paddb %xmm4, %xmm1
213; SSSE3-NEXT:    pxor %xmm0, %xmm0
214; SSSE3-NEXT:    movdqa %xmm1, %xmm2
215; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
216; SSSE3-NEXT:    psadbw %xmm0, %xmm2
217; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
218; SSSE3-NEXT:    psadbw %xmm0, %xmm1
219; SSSE3-NEXT:    packuswb %xmm2, %xmm1
220; SSSE3-NEXT:    movdqa %xmm1, %xmm0
221; SSSE3-NEXT:    retq
222;
223; SSE41-LABEL: testv4i32:
224; SSE41:       # %bb.0:
225; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
226; SSE41-NEXT:    movdqa %xmm0, %xmm2
227; SSE41-NEXT:    pand %xmm1, %xmm2
228; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
229; SSE41-NEXT:    movdqa %xmm3, %xmm4
230; SSE41-NEXT:    pshufb %xmm2, %xmm4
231; SSE41-NEXT:    psrlw $4, %xmm0
232; SSE41-NEXT:    pand %xmm1, %xmm0
233; SSE41-NEXT:    pshufb %xmm0, %xmm3
234; SSE41-NEXT:    paddb %xmm4, %xmm3
235; SSE41-NEXT:    pxor %xmm1, %xmm1
236; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero
237; SSE41-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3]
238; SSE41-NEXT:    psadbw %xmm1, %xmm3
239; SSE41-NEXT:    psadbw %xmm1, %xmm0
240; SSE41-NEXT:    packuswb %xmm3, %xmm0
241; SSE41-NEXT:    retq
242;
243; AVX1-LABEL: testv4i32:
244; AVX1:       # %bb.0:
245; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
246; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm2
247; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
248; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
249; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
250; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
251; AVX1-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
252; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
253; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
254; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
255; AVX1-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
256; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
257; AVX1-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
258; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
259; AVX1-NEXT:    retq
260;
261; AVX2-LABEL: testv4i32:
262; AVX2:       # %bb.0:
263; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
264; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
265; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
266; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
267; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
268; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
269; AVX2-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
270; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
271; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
272; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
273; AVX2-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
274; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
275; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
276; AVX2-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
277; AVX2-NEXT:    retq
278;
279; AVX512VPOPCNTDQ-LABEL: testv4i32:
280; AVX512VPOPCNTDQ:       # %bb.0:
281; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
282; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
283; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
284; AVX512VPOPCNTDQ-NEXT:    vzeroupper
285; AVX512VPOPCNTDQ-NEXT:    retq
286;
287; AVX512VPOPCNTDQVL-LABEL: testv4i32:
288; AVX512VPOPCNTDQVL:       # %bb.0:
289; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %xmm0, %xmm0
290; AVX512VPOPCNTDQVL-NEXT:    retq
291;
292; BITALG_NOVLX-LABEL: testv4i32:
293; BITALG_NOVLX:       # %bb.0:
294; BITALG_NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
295; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
296; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
297; BITALG_NOVLX-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
298; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
299; BITALG_NOVLX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
300; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
301; BITALG_NOVLX-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
302; BITALG_NOVLX-NEXT:    vzeroupper
303; BITALG_NOVLX-NEXT:    retq
304;
305; BITALG-LABEL: testv4i32:
306; BITALG:       # %bb.0:
307; BITALG-NEXT:    vpopcntb %xmm0, %xmm0
308; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
309; BITALG-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
310; BITALG-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
311; BITALG-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
312; BITALG-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
313; BITALG-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
314; BITALG-NEXT:    retq
315  %out = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %in)
316  ret <4 x i32> %out
317}
318
319define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
320; SSE2-LABEL: testv8i16:
321; SSE2:       # %bb.0:
322; SSE2-NEXT:    movdqa %xmm0, %xmm1
323; SSE2-NEXT:    psrlw $1, %xmm1
324; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
325; SSE2-NEXT:    psubb %xmm1, %xmm0
326; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
327; SSE2-NEXT:    movdqa %xmm0, %xmm2
328; SSE2-NEXT:    pand %xmm1, %xmm2
329; SSE2-NEXT:    psrlw $2, %xmm0
330; SSE2-NEXT:    pand %xmm1, %xmm0
331; SSE2-NEXT:    paddb %xmm2, %xmm0
332; SSE2-NEXT:    movdqa %xmm0, %xmm1
333; SSE2-NEXT:    psrlw $4, %xmm1
334; SSE2-NEXT:    paddb %xmm0, %xmm1
335; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
336; SSE2-NEXT:    movdqa %xmm1, %xmm0
337; SSE2-NEXT:    psllw $8, %xmm0
338; SSE2-NEXT:    paddb %xmm1, %xmm0
339; SSE2-NEXT:    psrlw $8, %xmm0
340; SSE2-NEXT:    retq
341;
342; SSE3-LABEL: testv8i16:
343; SSE3:       # %bb.0:
344; SSE3-NEXT:    movdqa %xmm0, %xmm1
345; SSE3-NEXT:    psrlw $1, %xmm1
346; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
347; SSE3-NEXT:    psubb %xmm1, %xmm0
348; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
349; SSE3-NEXT:    movdqa %xmm0, %xmm2
350; SSE3-NEXT:    pand %xmm1, %xmm2
351; SSE3-NEXT:    psrlw $2, %xmm0
352; SSE3-NEXT:    pand %xmm1, %xmm0
353; SSE3-NEXT:    paddb %xmm2, %xmm0
354; SSE3-NEXT:    movdqa %xmm0, %xmm1
355; SSE3-NEXT:    psrlw $4, %xmm1
356; SSE3-NEXT:    paddb %xmm0, %xmm1
357; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
358; SSE3-NEXT:    movdqa %xmm1, %xmm0
359; SSE3-NEXT:    psllw $8, %xmm0
360; SSE3-NEXT:    paddb %xmm1, %xmm0
361; SSE3-NEXT:    psrlw $8, %xmm0
362; SSE3-NEXT:    retq
363;
364; SSSE3-LABEL: testv8i16:
365; SSSE3:       # %bb.0:
366; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
367; SSSE3-NEXT:    movdqa %xmm0, %xmm2
368; SSSE3-NEXT:    pand %xmm1, %xmm2
369; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
370; SSSE3-NEXT:    movdqa %xmm3, %xmm4
371; SSSE3-NEXT:    pshufb %xmm2, %xmm4
372; SSSE3-NEXT:    psrlw $4, %xmm0
373; SSSE3-NEXT:    pand %xmm1, %xmm0
374; SSSE3-NEXT:    pshufb %xmm0, %xmm3
375; SSSE3-NEXT:    paddb %xmm4, %xmm3
376; SSSE3-NEXT:    movdqa %xmm3, %xmm0
377; SSSE3-NEXT:    psllw $8, %xmm0
378; SSSE3-NEXT:    paddb %xmm3, %xmm0
379; SSSE3-NEXT:    psrlw $8, %xmm0
380; SSSE3-NEXT:    retq
381;
382; SSE41-LABEL: testv8i16:
383; SSE41:       # %bb.0:
384; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
385; SSE41-NEXT:    movdqa %xmm0, %xmm2
386; SSE41-NEXT:    pand %xmm1, %xmm2
387; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
388; SSE41-NEXT:    movdqa %xmm3, %xmm4
389; SSE41-NEXT:    pshufb %xmm2, %xmm4
390; SSE41-NEXT:    psrlw $4, %xmm0
391; SSE41-NEXT:    pand %xmm1, %xmm0
392; SSE41-NEXT:    pshufb %xmm0, %xmm3
393; SSE41-NEXT:    paddb %xmm4, %xmm3
394; SSE41-NEXT:    movdqa %xmm3, %xmm0
395; SSE41-NEXT:    psllw $8, %xmm0
396; SSE41-NEXT:    paddb %xmm3, %xmm0
397; SSE41-NEXT:    psrlw $8, %xmm0
398; SSE41-NEXT:    retq
399;
400; AVX1-LABEL: testv8i16:
401; AVX1:       # %bb.0:
402; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
403; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm2
404; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
405; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
406; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
407; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
408; AVX1-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
409; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
410; AVX1-NEXT:    vpsllw $8, %xmm0, %xmm1
411; AVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
412; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
413; AVX1-NEXT:    retq
414;
415; AVX2-LABEL: testv8i16:
416; AVX2:       # %bb.0:
417; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
418; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
419; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
420; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
421; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
422; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
423; AVX2-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
424; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
425; AVX2-NEXT:    vpsllw $8, %xmm0, %xmm1
426; AVX2-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
427; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm0
428; AVX2-NEXT:    retq
429;
430; AVX512VPOPCNTDQ-LABEL: testv8i16:
431; AVX512VPOPCNTDQ:       # %bb.0:
432; AVX512VPOPCNTDQ-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
433; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
434; AVX512VPOPCNTDQ-NEXT:    vpmovdw %zmm0, %ymm0
435; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
436; AVX512VPOPCNTDQ-NEXT:    vzeroupper
437; AVX512VPOPCNTDQ-NEXT:    retq
438;
439; AVX512VPOPCNTDQVL-LABEL: testv8i16:
440; AVX512VPOPCNTDQVL:       # %bb.0:
441; AVX512VPOPCNTDQVL-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
442; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %ymm0, %ymm0
443; AVX512VPOPCNTDQVL-NEXT:    vpmovdw %ymm0, %xmm0
444; AVX512VPOPCNTDQVL-NEXT:    vzeroupper
445; AVX512VPOPCNTDQVL-NEXT:    retq
446;
447; BITALG_NOVLX-LABEL: testv8i16:
448; BITALG_NOVLX:       # %bb.0:
449; BITALG_NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
450; BITALG_NOVLX-NEXT:    vpopcntw %zmm0, %zmm0
451; BITALG_NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
452; BITALG_NOVLX-NEXT:    vzeroupper
453; BITALG_NOVLX-NEXT:    retq
454;
455; BITALG-LABEL: testv8i16:
456; BITALG:       # %bb.0:
457; BITALG-NEXT:    vpopcntw %xmm0, %xmm0
458; BITALG-NEXT:    retq
459  %out = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %in)
460  ret <8 x i16> %out
461}
462
463define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
464; SSE2-LABEL: testv16i8:
465; SSE2:       # %bb.0:
466; SSE2-NEXT:    movdqa %xmm0, %xmm1
467; SSE2-NEXT:    psrlw $1, %xmm1
468; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
469; SSE2-NEXT:    psubb %xmm1, %xmm0
470; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
471; SSE2-NEXT:    movdqa %xmm0, %xmm2
472; SSE2-NEXT:    pand %xmm1, %xmm2
473; SSE2-NEXT:    psrlw $2, %xmm0
474; SSE2-NEXT:    pand %xmm1, %xmm0
475; SSE2-NEXT:    paddb %xmm2, %xmm0
476; SSE2-NEXT:    movdqa %xmm0, %xmm1
477; SSE2-NEXT:    psrlw $4, %xmm1
478; SSE2-NEXT:    paddb %xmm0, %xmm1
479; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
480; SSE2-NEXT:    movdqa %xmm1, %xmm0
481; SSE2-NEXT:    retq
482;
483; SSE3-LABEL: testv16i8:
484; SSE3:       # %bb.0:
485; SSE3-NEXT:    movdqa %xmm0, %xmm1
486; SSE3-NEXT:    psrlw $1, %xmm1
487; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
488; SSE3-NEXT:    psubb %xmm1, %xmm0
489; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
490; SSE3-NEXT:    movdqa %xmm0, %xmm2
491; SSE3-NEXT:    pand %xmm1, %xmm2
492; SSE3-NEXT:    psrlw $2, %xmm0
493; SSE3-NEXT:    pand %xmm1, %xmm0
494; SSE3-NEXT:    paddb %xmm2, %xmm0
495; SSE3-NEXT:    movdqa %xmm0, %xmm1
496; SSE3-NEXT:    psrlw $4, %xmm1
497; SSE3-NEXT:    paddb %xmm0, %xmm1
498; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
499; SSE3-NEXT:    movdqa %xmm1, %xmm0
500; SSE3-NEXT:    retq
501;
502; SSSE3-LABEL: testv16i8:
503; SSSE3:       # %bb.0:
504; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
505; SSSE3-NEXT:    movdqa %xmm0, %xmm3
506; SSSE3-NEXT:    pand %xmm2, %xmm3
507; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
508; SSSE3-NEXT:    movdqa %xmm1, %xmm4
509; SSSE3-NEXT:    pshufb %xmm3, %xmm4
510; SSSE3-NEXT:    psrlw $4, %xmm0
511; SSSE3-NEXT:    pand %xmm2, %xmm0
512; SSSE3-NEXT:    pshufb %xmm0, %xmm1
513; SSSE3-NEXT:    paddb %xmm4, %xmm1
514; SSSE3-NEXT:    movdqa %xmm1, %xmm0
515; SSSE3-NEXT:    retq
516;
517; SSE41-LABEL: testv16i8:
518; SSE41:       # %bb.0:
519; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
520; SSE41-NEXT:    movdqa %xmm0, %xmm3
521; SSE41-NEXT:    pand %xmm2, %xmm3
522; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
523; SSE41-NEXT:    movdqa %xmm1, %xmm4
524; SSE41-NEXT:    pshufb %xmm3, %xmm4
525; SSE41-NEXT:    psrlw $4, %xmm0
526; SSE41-NEXT:    pand %xmm2, %xmm0
527; SSE41-NEXT:    pshufb %xmm0, %xmm1
528; SSE41-NEXT:    paddb %xmm4, %xmm1
529; SSE41-NEXT:    movdqa %xmm1, %xmm0
530; SSE41-NEXT:    retq
531;
532; AVX1-LABEL: testv16i8:
533; AVX1:       # %bb.0:
534; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
535; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm2
536; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
537; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
538; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
539; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
540; AVX1-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
541; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
542; AVX1-NEXT:    retq
543;
544; AVX2-LABEL: testv16i8:
545; AVX2:       # %bb.0:
546; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
547; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
548; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
549; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
550; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
551; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
552; AVX2-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
553; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
554; AVX2-NEXT:    retq
555;
556; AVX512VPOPCNTDQ-LABEL: testv16i8:
557; AVX512VPOPCNTDQ:       # %bb.0:
558; AVX512VPOPCNTDQ-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
559; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
560; AVX512VPOPCNTDQ-NEXT:    vpmovdb %zmm0, %xmm0
561; AVX512VPOPCNTDQ-NEXT:    vzeroupper
562; AVX512VPOPCNTDQ-NEXT:    retq
563;
564; AVX512VPOPCNTDQVL-LABEL: testv16i8:
565; AVX512VPOPCNTDQVL:       # %bb.0:
566; AVX512VPOPCNTDQVL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
567; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %zmm0, %zmm0
568; AVX512VPOPCNTDQVL-NEXT:    vpmovdb %zmm0, %xmm0
569; AVX512VPOPCNTDQVL-NEXT:    vzeroupper
570; AVX512VPOPCNTDQVL-NEXT:    retq
571;
572; BITALG_NOVLX-LABEL: testv16i8:
573; BITALG_NOVLX:       # %bb.0:
574; BITALG_NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
575; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
576; BITALG_NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
577; BITALG_NOVLX-NEXT:    vzeroupper
578; BITALG_NOVLX-NEXT:    retq
579;
580; BITALG-LABEL: testv16i8:
581; BITALG:       # %bb.0:
582; BITALG-NEXT:    vpopcntb %xmm0, %xmm0
583; BITALG-NEXT:    retq
584  %out = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %in)
585  ret <16 x i8> %out
586}
587
588define <2 x i64> @foldv2i64() nounwind {
589; SSE-LABEL: foldv2i64:
590; SSE:       # %bb.0:
591; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1,64]
592; SSE-NEXT:    retq
593;
594; AVX-LABEL: foldv2i64:
595; AVX:       # %bb.0:
596; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [1,64]
597; AVX-NEXT:    retq
598;
599; BITALG_NOVLX-LABEL: foldv2i64:
600; BITALG_NOVLX:       # %bb.0:
601; BITALG_NOVLX-NEXT:    vmovaps {{.*#+}} xmm0 = [1,64]
602; BITALG_NOVLX-NEXT:    retq
603;
604; BITALG-LABEL: foldv2i64:
605; BITALG:       # %bb.0:
606; BITALG-NEXT:    vmovaps {{.*#+}} xmm0 = [1,64]
607; BITALG-NEXT:    retq
608  %out = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> <i64 256, i64 -1>)
609  ret <2 x i64> %out
610}
611
612define <4 x i32> @foldv4i32() nounwind {
613; SSE-LABEL: foldv4i32:
614; SSE:       # %bb.0:
615; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1,32,0,8]
616; SSE-NEXT:    retq
617;
618; AVX-LABEL: foldv4i32:
619; AVX:       # %bb.0:
620; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [1,32,0,8]
621; AVX-NEXT:    retq
622;
623; BITALG_NOVLX-LABEL: foldv4i32:
624; BITALG_NOVLX:       # %bb.0:
625; BITALG_NOVLX-NEXT:    vmovaps {{.*#+}} xmm0 = [1,32,0,8]
626; BITALG_NOVLX-NEXT:    retq
627;
628; BITALG-LABEL: foldv4i32:
629; BITALG:       # %bb.0:
630; BITALG-NEXT:    vmovaps {{.*#+}} xmm0 = [1,32,0,8]
631; BITALG-NEXT:    retq
632  %out = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> <i32 256, i32 -1, i32 0, i32 255>)
633  ret <4 x i32> %out
634}
635
636define <8 x i16> @foldv8i16() nounwind {
637; SSE-LABEL: foldv8i16:
638; SSE:       # %bb.0:
639; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1,16,0,8,0,3,2,3]
640; SSE-NEXT:    retq
641;
642; AVX-LABEL: foldv8i16:
643; AVX:       # %bb.0:
644; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [1,16,0,8,0,3,2,3]
645; AVX-NEXT:    retq
646;
647; BITALG_NOVLX-LABEL: foldv8i16:
648; BITALG_NOVLX:       # %bb.0:
649; BITALG_NOVLX-NEXT:    vmovaps {{.*#+}} xmm0 = [1,16,0,8,0,3,2,3]
650; BITALG_NOVLX-NEXT:    retq
651;
652; BITALG-LABEL: foldv8i16:
653; BITALG:       # %bb.0:
654; BITALG-NEXT:    vmovaps {{.*#+}} xmm0 = [1,16,0,8,0,3,2,3]
655; BITALG-NEXT:    retq
656  %out = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88>)
657  ret <8 x i16> %out
658}
659
660define <16 x i8> @foldv16i8() nounwind {
661; SSE-LABEL: foldv16i8:
662; SSE:       # %bb.0:
663; SSE-NEXT:    movaps {{.*#+}} xmm0 = [0,8,0,8,0,3,2,3,7,7,1,1,1,1,1,1]
664; SSE-NEXT:    retq
665;
666; AVX-LABEL: foldv16i8:
667; AVX:       # %bb.0:
668; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [0,8,0,8,0,3,2,3,7,7,1,1,1,1,1,1]
669; AVX-NEXT:    retq
670;
671; BITALG_NOVLX-LABEL: foldv16i8:
672; BITALG_NOVLX:       # %bb.0:
673; BITALG_NOVLX-NEXT:    vmovaps {{.*#+}} xmm0 = [0,8,0,8,0,3,2,3,7,7,1,1,1,1,1,1]
674; BITALG_NOVLX-NEXT:    retq
675;
676; BITALG-LABEL: foldv16i8:
677; BITALG:       # %bb.0:
678; BITALG-NEXT:    vmovaps {{.*#+}} xmm0 = [0,8,0,8,0,3,2,3,7,7,1,1,1,1,1,1]
679; BITALG-NEXT:    retq
680  %out = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32>)
681  ret <16 x i8> %out
682}
683
684define <2 x i64> @eq_1_v2i64(<2 x i64> %0) {
685; SSE2-LABEL: eq_1_v2i64:
686; SSE2:       # %bb.0:
687; SSE2-NEXT:    pxor %xmm1, %xmm1
688; SSE2-NEXT:    pcmpeqd %xmm2, %xmm2
689; SSE2-NEXT:    paddq %xmm0, %xmm2
690; SSE2-NEXT:    pand %xmm0, %xmm2
691; SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
692; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,0,3,2]
693; SSE2-NEXT:    pand %xmm3, %xmm0
694; SSE2-NEXT:    pcmpeqd %xmm1, %xmm2
695; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2]
696; SSE2-NEXT:    pand %xmm2, %xmm1
697; SSE2-NEXT:    pandn %xmm1, %xmm0
698; SSE2-NEXT:    retq
699;
700; SSE3-LABEL: eq_1_v2i64:
701; SSE3:       # %bb.0:
702; SSE3-NEXT:    pxor %xmm1, %xmm1
703; SSE3-NEXT:    pcmpeqd %xmm2, %xmm2
704; SSE3-NEXT:    paddq %xmm0, %xmm2
705; SSE3-NEXT:    pand %xmm0, %xmm2
706; SSE3-NEXT:    pcmpeqd %xmm1, %xmm0
707; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,0,3,2]
708; SSE3-NEXT:    pand %xmm3, %xmm0
709; SSE3-NEXT:    pcmpeqd %xmm1, %xmm2
710; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2]
711; SSE3-NEXT:    pand %xmm2, %xmm1
712; SSE3-NEXT:    pandn %xmm1, %xmm0
713; SSE3-NEXT:    retq
714;
715; SSSE3-LABEL: eq_1_v2i64:
716; SSSE3:       # %bb.0:
717; SSSE3-NEXT:    pxor %xmm1, %xmm1
718; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm2
719; SSSE3-NEXT:    paddq %xmm0, %xmm2
720; SSSE3-NEXT:    pand %xmm0, %xmm2
721; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm0
722; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,0,3,2]
723; SSSE3-NEXT:    pand %xmm3, %xmm0
724; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm2
725; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2]
726; SSSE3-NEXT:    pand %xmm2, %xmm1
727; SSSE3-NEXT:    pandn %xmm1, %xmm0
728; SSSE3-NEXT:    retq
729;
730; SSE41-LABEL: eq_1_v2i64:
731; SSE41:       # %bb.0:
732; SSE41-NEXT:    pxor %xmm1, %xmm1
733; SSE41-NEXT:    pcmpeqd %xmm2, %xmm2
734; SSE41-NEXT:    paddq %xmm0, %xmm2
735; SSE41-NEXT:    pand %xmm0, %xmm2
736; SSE41-NEXT:    pcmpeqq %xmm1, %xmm0
737; SSE41-NEXT:    pcmpeqq %xmm1, %xmm2
738; SSE41-NEXT:    pandn %xmm2, %xmm0
739; SSE41-NEXT:    retq
740;
741; AVX1-LABEL: eq_1_v2i64:
742; AVX1:       # %bb.0:
743; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
744; AVX1-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm2
745; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
746; AVX1-NEXT:    vpaddq %xmm3, %xmm0, %xmm3
747; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
748; AVX1-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
749; AVX1-NEXT:    vpandn %xmm0, %xmm2, %xmm0
750; AVX1-NEXT:    retq
751;
752; AVX2-LABEL: eq_1_v2i64:
753; AVX2:       # %bb.0:
754; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
755; AVX2-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm2
756; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
757; AVX2-NEXT:    vpaddq %xmm3, %xmm0, %xmm3
758; AVX2-NEXT:    vpand %xmm3, %xmm0, %xmm0
759; AVX2-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
760; AVX2-NEXT:    vpandn %xmm0, %xmm2, %xmm0
761; AVX2-NEXT:    retq
762;
763; AVX512VPOPCNTDQ-LABEL: eq_1_v2i64:
764; AVX512VPOPCNTDQ:       # %bb.0:
765; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
766; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
767; AVX512VPOPCNTDQ-NEXT:    vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
768; AVX512VPOPCNTDQ-NEXT:    vzeroupper
769; AVX512VPOPCNTDQ-NEXT:    retq
770;
771; AVX512VPOPCNTDQVL-LABEL: eq_1_v2i64:
772; AVX512VPOPCNTDQVL:       # %bb.0:
773; AVX512VPOPCNTDQVL-NEXT:    vpopcntq %xmm0, %xmm0
774; AVX512VPOPCNTDQVL-NEXT:    vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
775; AVX512VPOPCNTDQVL-NEXT:    retq
776;
777; BITALG_NOVLX-LABEL: eq_1_v2i64:
778; BITALG_NOVLX:       # %bb.0:
779; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
780; BITALG_NOVLX-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm2
781; BITALG_NOVLX-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
782; BITALG_NOVLX-NEXT:    vpaddq %xmm3, %xmm0, %xmm3
783; BITALG_NOVLX-NEXT:    vpand %xmm3, %xmm0, %xmm0
784; BITALG_NOVLX-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
785; BITALG_NOVLX-NEXT:    vpandn %xmm0, %xmm2, %xmm0
786; BITALG_NOVLX-NEXT:    retq
787;
788; BITALG-LABEL: eq_1_v2i64:
789; BITALG:       # %bb.0:
790; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
791; BITALG-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm2
792; BITALG-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
793; BITALG-NEXT:    vpaddq %xmm3, %xmm0, %xmm3
794; BITALG-NEXT:    vpand %xmm3, %xmm0, %xmm0
795; BITALG-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
796; BITALG-NEXT:    vpandn %xmm0, %xmm2, %xmm0
797; BITALG-NEXT:    retq
798  %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
799  %3 = icmp eq <2 x i64> %2, <i64 1, i64 1>
800  %4 = sext <2 x i1> %3 to <2 x i64>
801  ret <2 x i64> %4
802}
803
; ne_1_v2i64: per-lane sext(icmp ne (ctpop x), 1) for <2 x i64>, i.e. each
; lane tests "x is NOT a power of two". On targets without vpopcntq the
; lowering uses the identity popcount(x)==1 <=> x != 0 && (x & (x-1)) == 0
; (visible below as pcmpeqd-all-ones / paddq = x-1, then pand), inverted.
; CHECK lines are autogenerated by utils/update_llc_test_checks.py --
; regenerate rather than editing them by hand.
define <2 x i64> @ne_1_v2i64(<2 x i64> %0) {
; SSE2-LABEL: ne_1_v2i64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    pcmpeqd %xmm2, %xmm2
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    paddq %xmm2, %xmm3
; SSE2-NEXT:    pand %xmm0, %xmm3
; SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,0,3,2]
; SSE2-NEXT:    pand %xmm4, %xmm0
; SSE2-NEXT:    pcmpeqd %xmm1, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,0,3,2]
; SSE2-NEXT:    pand %xmm3, %xmm1
; SSE2-NEXT:    pxor %xmm2, %xmm1
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE3-LABEL: ne_1_v2i64:
; SSE3:       # %bb.0:
; SSE3-NEXT:    pxor %xmm1, %xmm1
; SSE3-NEXT:    pcmpeqd %xmm2, %xmm2
; SSE3-NEXT:    movdqa %xmm0, %xmm3
; SSE3-NEXT:    paddq %xmm2, %xmm3
; SSE3-NEXT:    pand %xmm0, %xmm3
; SSE3-NEXT:    pcmpeqd %xmm1, %xmm0
; SSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,0,3,2]
; SSE3-NEXT:    pand %xmm4, %xmm0
; SSE3-NEXT:    pcmpeqd %xmm1, %xmm3
; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,0,3,2]
; SSE3-NEXT:    pand %xmm3, %xmm1
; SSE3-NEXT:    pxor %xmm2, %xmm1
; SSE3-NEXT:    por %xmm1, %xmm0
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: ne_1_v2i64:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pxor %xmm1, %xmm1
; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm2
; SSSE3-NEXT:    movdqa %xmm0, %xmm3
; SSSE3-NEXT:    paddq %xmm2, %xmm3
; SSSE3-NEXT:    pand %xmm0, %xmm3
; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm0
; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,0,3,2]
; SSSE3-NEXT:    pand %xmm4, %xmm0
; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm3
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,0,3,2]
; SSSE3-NEXT:    pand %xmm3, %xmm1
; SSSE3-NEXT:    pxor %xmm2, %xmm1
; SSSE3-NEXT:    por %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: ne_1_v2i64:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pxor %xmm2, %xmm2
; SSE41-NEXT:    pcmpeqd %xmm3, %xmm3
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    paddq %xmm3, %xmm1
; SSE41-NEXT:    pand %xmm0, %xmm1
; SSE41-NEXT:    pcmpeqq %xmm2, %xmm0
; SSE41-NEXT:    pcmpeqq %xmm2, %xmm1
; SSE41-NEXT:    pxor %xmm3, %xmm1
; SSE41-NEXT:    por %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: ne_1_v2i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpaddq %xmm3, %xmm0, %xmm4
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ne_1_v2i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX2-NEXT:    vpaddq %xmm3, %xmm0, %xmm4
; AVX2-NEXT:    vpand %xmm4, %xmm0, %xmm0
; AVX2-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpxor %xmm3, %xmm0, %xmm0
; AVX2-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX2-NEXT:    retq
;
; AVX512VPOPCNTDQ-LABEL: ne_1_v2i64:
; AVX512VPOPCNTDQ:       # %bb.0:
; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT:    vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512VPOPCNTDQ-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VPOPCNTDQ-NEXT:    vzeroupper
; AVX512VPOPCNTDQ-NEXT:    retq
;
; AVX512VPOPCNTDQVL-LABEL: ne_1_v2i64:
; AVX512VPOPCNTDQVL:       # %bb.0:
; AVX512VPOPCNTDQVL-NEXT:    vpopcntq %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT:    vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT:    retq
;
; BITALG_NOVLX-LABEL: ne_1_v2i64:
; BITALG_NOVLX:       # %bb.0:
; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm2
; BITALG_NOVLX-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
; BITALG_NOVLX-NEXT:    vpaddq %xmm3, %xmm0, %xmm3
; BITALG_NOVLX-NEXT:    vpand %xmm3, %xmm0, %xmm0
; BITALG_NOVLX-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
; BITALG_NOVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
; BITALG_NOVLX-NEXT:    vpor %xmm0, %xmm2, %xmm0
; BITALG_NOVLX-NEXT:    vzeroupper
; BITALG_NOVLX-NEXT:    retq
;
; BITALG-LABEL: ne_1_v2i64:
; BITALG:       # %bb.0:
; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm2
; BITALG-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
; BITALG-NEXT:    vpaddq %xmm3, %xmm0, %xmm4
; BITALG-NEXT:    vpand %xmm4, %xmm0, %xmm0
; BITALG-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
; BITALG-NEXT:    vpternlogq $222, %xmm3, %xmm2, %xmm0
; BITALG-NEXT:    retq
  %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
  %3 = icmp ne <2 x i64> %2, <i64 1, i64 1>
  %4 = sext <2 x i1> %3 to <2 x i64>
  ret <2 x i64> %4
}
939
; eq_1_v4i32: per-lane sext(icmp eq (ctpop x), 1) for <4 x i32> -- each lane
; tests "x is a power of two". Non-popcount targets lower this as
; x != 0 && (x & (x-1)) == 0 (paddd with all-ones = x-1, then pand);
; VPOPCNTDQ targets compare a real vpopcntd result against splat(1).
; CHECK lines are autogenerated by utils/update_llc_test_checks.py --
; regenerate rather than editing them by hand.
define <4 x i32> @eq_1_v4i32(<4 x i32> %0) {
; SSE-LABEL: eq_1_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    pcmpeqd %xmm2, %xmm2
; SSE-NEXT:    paddd %xmm0, %xmm2
; SSE-NEXT:    pand %xmm0, %xmm2
; SSE-NEXT:    pcmpeqd %xmm1, %xmm0
; SSE-NEXT:    pcmpeqd %xmm1, %xmm2
; SSE-NEXT:    pandn %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: eq_1_v4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpaddd %xmm3, %xmm0, %xmm3
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpandn %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: eq_1_v4i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX2-NEXT:    vpaddd %xmm3, %xmm0, %xmm3
; AVX2-NEXT:    vpand %xmm3, %xmm0, %xmm0
; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpandn %xmm0, %xmm2, %xmm0
; AVX2-NEXT:    retq
;
; AVX512VPOPCNTDQ-LABEL: eq_1_v4i32:
; AVX512VPOPCNTDQ:       # %bb.0:
; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX512VPOPCNTDQ-NEXT:    vzeroupper
; AVX512VPOPCNTDQ-NEXT:    retq
;
; AVX512VPOPCNTDQVL-LABEL: eq_1_v4i32:
; AVX512VPOPCNTDQVL:       # %bb.0:
; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
; AVX512VPOPCNTDQVL-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT:    retq
;
; BITALG_NOVLX-LABEL: eq_1_v4i32:
; BITALG_NOVLX:       # %bb.0:
; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm2
; BITALG_NOVLX-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
; BITALG_NOVLX-NEXT:    vpaddd %xmm3, %xmm0, %xmm3
; BITALG_NOVLX-NEXT:    vpand %xmm3, %xmm0, %xmm0
; BITALG_NOVLX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
; BITALG_NOVLX-NEXT:    vpandn %xmm0, %xmm2, %xmm0
; BITALG_NOVLX-NEXT:    retq
;
; BITALG-LABEL: eq_1_v4i32:
; BITALG:       # %bb.0:
; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm2
; BITALG-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
; BITALG-NEXT:    vpaddd %xmm3, %xmm0, %xmm3
; BITALG-NEXT:    vpand %xmm3, %xmm0, %xmm0
; BITALG-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
; BITALG-NEXT:    vpandn %xmm0, %xmm2, %xmm0
; BITALG-NEXT:    retq
  %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0)
  %3 = icmp eq <4 x i32> %2, <i32 1, i32 1, i32 1, i32 1>
  %4 = sext <4 x i1> %3 to <4 x i32>
  ret <4 x i32> %4
}
1016
; ne_1_v4i32: per-lane sext(icmp ne (ctpop x), 1) for <4 x i32> -- the
; inverted power-of-two test. Non-popcount targets compute the eq_1 mask via
; x & (x-1) and invert (pxor with all-ones / vpternlog); VPOPCNTDQ targets
; compare vpopcntd against splat(1) and invert with vpternlogq $15 (NOT).
; CHECK lines are autogenerated by utils/update_llc_test_checks.py --
; regenerate rather than editing them by hand.
define <4 x i32> @ne_1_v4i32(<4 x i32> %0) {
; SSE-LABEL: ne_1_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm2, %xmm2
; SSE-NEXT:    pcmpeqd %xmm3, %xmm3
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    paddd %xmm3, %xmm1
; SSE-NEXT:    pand %xmm0, %xmm1
; SSE-NEXT:    pcmpeqd %xmm2, %xmm0
; SSE-NEXT:    pcmpeqd %xmm2, %xmm1
; SSE-NEXT:    pxor %xmm3, %xmm1
; SSE-NEXT:    por %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: ne_1_v4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpaddd %xmm3, %xmm0, %xmm4
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ne_1_v4i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX2-NEXT:    vpaddd %xmm3, %xmm0, %xmm4
; AVX2-NEXT:    vpand %xmm4, %xmm0, %xmm0
; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpxor %xmm3, %xmm0, %xmm0
; AVX2-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX2-NEXT:    retq
;
; AVX512VPOPCNTDQ-LABEL: ne_1_v4i32:
; AVX512VPOPCNTDQ:       # %bb.0:
; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX512VPOPCNTDQ-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VPOPCNTDQ-NEXT:    vzeroupper
; AVX512VPOPCNTDQ-NEXT:    retq
;
; AVX512VPOPCNTDQVL-LABEL: ne_1_v4i32:
; AVX512VPOPCNTDQVL:       # %bb.0:
; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
; AVX512VPOPCNTDQVL-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT:    retq
;
; BITALG_NOVLX-LABEL: ne_1_v4i32:
; BITALG_NOVLX:       # %bb.0:
; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm2
; BITALG_NOVLX-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
; BITALG_NOVLX-NEXT:    vpaddd %xmm3, %xmm0, %xmm3
; BITALG_NOVLX-NEXT:    vpand %xmm3, %xmm0, %xmm0
; BITALG_NOVLX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
; BITALG_NOVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
; BITALG_NOVLX-NEXT:    vpor %xmm0, %xmm2, %xmm0
; BITALG_NOVLX-NEXT:    vzeroupper
; BITALG_NOVLX-NEXT:    retq
;
; BITALG-LABEL: ne_1_v4i32:
; BITALG:       # %bb.0:
; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm2
; BITALG-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
; BITALG-NEXT:    vpaddd %xmm3, %xmm0, %xmm4
; BITALG-NEXT:    vpand %xmm4, %xmm0, %xmm0
; BITALG-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
; BITALG-NEXT:    vpternlogd $222, %xmm3, %xmm2, %xmm0
; BITALG-NEXT:    retq
  %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0)
  %3 = icmp ne <4 x i32> %2, <i32 1, i32 1, i32 1, i32 1>
  %4 = sext <4 x i1> %3 to <4 x i32>
  ret <4 x i32> %4
}
1103
; eq_1_v8i16: per-lane sext(icmp eq (ctpop x), 1) for <8 x i16>. SSE/AVX
; targets use the x != 0 && (x & (x-1)) == 0 power-of-two lowering (paddw
; with all-ones = x-1); BITALG targets have a native vpopcntw and compare
; against a constant-pool splat of 1.
; CHECK lines are autogenerated by utils/update_llc_test_checks.py --
; regenerate rather than editing them by hand.
define <8 x i16> @eq_1_v8i16(<8 x i16> %0) {
; SSE-LABEL: eq_1_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    pcmpeqd %xmm2, %xmm2
; SSE-NEXT:    paddw %xmm0, %xmm2
; SSE-NEXT:    pand %xmm0, %xmm2
; SSE-NEXT:    pcmpeqw %xmm1, %xmm0
; SSE-NEXT:    pcmpeqw %xmm1, %xmm2
; SSE-NEXT:    pandn %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: eq_1_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX-NEXT:    vpaddw %xmm3, %xmm0, %xmm3
; AVX-NEXT:    vpand %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpandn %xmm0, %xmm2, %xmm0
; AVX-NEXT:    retq
;
; BITALG_NOVLX-LABEL: eq_1_v8i16:
; BITALG_NOVLX:       # %bb.0:
; BITALG_NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; BITALG_NOVLX-NEXT:    vpopcntw %zmm0, %zmm0
; BITALG_NOVLX-NEXT:    vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; BITALG_NOVLX-NEXT:    vzeroupper
; BITALG_NOVLX-NEXT:    retq
;
; BITALG-LABEL: eq_1_v8i16:
; BITALG:       # %bb.0:
; BITALG-NEXT:    vpopcntw %xmm0, %xmm0
; BITALG-NEXT:    vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; BITALG-NEXT:    retq
  %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0)
  %3 = icmp eq <8 x i16> %2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %4 = sext <8 x i1> %3 to <8 x i16>
  ret <8 x i16> %4
}
1145
; ne_1_v8i16: per-lane sext(icmp ne (ctpop x), 1) for <8 x i16>. Non-BITALG
; targets build the eq_1 mask via x & (x-1) (paddw with all-ones = x-1) and
; invert it (pxor all-ones / vpternlog); BITALG targets use native vpopcntw,
; compare against splat(1), and invert with vpternlogq $15 (NOT).
; CHECK lines are autogenerated by utils/update_llc_test_checks.py --
; regenerate rather than editing them by hand.
define <8 x i16> @ne_1_v8i16(<8 x i16> %0) {
; SSE-LABEL: ne_1_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm2, %xmm2
; SSE-NEXT:    pcmpeqd %xmm3, %xmm3
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    paddw %xmm3, %xmm1
; SSE-NEXT:    pand %xmm0, %xmm1
; SSE-NEXT:    pcmpeqw %xmm2, %xmm0
; SSE-NEXT:    pcmpeqw %xmm2, %xmm1
; SSE-NEXT:    pxor %xmm3, %xmm1
; SSE-NEXT:    por %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: ne_1_v8i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpaddw %xmm3, %xmm0, %xmm4
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ne_1_v8i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX2-NEXT:    vpaddw %xmm3, %xmm0, %xmm4
; AVX2-NEXT:    vpand %xmm4, %xmm0, %xmm0
; AVX2-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpxor %xmm3, %xmm0, %xmm0
; AVX2-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX2-NEXT:    retq
;
; AVX512VPOPCNTDQ-LABEL: ne_1_v8i16:
; AVX512VPOPCNTDQ:       # %bb.0:
; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512VPOPCNTDQ-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm2
; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX512VPOPCNTDQ-NEXT:    vpaddw %xmm3, %xmm0, %xmm3
; AVX512VPOPCNTDQ-NEXT:    vpand %xmm3, %xmm0, %xmm0
; AVX512VPOPCNTDQ-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
; AVX512VPOPCNTDQ-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX512VPOPCNTDQ-NEXT:    vzeroupper
; AVX512VPOPCNTDQ-NEXT:    retq
;
; AVX512VPOPCNTDQVL-LABEL: ne_1_v8i16:
; AVX512VPOPCNTDQVL:       # %bb.0:
; AVX512VPOPCNTDQVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512VPOPCNTDQVL-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm2
; AVX512VPOPCNTDQVL-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX512VPOPCNTDQVL-NEXT:    vpaddw %xmm3, %xmm0, %xmm4
; AVX512VPOPCNTDQVL-NEXT:    vpand %xmm4, %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT:    vpternlogq $222, %xmm3, %xmm2, %xmm0
; AVX512VPOPCNTDQVL-NEXT:    retq
;
; BITALG_NOVLX-LABEL: ne_1_v8i16:
; BITALG_NOVLX:       # %bb.0:
; BITALG_NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; BITALG_NOVLX-NEXT:    vpopcntw %zmm0, %zmm0
; BITALG_NOVLX-NEXT:    vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; BITALG_NOVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
; BITALG_NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; BITALG_NOVLX-NEXT:    vzeroupper
; BITALG_NOVLX-NEXT:    retq
;
; BITALG-LABEL: ne_1_v8i16:
; BITALG:       # %bb.0:
; BITALG-NEXT:    vpopcntw %xmm0, %xmm0
; BITALG-NEXT:    vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; BITALG-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
; BITALG-NEXT:    retq
  %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0)
  %3 = icmp ne <8 x i16> %2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %4 = sext <8 x i1> %3 to <8 x i16>
  ret <8 x i16> %4
}
1230
; eq_1_v16i8: per-lane sext(icmp eq (ctpop x), 1) for <16 x i8>. SSE/AVX
; targets use the x != 0 && (x & (x-1)) == 0 power-of-two lowering (paddb
; with all-ones = x-1); BITALG targets have a native vpopcntb and compare
; against a constant-pool splat of 1.
; CHECK lines are autogenerated by utils/update_llc_test_checks.py --
; regenerate rather than editing them by hand.
define <16 x i8> @eq_1_v16i8(<16 x i8> %0) {
; SSE-LABEL: eq_1_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    pcmpeqd %xmm2, %xmm2
; SSE-NEXT:    paddb %xmm0, %xmm2
; SSE-NEXT:    pand %xmm0, %xmm2
; SSE-NEXT:    pcmpeqb %xmm1, %xmm0
; SSE-NEXT:    pcmpeqb %xmm1, %xmm2
; SSE-NEXT:    pandn %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: eq_1_v16i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX-NEXT:    vpaddb %xmm3, %xmm0, %xmm3
; AVX-NEXT:    vpand %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpandn %xmm0, %xmm2, %xmm0
; AVX-NEXT:    retq
;
; BITALG_NOVLX-LABEL: eq_1_v16i8:
; BITALG_NOVLX:       # %bb.0:
; BITALG_NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT:    vpcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; BITALG_NOVLX-NEXT:    vzeroupper
; BITALG_NOVLX-NEXT:    retq
;
; BITALG-LABEL: eq_1_v16i8:
; BITALG:       # %bb.0:
; BITALG-NEXT:    vpopcntb %xmm0, %xmm0
; BITALG-NEXT:    vpcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; BITALG-NEXT:    retq
  %2 = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %0)
  %3 = icmp eq <16 x i8> %2, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %4 = sext <16 x i1> %3 to <16 x i8>
  ret <16 x i8> %4
}
1272
; ne_1_v16i8: per-lane sext(icmp ne (ctpop x), 1) for <16 x i8>. Non-BITALG
; targets build the eq_1 mask via x & (x-1) (paddb with all-ones = x-1) and
; invert it (pxor all-ones / vpternlog); BITALG targets use native vpopcntb,
; compare against splat(1), and invert with vpternlogq $15 (NOT).
; CHECK lines are autogenerated by utils/update_llc_test_checks.py --
; regenerate rather than editing them by hand.
define <16 x i8> @ne_1_v16i8(<16 x i8> %0) {
; SSE-LABEL: ne_1_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm2, %xmm2
; SSE-NEXT:    pcmpeqd %xmm3, %xmm3
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    paddb %xmm3, %xmm1
; SSE-NEXT:    pand %xmm0, %xmm1
; SSE-NEXT:    pcmpeqb %xmm2, %xmm0
; SSE-NEXT:    pcmpeqb %xmm2, %xmm1
; SSE-NEXT:    pxor %xmm3, %xmm1
; SSE-NEXT:    por %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: ne_1_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm4
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ne_1_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX2-NEXT:    vpaddb %xmm3, %xmm0, %xmm4
; AVX2-NEXT:    vpand %xmm4, %xmm0, %xmm0
; AVX2-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpxor %xmm3, %xmm0, %xmm0
; AVX2-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX2-NEXT:    retq
;
; AVX512VPOPCNTDQ-LABEL: ne_1_v16i8:
; AVX512VPOPCNTDQ:       # %bb.0:
; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512VPOPCNTDQ-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm2
; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX512VPOPCNTDQ-NEXT:    vpaddb %xmm3, %xmm0, %xmm3
; AVX512VPOPCNTDQ-NEXT:    vpand %xmm3, %xmm0, %xmm0
; AVX512VPOPCNTDQ-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
; AVX512VPOPCNTDQ-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX512VPOPCNTDQ-NEXT:    vzeroupper
; AVX512VPOPCNTDQ-NEXT:    retq
;
; AVX512VPOPCNTDQVL-LABEL: ne_1_v16i8:
; AVX512VPOPCNTDQVL:       # %bb.0:
; AVX512VPOPCNTDQVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512VPOPCNTDQVL-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm2
; AVX512VPOPCNTDQVL-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX512VPOPCNTDQVL-NEXT:    vpaddb %xmm3, %xmm0, %xmm4
; AVX512VPOPCNTDQVL-NEXT:    vpand %xmm4, %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT:    vpternlogq $222, %xmm3, %xmm2, %xmm0
; AVX512VPOPCNTDQVL-NEXT:    retq
;
; BITALG_NOVLX-LABEL: ne_1_v16i8:
; BITALG_NOVLX:       # %bb.0:
; BITALG_NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT:    vpcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; BITALG_NOVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
; BITALG_NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; BITALG_NOVLX-NEXT:    vzeroupper
; BITALG_NOVLX-NEXT:    retq
;
; BITALG-LABEL: ne_1_v16i8:
; BITALG:       # %bb.0:
; BITALG-NEXT:    vpopcntb %xmm0, %xmm0
; BITALG-NEXT:    vpcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; BITALG-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
; BITALG-NEXT:    retq
  %2 = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %0)
  %3 = icmp ne <16 x i8> %2, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %4 = sext <16 x i1> %3 to <16 x i8>
  ret <16 x i8> %4
}
1357
1358declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>)
1359declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>)
1360declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16>)
1361declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8>)
1362