; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX512
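
; These tests cover folding of sext (icmp pred (load X), C) patterns: the
; compare of a narrow loaded vector should be widened to the sext result type
; when extending the operands is free (the extend can be absorbed by an
; extending load). The "negative test" comments below mark cases where that
; widening is not expected to happen.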

declare void @use_v8i1(<8 x i1>)
declare void @use_v8i8(<8 x i8>)

define <8 x i16> @cmp_ne_load_const(<8 x i8>* %x) nounwind {
; SSE-LABEL: cmp_ne_load_const:
; SSE:       # %bb.0:
; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    pcmpeqb %xmm0, %xmm1
; SSE-NEXT:    pcmpeqd %xmm0, %xmm0
; SSE-NEXT:    pxor %xmm1, %xmm0
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    psraw $8, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: cmp_ne_load_const:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpcmpgtw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %loadx = load <8 x i8>, <8 x i8>* %x
  %icmp = icmp ne <8 x i8> %loadx, zeroinitializer
  %sext = sext <8 x i1> %icmp to <8 x i16>
  ret <8 x i16> %sext
}

; negative test - simple loads only

define <8 x i16> @cmp_ne_load_const_volatile(<8 x i8>* %x) nounwind {
; SSE-LABEL: cmp_ne_load_const_volatile:
; SSE:       # %bb.0:
; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    pcmpeqb %xmm0, %xmm1
; SSE-NEXT:    pcmpeqd %xmm0, %xmm0
; SSE-NEXT:    pxor %xmm1, %xmm0
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    psraw $8, %xmm0
; SSE-NEXT:    retq
;
; AVX2-LABEL: cmp_ne_load_const_volatile:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpmovsxbw %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: cmp_ne_load_const_volatile:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
; AVX512-NEXT:    vpmovsxbw %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %loadx = load volatile <8 x i8>, <8 x i8>* %x
  %icmp = icmp ne <8 x i8> %loadx, zeroinitializer
  %sext = sext <8 x i1> %icmp to <8 x i16>
  ret <8 x i16> %sext
}

; negative test - don't create extra load

define <8 x i16> @cmp_ne_load_const_extra_use1(<8 x i8>* %x) nounwind {
; SSE-LABEL: cmp_ne_load_const_extra_use1:
; SSE:       # %bb.0:
; SSE-NEXT:    subq $24, %rsp
; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    movdqa %xmm0, (%rsp) # 16-byte Spill
; SSE-NEXT:    callq use_v8i8@PLT
; SSE-NEXT:    pxor %xmm0, %xmm0
; SSE-NEXT:    pcmpeqb (%rsp), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE-NEXT:    pxor %xmm0, %xmm1
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT:    psraw $8, %xmm0
; SSE-NEXT:    addq $24, %rsp
; SSE-NEXT:    retq
;
; AVX2-LABEL: cmp_ne_load_const_extra_use1:
; AVX2:       # %bb.0:
; AVX2-NEXT:    subq $24, %rsp
; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    callq use_v8i8@PLT
; AVX2-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    vpcmpeqb (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpmovsxbw %xmm0, %xmm0
; AVX2-NEXT:    addq $24, %rsp
; AVX2-NEXT:    retq
;
; AVX512-LABEL: cmp_ne_load_const_extra_use1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    subq $24, %rsp
; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT:    callq use_v8i8@PLT
; AVX512-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vpcmpeqb (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX512-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
; AVX512-NEXT:    vpmovsxbw %xmm0, %xmm0
; AVX512-NEXT:    addq $24, %rsp
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %loadx = load <8 x i8>, <8 x i8>* %x
  call void @use_v8i8(<8 x i8> %loadx)
  %icmp = icmp ne <8 x i8> %loadx, zeroinitializer
  %sext = sext <8 x i1> %icmp to <8 x i16>
  ret <8 x i16> %sext
}

; negative test - don't create extra compare

define <8 x i16> @cmp_ne_load_const_extra_use2(<8 x i8>* %x) nounwind {
; SSE-LABEL: cmp_ne_load_const_extra_use2:
; SSE:       # %bb.0:
; SSE-NEXT:    subq $24, %rsp
; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    pcmpeqb %xmm0, %xmm1
; SSE-NEXT:    pcmpeqd %xmm0, %xmm0
; SSE-NEXT:    pxor %xmm1, %xmm0
; SSE-NEXT:    movdqa %xmm0, (%rsp) # 16-byte Spill
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    callq use_v8i1@PLT
; SSE-NEXT:    punpcklbw (%rsp), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; SSE-NEXT:    psraw $8, %xmm0
; SSE-NEXT:    addq $24, %rsp
; SSE-NEXT:    retq
;
; AVX2-LABEL: cmp_ne_load_const_extra_use2:
; AVX2:       # %bb.0:
; AVX2-NEXT:    subq $24, %rsp
; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT:    callq use_v8i1@PLT
; AVX2-NEXT:    vpmovsxbw (%rsp), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    addq $24, %rsp
; AVX2-NEXT:    retq
;
; AVX512-LABEL: cmp_ne_load_const_extra_use2:
; AVX512:       # %bb.0:
; AVX512-NEXT:    subq $72, %rsp
; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
; AVX512-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
; AVX512-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    callq use_v8i1@PLT
; AVX512-NEXT:    vpmovsxbw (%rsp), %xmm0 # 16-byte Folded Reload
; AVX512-NEXT:    addq $72, %rsp
; AVX512-NEXT:    retq
  %loadx = load <8 x i8>, <8 x i8>* %x
  %icmp = icmp ne <8 x i8> %loadx, zeroinitializer
  call void @use_v8i1(<8 x i1> %icmp)
  %sext = sext <8 x i1> %icmp to <8 x i16>
  ret <8 x i16> %sext
}

; negative test - the extend is not free (no load to fold it into)

define <8 x i16> @cmp_ne_no_load_const(i64 %x) nounwind {
; SSE-LABEL: cmp_ne_no_load_const:
; SSE:       # %bb.0:
; SSE-NEXT:    movq %rdi, %xmm0
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    pcmpeqb %xmm0, %xmm1
; SSE-NEXT:    pcmpeqd %xmm0, %xmm0
; SSE-NEXT:    pxor %xmm1, %xmm0
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    psraw $8, %xmm0
; SSE-NEXT:    retq
;
; AVX2-LABEL: cmp_ne_no_load_const:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovq %rdi, %xmm0
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpmovsxbw %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: cmp_ne_no_load_const:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovq %rdi, %xmm0
; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
; AVX512-NEXT:    vpmovsxbw %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %t = bitcast i64 %x to <8 x i8>
  %icmp = icmp ne <8 x i8> %t, zeroinitializer
  %sext = sext <8 x i1> %icmp to <8 x i16>
  ret <8 x i16> %sext
}

define <4 x i32> @cmp_ult_load_const(<4 x i8>* %x) nounwind {
; SSE-LABEL: cmp_ult_load_const:
; SSE:       # %bb.0:
; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = <42,214,0,255,u,u,u,u,u,u,u,u,u,u,u,u>
; SSE-NEXT:    pmaxub %xmm0, %xmm1
; SSE-NEXT:    pcmpeqb %xmm0, %xmm1
; SSE-NEXT:    pcmpeqd %xmm0, %xmm0
; SSE-NEXT:    pxor %xmm1, %xmm0
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT:    psrad $24, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: cmp_ult_load_const:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [42,214,0,255]
; AVX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %loadx = load <4 x i8>, <4 x i8>* %x
  %icmp = icmp ult <4 x i8> %loadx, <i8 42, i8 -42, i8 0, i8 -1>
  %sext = sext <4 x i1> %icmp to <4 x i32>
  ret <4 x i32> %sext
}

; negative test - type must be legal

define <3 x i32> @cmp_ult_load_const_bad_type(<3 x i8>* %x) nounwind {
; SSE-LABEL: cmp_ult_load_const_bad_type:
; SSE:       # %bb.0:
; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = <42,214,0,u,u,u,u,u,u,u,u,u,u,u,u,u>
; SSE-NEXT:    pmaxub %xmm0, %xmm1
; SSE-NEXT:    pcmpeqb %xmm0, %xmm1
; SSE-NEXT:    pcmpeqd %xmm0, %xmm0
; SSE-NEXT:    pxor %xmm1, %xmm0
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT:    psrad $24, %xmm0
; SSE-NEXT:    retq
;
; AVX2-LABEL: cmp_ult_load_const_bad_type:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX2-NEXT:    vpmaxub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX2-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: cmp_ult_load_const_bad_type:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX512-NEXT:    vpmaxub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX512-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
; AVX512-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %loadx = load <3 x i8>, <3 x i8>* %x
  %icmp = icmp ult <3 x i8> %loadx, <i8 42, i8 -42, i8 0>
  %sext = sext <3 x i1> %icmp to <3 x i32>
  ret <3 x i32> %sext
}

; Signed compare needs signed extend.

define <4 x i32> @cmp_slt_load_const(<4 x i8>* %x) nounwind {
; SSE-LABEL: cmp_slt_load_const:
; SSE:       # %bb.0:
; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = <42,214,0,255,u,u,u,u,u,u,u,u,u,u,u,u>
; SSE-NEXT:    pcmpgtb %xmm0, %xmm1
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT:    psrad $24, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: cmp_slt_load_const:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovsxbd (%rdi), %xmm0
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [42,4294967254,0,4294967295]
; AVX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %loadx = load <4 x i8>, <4 x i8>* %x
  %icmp = icmp slt <4 x i8> %loadx, <i8 42, i8 -42, i8 0, i8 -1>
  %sext = sext <4 x i1> %icmp to <4 x i32>
  ret <4 x i32> %sext
}

define <2 x i64> @cmp_ne_zextload(<2 x i32>* %x, <2 x i32>* %y) nounwind {
; SSE-LABEL: cmp_ne_zextload:
; SSE:       # %bb.0:
; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; SSE-NEXT:    pcmpeqd %xmm0, %xmm1
; SSE-NEXT:    pcmpeqd %xmm0, %xmm0
; SSE-NEXT:    pxor %xmm1, %xmm0
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    pcmpgtd %xmm0, %xmm1
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    retq
;
; AVX2-LABEL: cmp_ne_zextload:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
; AVX2-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: cmp_ne_zextload:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
; AVX512-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %loadx = load <2 x i32>, <2 x i32>* %x
  %loady = load <2 x i32>, <2 x i32>* %y
  %icmp = icmp ne <2 x i32> %loadx, %loady
  %sext = sext <2 x i1> %icmp to <2 x i64>
  ret <2 x i64> %sext
}

define <8 x i16> @cmp_ugt_zextload(<8 x i8>* %x, <8 x i8>* %y) nounwind {
; SSE-LABEL: cmp_ugt_zextload:
; SSE:       # %bb.0:
; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; SSE-NEXT:    pminub %xmm0, %xmm1
; SSE-NEXT:    pcmpeqb %xmm0, %xmm1
; SSE-NEXT:    pcmpeqd %xmm0, %xmm0
; SSE-NEXT:    pxor %xmm1, %xmm0
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    psraw $8, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: cmp_ugt_zextload:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX-NEXT:    vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX-NEXT:    vpcmpgtw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %loadx = load <8 x i8>, <8 x i8>* %x
  %loady = load <8 x i8>, <8 x i8>* %y
  %icmp = icmp ugt <8 x i8> %loadx, %loady
  %sext = sext <8 x i1> %icmp to <8 x i16>
  ret <8 x i16> %sext
}

; Signed compare needs signed extends.

define <8 x i16> @cmp_sgt_zextload(<8 x i8>* %x, <8 x i8>* %y) nounwind {
; SSE-LABEL: cmp_sgt_zextload:
; SSE:       # %bb.0:
; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; SSE-NEXT:    pcmpgtb %xmm1, %xmm0
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    psraw $8, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: cmp_sgt_zextload:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovsxbw (%rdi), %xmm0
; AVX-NEXT:    vpmovsxbw (%rsi), %xmm1
; AVX-NEXT:    vpcmpgtw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %loadx = load <8 x i8>, <8 x i8>* %x
  %loady = load <8 x i8>, <8 x i8>* %y
  %icmp = icmp sgt <8 x i8> %loadx, %loady
  %sext = sext <8 x i1> %icmp to <8 x i16>
  ret <8 x i16> %sext
}

; negative test - don't change a legal op
; TODO: Or should we? We can eliminate the vpmovsxwd at the cost of a 256-bit ymm vpcmpeqw.

define <8 x i32> @cmp_ne_zextload_from_legal_op(<8 x i16>* %x, <8 x i16>* %y) {
; SSE-LABEL: cmp_ne_zextload_from_legal_op:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rdi), %xmm0
; SSE-NEXT:    pcmpeqw (%rsi), %xmm0
; SSE-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE-NEXT:    pxor %xmm0, %xmm1
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT:    psrad $16, %xmm0
; SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE-NEXT:    psrad $16, %xmm1
; SSE-NEXT:    retq
;
; AVX2-LABEL: cmp_ne_zextload_from_legal_op:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-NEXT:    vpcmpeqw (%rsi), %xmm0, %xmm0
; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: cmp_ne_zextload_from_legal_op:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vpcmpeqw (%rsi), %xmm0, %xmm0
; AVX512-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
; AVX512-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX512-NEXT:    retq
  %loadx = load <8 x i16>, <8 x i16>* %x
  %loady = load <8 x i16>, <8 x i16>* %y
  %icmp = icmp ne <8 x i16> %loadx, %loady
  %sext = sext <8 x i1> %icmp to <8 x i32>
  ret <8 x i32> %sext
}

; Both uses of the load can be absorbed by the zext-load, so we eliminate the explicit casts.

define <8 x i32> @PR50055(<8 x i8>* %src, <8 x i32>* %dst) nounwind {
; SSE-LABEL: PR50055:
; SSE:       # %bb.0:
; SSE-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
; SSE-NEXT:    pxor %xmm3, %xmm3
; SSE-NEXT:    movdqa %xmm2, %xmm1
; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
; SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; SSE-NEXT:    pcmpeqb %xmm3, %xmm2
; SSE-NEXT:    pcmpeqd %xmm3, %xmm3
; SSE-NEXT:    pxor %xmm2, %xmm3
; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; SSE-NEXT:    psrad $24, %xmm3
; SSE-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
; SSE-NEXT:    psrad $24, %xmm2
; SSE-NEXT:    movdqa %xmm2, 16(%rsi)
; SSE-NEXT:    movdqa %xmm3, (%rsi)
; SSE-NEXT:    retq
;
; AVX-LABEL: PR50055:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpcmpgtd %ymm1, %ymm0, %ymm1
; AVX-NEXT:    vmovdqa %ymm1, (%rsi)
; AVX-NEXT:    retq
  %load = load <8 x i8>, <8 x i8>* %src
  %zext = zext <8 x i8> %load to <8 x i32>
  %icmp = icmp ne <8 x i8> %load, zeroinitializer
  %sext = sext <8 x i1> %icmp to <8 x i32>
  store <8 x i32> %sext, <8 x i32>* %dst
  ret <8 x i32> %zext
}

; negative test - extra uses must be absorbable by a zext-load.

define <8 x i16> @multi_use_narrower_size(<8 x i8>* %src, <8 x i32>* %dst) nounwind {
; SSE-LABEL: multi_use_narrower_size:
; SSE:       # %bb.0:
; SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; SSE-NEXT:    pxor %xmm2, %xmm2
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE-NEXT:    pcmpeqb %xmm2, %xmm1
; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE-NEXT:    psrad $24, %xmm2
; SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE-NEXT:    psrad $24, %xmm1
; SSE-NEXT:    movdqa %xmm1, 16(%rsi)
; SSE-NEXT:    movdqa %xmm2, (%rsi)
; SSE-NEXT:    retq
;
; AVX-LABEL: multi_use_narrower_size:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vpcmpeqb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpmovsxbd %xmm1, %ymm1
; AVX-NEXT:    vmovdqa %ymm1, (%rsi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
  %load = load <8 x i8>, <8 x i8>* %src
  %zext = zext <8 x i8> %load to <8 x i16>
  %icmp = icmp eq <8 x i8> %load, zeroinitializer
  %sext = sext <8 x i1> %icmp to <8 x i32>
  store <8 x i32> %sext, <8 x i32>* %dst
  ret <8 x i16> %zext
}

; negative test - extra uses must be absorbable by a zext-load.

define <8 x i32> @multi_use_wider_size(<8 x i8>* %src, <8 x i16>* %dst) nounwind {
; SSE-LABEL: multi_use_wider_size:
; SSE:       # %bb.0:
; SSE-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
; SSE-NEXT:    pxor %xmm3, %xmm3
; SSE-NEXT:    movdqa %xmm2, %xmm1
; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
; SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; SSE-NEXT:    pcmpeqb %xmm3, %xmm2
; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    psraw $8, %xmm2
; SSE-NEXT:    movdqa %xmm2, (%rsi)
; SSE-NEXT:    retq
;
; AVX-LABEL: multi_use_wider_size:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vpcmpeqb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpmovsxbw %xmm1, %xmm1
; AVX-NEXT:    vmovdqa %xmm1, (%rsi)
; AVX-NEXT:    retq
  %load = load <8 x i8>, <8 x i8>* %src
  %zext = zext <8 x i8> %load to <8 x i32>
  %icmp = icmp eq <8 x i8> %load, zeroinitializer
  %sext = sext <8 x i1> %icmp to <8 x i16>
  store <8 x i16> %sext, <8 x i16>* %dst
  ret <8 x i32> %zext
}

define <4 x i64> @PR50055_signed(<2 x i64>* %src, <4 x i64>* %dst) {
; SSE-LABEL: PR50055_signed:
; SSE:       # %bb.0:
; SSE-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT:    psrad $24, %xmm0
; SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE-NEXT:    psrad $24, %xmm1
; SSE-NEXT:    pxor %xmm3, %xmm3
; SSE-NEXT:    pcmpgtb %xmm3, %xmm2
; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; SSE-NEXT:    psrad $24, %xmm3
; SSE-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
; SSE-NEXT:    psrad $24, %xmm2
; SSE-NEXT:    movdqa %xmm2, 16(%rsi)
; SSE-NEXT:    movdqa %xmm3, (%rsi)
; SSE-NEXT:    retq
;
; AVX-LABEL: PR50055_signed:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovsxbd (%rdi), %ymm0
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpcmpgtd %ymm1, %ymm0, %ymm1
; AVX-NEXT:    vmovdqa %ymm1, (%rsi)
; AVX-NEXT:    retq
  %t0 = bitcast <2 x i64>* %src to <8 x i8>*
  %t1 = load <8 x i8>, <8 x i8>* %t0, align 1
  %conv = sext <8 x i8> %t1 to <8 x i32>
  %t2 = bitcast <8 x i32> %conv to <4 x i64>
  %cmp = icmp sgt <8 x i8> %t1, zeroinitializer
  %sext = sext <8 x i1> %cmp to <8 x i32>
  %t3 = bitcast <4 x i64>* %dst to <8 x i32>*
  store <8 x i32> %sext, <8 x i32>* %t3, align 32
  ret <4 x i64> %t2
}