; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+sse,-sse2 < %s | FileCheck %s --check-prefix=CHECK-SSE1
; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+sse,+sse2 < %s | FileCheck %s --check-prefix=CHECK-SSE2
; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+xop < %s | FileCheck %s --check-prefix=CHECK-XOP

; ============================================================================ ;
; Various cases with %x and/or %y being a constant
; ============================================================================ ;
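;
; The "out" functions below build the masked merge directly as
; (%x & %mask) | (%y & ~%mask); the "in" functions use the equivalent form
; ((%x ^ %y) & %mask) ^ %y. The *_invmask variants swap the roles of %mask
; and its inversion.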

define <4 x i32> @out_constant_varx_mone(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
; CHECK-SSE1-LABEL: out_constant_varx_mone:
; CHECK-SSE1:       # %bb.0:
; CHECK-SSE1-NEXT:    movq %rdi, %rax
; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm0
; CHECK-SSE1-NEXT:    movaps {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN]
; CHECK-SSE1-NEXT:    xorps %xmm0, %xmm1
; CHECK-SSE1-NEXT:    andps (%rsi), %xmm0
; CHECK-SSE1-NEXT:    orps %xmm1, %xmm0
; CHECK-SSE1-NEXT:    movaps %xmm0, (%rdi)
; CHECK-SSE1-NEXT:    retq
;
; CHECK-SSE2-LABEL: out_constant_varx_mone:
; CHECK-SSE2:       # %bb.0:
; CHECK-SSE2-NEXT:    movdqa (%rdx), %xmm0
; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
; CHECK-SSE2-NEXT:    pxor %xmm0, %xmm1
; CHECK-SSE2-NEXT:    pand (%rdi), %xmm0
; CHECK-SSE2-NEXT:    por %xmm1, %xmm0
; CHECK-SSE2-NEXT:    retq
;
; CHECK-XOP-LABEL: out_constant_varx_mone:
; CHECK-XOP:       # %bb.0:
; CHECK-XOP-NEXT:    vmovdqa (%rdx), %xmm0
; CHECK-XOP-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; CHECK-XOP-NEXT:    vpxor %xmm1, %xmm0, %xmm1
; CHECK-XOP-NEXT:    vpand (%rdi), %xmm0, %xmm0
; CHECK-XOP-NEXT:    vpor %xmm1, %xmm0, %xmm0
; CHECK-XOP-NEXT:    retq
  %x = load <4 x i32>, <4 x i32> *%px, align 16
  %y = load <4 x i32>, <4 x i32> *%py, align 16
  %mask = load <4 x i32>, <4 x i32> *%pmask, align 16
  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
  %mx = and <4 x i32> %mask, %x
  %my = and <4 x i32> %notmask, <i32 -1, i32 -1, i32 -1, i32 -1>
  %r = or <4 x i32> %mx, %my
  ret <4 x i32> %r
}

define <4 x i32> @in_constant_varx_mone(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
; CHECK-SSE1-LABEL: in_constant_varx_mone:
; CHECK-SSE1:       # %bb.0:
; CHECK-SSE1-NEXT:    movq %rdi, %rax
; CHECK-SSE1-NEXT:    movaps (%rsi), %xmm0
; CHECK-SSE1-NEXT:    andnps (%rcx), %xmm0
; CHECK-SSE1-NEXT:    xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE1-NEXT:    movaps %xmm0, (%rdi)
; CHECK-SSE1-NEXT:    retq
;
; CHECK-SSE2-LABEL: in_constant_varx_mone:
; CHECK-SSE2:       # %bb.0:
; CHECK-SSE2-NEXT:    movdqa (%rdi), %xmm0
; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
; CHECK-SSE2-NEXT:    pandn (%rdx), %xmm0
; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm0
; CHECK-SSE2-NEXT:    retq
;
; CHECK-XOP-LABEL: in_constant_varx_mone:
; CHECK-XOP:       # %bb.0:
; CHECK-XOP-NEXT:    vmovdqa (%rdi), %xmm0
; CHECK-XOP-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; CHECK-XOP-NEXT:    vpandn (%rdx), %xmm0, %xmm0
; CHECK-XOP-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; CHECK-XOP-NEXT:    retq
  %x = load <4 x i32>, <4 x i32> *%px, align 16
  %y = load <4 x i32>, <4 x i32> *%py, align 16
  %mask = load <4 x i32>, <4 x i32> *%pmask, align 16
  %n0 = xor <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1> ; %x
  %n1 = and <4 x i32> %n0, %mask
  %r = xor <4 x i32> %n1, <i32 -1, i32 -1, i32 -1, i32 -1>
  ret <4 x i32> %r
}

; This is not a canonical form. Testing for completeness only.
define <4 x i32> @out_constant_varx_mone_invmask(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
; CHECK-SSE1-LABEL: out_constant_varx_mone_invmask:
; CHECK-SSE1:       # %bb.0:
; CHECK-SSE1-NEXT:    movq %rdi, %rax
; CHECK-SSE1-NEXT:    movaps (%rsi), %xmm0
; CHECK-SSE1-NEXT:    orps (%rcx), %xmm0
; CHECK-SSE1-NEXT:    movaps %xmm0, (%rdi)
; CHECK-SSE1-NEXT:    retq
;
; CHECK-SSE2-LABEL: out_constant_varx_mone_invmask:
; CHECK-SSE2:       # %bb.0:
; CHECK-SSE2-NEXT:    movaps (%rdi), %xmm0
; CHECK-SSE2-NEXT:    orps (%rdx), %xmm0
; CHECK-SSE2-NEXT:    retq
;
; CHECK-XOP-LABEL: out_constant_varx_mone_invmask:
; CHECK-XOP:       # %bb.0:
; CHECK-XOP-NEXT:    vmovaps (%rdi), %xmm0
; CHECK-XOP-NEXT:    vorps (%rdx), %xmm0, %xmm0
; CHECK-XOP-NEXT:    retq
  %x = load <4 x i32>, <4 x i32> *%px, align 16
  %y = load <4 x i32>, <4 x i32> *%py, align 16
  %mask = load <4 x i32>, <4 x i32> *%pmask, align 16
  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
  %mx = and <4 x i32> %notmask, %x
  %my = and <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
  %r = or <4 x i32> %mx, %my
  ret <4 x i32> %r
}

; This is not a canonical form. Testing for completeness only.
define <4 x i32> @in_constant_varx_mone_invmask(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
; CHECK-SSE1-LABEL: in_constant_varx_mone_invmask:
; CHECK-SSE1:       # %bb.0:
; CHECK-SSE1-NEXT:    movq %rdi, %rax
; CHECK-SSE1-NEXT:    movaps (%rsi), %xmm0
; CHECK-SSE1-NEXT:    movaps {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN]
; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm2
; CHECK-SSE1-NEXT:    xorps %xmm1, %xmm2
; CHECK-SSE1-NEXT:    andnps %xmm2, %xmm0
; CHECK-SSE1-NEXT:    xorps %xmm1, %xmm0
; CHECK-SSE1-NEXT:    movaps %xmm0, (%rdi)
; CHECK-SSE1-NEXT:    retq
;
; CHECK-SSE2-LABEL: in_constant_varx_mone_invmask:
; CHECK-SSE2:       # %bb.0:
; CHECK-SSE2-NEXT:    movdqa (%rdi), %xmm0
; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
; CHECK-SSE2-NEXT:    movdqa (%rdx), %xmm2
; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm2
; CHECK-SSE2-NEXT:    pandn %xmm2, %xmm0
; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm0
; CHECK-SSE2-NEXT:    retq
;
; CHECK-XOP-LABEL: in_constant_varx_mone_invmask:
; CHECK-XOP:       # %bb.0:
; CHECK-XOP-NEXT:    vmovdqa (%rdi), %xmm0
; CHECK-XOP-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; CHECK-XOP-NEXT:    vpxor (%rdx), %xmm1, %xmm2
; CHECK-XOP-NEXT:    vpandn %xmm2, %xmm0, %xmm0
; CHECK-XOP-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; CHECK-XOP-NEXT:    retq
  %x = load <4 x i32>, <4 x i32> *%px, align 16
  %y = load <4 x i32>, <4 x i32> *%py, align 16
  %mask = load <4 x i32>, <4 x i32> *%pmask, align 16
  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
  %n0 = xor <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1> ; %x
  %n1 = and <4 x i32> %n0, %notmask
  %r = xor <4 x i32> %n1, <i32 -1, i32 -1, i32 -1, i32 -1>
  ret <4 x i32> %r
}

define <4 x i32> @out_constant_varx_42(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
; CHECK-SSE1-LABEL: out_constant_varx_42:
; CHECK-SSE1:       # %bb.0:
; CHECK-SSE1-NEXT:    movq %rdi, %rax
; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm0
; CHECK-SSE1-NEXT:    movaps (%rsi), %xmm1
; CHECK-SSE1-NEXT:    andps %xmm0, %xmm1
; CHECK-SSE1-NEXT:    andnps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE1-NEXT:    orps %xmm1, %xmm0
; CHECK-SSE1-NEXT:    movaps %xmm0, (%rdi)
; CHECK-SSE1-NEXT:    retq
;
; CHECK-SSE2-LABEL: out_constant_varx_42:
; CHECK-SSE2:       # %bb.0:
; CHECK-SSE2-NEXT:    movaps (%rdx), %xmm0
; CHECK-SSE2-NEXT:    movaps (%rdi), %xmm1
; CHECK-SSE2-NEXT:    andps %xmm0, %xmm1
; CHECK-SSE2-NEXT:    andnps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT:    orps %xmm1, %xmm0
; CHECK-SSE2-NEXT:    retq
;
; CHECK-XOP-LABEL: out_constant_varx_42:
; CHECK-XOP:       # %bb.0:
; CHECK-XOP-NEXT:    vmovdqa (%rdi), %xmm0
; CHECK-XOP-NEXT:    vmovdqa (%rdx), %xmm1
; CHECK-XOP-NEXT:    vpcmov %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-XOP-NEXT:    retq
  %x = load <4 x i32>, <4 x i32> *%px, align 16
  %y = load <4 x i32>, <4 x i32> *%py, align 16
  %mask = load <4 x i32>, <4 x i32> *%pmask, align 16
  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
  %mx = and <4 x i32> %mask, %x
  %my = and <4 x i32> %notmask, <i32 42, i32 42, i32 42, i32 42>
  %r = or <4 x i32> %mx, %my
  ret <4 x i32> %r
}

define <4 x i32> @in_constant_varx_42(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
; CHECK-SSE1-LABEL: in_constant_varx_42:
; CHECK-SSE1:       # %bb.0:
; CHECK-SSE1-NEXT:    movq %rdi, %rax
; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm0
; CHECK-SSE1-NEXT:    movaps (%rsi), %xmm1
; CHECK-SSE1-NEXT:    andps %xmm0, %xmm1
; CHECK-SSE1-NEXT:    andnps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE1-NEXT:    orps %xmm1, %xmm0
; CHECK-SSE1-NEXT:    movaps %xmm0, (%rdi)
; CHECK-SSE1-NEXT:    retq
;
; CHECK-SSE2-LABEL: in_constant_varx_42:
; CHECK-SSE2:       # %bb.0:
; CHECK-SSE2-NEXT:    movaps (%rdx), %xmm0
; CHECK-SSE2-NEXT:    movaps (%rdi), %xmm1
; CHECK-SSE2-NEXT:    andps %xmm0, %xmm1
; CHECK-SSE2-NEXT:    andnps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT:    orps %xmm1, %xmm0
; CHECK-SSE2-NEXT:    retq
;
; CHECK-XOP-LABEL: in_constant_varx_42:
; CHECK-XOP:       # %bb.0:
; CHECK-XOP-NEXT:    vmovdqa (%rdi), %xmm0
; CHECK-XOP-NEXT:    vmovdqa (%rdx), %xmm1
; CHECK-XOP-NEXT:    vpcmov %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-XOP-NEXT:    retq
  %x = load <4 x i32>, <4 x i32> *%px, align 16
  %y = load <4 x i32>, <4 x i32> *%py, align 16
  %mask = load <4 x i32>, <4 x i32> *%pmask, align 16
  %n0 = xor <4 x i32> %x, <i32 42, i32 42, i32 42, i32 42> ; %x
  %n1 = and <4 x i32> %n0, %mask
  %r = xor <4 x i32> %n1, <i32 42, i32 42, i32 42, i32 42>
  ret <4 x i32> %r
}

; This is not a canonical form. Testing for completeness only.
define <4 x i32> @out_constant_varx_42_invmask(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
; CHECK-SSE1-LABEL: out_constant_varx_42_invmask:
; CHECK-SSE1:       # %bb.0:
; CHECK-SSE1-NEXT:    movq %rdi, %rax
; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm0
; CHECK-SSE1-NEXT:    movaps %xmm0, %xmm1
; CHECK-SSE1-NEXT:    andnps (%rsi), %xmm1
; CHECK-SSE1-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE1-NEXT:    orps %xmm1, %xmm0
; CHECK-SSE1-NEXT:    movaps %xmm0, (%rdi)
; CHECK-SSE1-NEXT:    retq
;
; CHECK-SSE2-LABEL: out_constant_varx_42_invmask:
; CHECK-SSE2:       # %bb.0:
; CHECK-SSE2-NEXT:    movaps (%rdx), %xmm0
; CHECK-SSE2-NEXT:    movaps %xmm0, %xmm1
; CHECK-SSE2-NEXT:    andnps (%rdi), %xmm1
; CHECK-SSE2-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT:    orps %xmm1, %xmm0
; CHECK-SSE2-NEXT:    retq
;
; CHECK-XOP-LABEL: out_constant_varx_42_invmask:
; CHECK-XOP:       # %bb.0:
; CHECK-XOP-NEXT:    vmovdqa (%rdx), %xmm0
; CHECK-XOP-NEXT:    vmovdqa {{.*#+}} xmm1 = [42,42,42,42]
; CHECK-XOP-NEXT:    vpcmov %xmm0, (%rdi), %xmm1, %xmm0
; CHECK-XOP-NEXT:    retq
  %x = load <4 x i32>, <4 x i32> *%px, align 16
  %y = load <4 x i32>, <4 x i32> *%py, align 16
  %mask = load <4 x i32>, <4 x i32> *%pmask, align 16
  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
  %mx = and <4 x i32> %notmask, %x
  %my = and <4 x i32> %mask, <i32 42, i32 42, i32 42, i32 42>
  %r = or <4 x i32> %mx, %my
  ret <4 x i32> %r
}

; This is not a canonical form. Testing for completeness only.
define <4 x i32> @in_constant_varx_42_invmask(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
; CHECK-SSE1-LABEL: in_constant_varx_42_invmask:
; CHECK-SSE1:       # %bb.0:
; CHECK-SSE1-NEXT:    movq %rdi, %rax
; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm0
; CHECK-SSE1-NEXT:    movaps %xmm0, %xmm1
; CHECK-SSE1-NEXT:    andnps (%rsi), %xmm1
; CHECK-SSE1-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE1-NEXT:    orps %xmm1, %xmm0
; CHECK-SSE1-NEXT:    movaps %xmm0, (%rdi)
; CHECK-SSE1-NEXT:    retq
;
; CHECK-SSE2-LABEL: in_constant_varx_42_invmask:
; CHECK-SSE2:       # %bb.0:
; CHECK-SSE2-NEXT:    movaps (%rdx), %xmm0
; CHECK-SSE2-NEXT:    movaps %xmm0, %xmm1
; CHECK-SSE2-NEXT:    andnps (%rdi), %xmm1
; CHECK-SSE2-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT:    orps %xmm1, %xmm0
; CHECK-SSE2-NEXT:    retq
;
; CHECK-XOP-LABEL: in_constant_varx_42_invmask:
; CHECK-XOP:       # %bb.0:
; CHECK-XOP-NEXT:    vmovdqa (%rdx), %xmm0
; CHECK-XOP-NEXT:    vmovdqa {{.*#+}} xmm1 = [42,42,42,42]
; CHECK-XOP-NEXT:    vpcmov %xmm0, (%rdi), %xmm1, %xmm0
; CHECK-XOP-NEXT:    retq
  %x = load <4 x i32>, <4 x i32> *%px, align 16
  %y = load <4 x i32>, <4 x i32> *%py, align 16
  %mask = load <4 x i32>, <4 x i32> *%pmask, align 16
  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
  %n0 = xor <4 x i32> %x, <i32 42, i32 42, i32 42, i32 42> ; %x
  %n1 = and <4 x i32> %n0, %notmask
  %r = xor <4 x i32> %n1, <i32 42, i32 42, i32 42, i32 42>
  ret <4 x i32> %r
}

define <4 x i32> @out_constant_mone_vary(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
; CHECK-SSE1-LABEL: out_constant_mone_vary:
; CHECK-SSE1:       # %bb.0:
; CHECK-SSE1-NEXT:    movq %rdi, %rax
; CHECK-SSE1-NEXT:    movaps (%rdx), %xmm0
; CHECK-SSE1-NEXT:    orps (%rcx), %xmm0
; CHECK-SSE1-NEXT:    movaps %xmm0, (%rdi)
; CHECK-SSE1-NEXT:    retq
;
; CHECK-SSE2-LABEL: out_constant_mone_vary:
; CHECK-SSE2:       # %bb.0:
; CHECK-SSE2-NEXT:    movaps (%rsi), %xmm0
; CHECK-SSE2-NEXT:    orps (%rdx), %xmm0
; CHECK-SSE2-NEXT:    retq
;
; CHECK-XOP-LABEL: out_constant_mone_vary:
; CHECK-XOP:       # %bb.0:
; CHECK-XOP-NEXT:    vmovaps (%rsi), %xmm0
; CHECK-XOP-NEXT:    vorps (%rdx), %xmm0, %xmm0
; CHECK-XOP-NEXT:    retq
  %x = load <4 x i32>, <4 x i32> *%px, align 16
  %y = load <4 x i32>, <4 x i32> *%py, align 16
  %mask = load <4 x i32>, <4 x i32> *%pmask, align 16
  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
  %mx = and <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
  %my = and <4 x i32> %notmask, %y
  %r = or <4 x i32> %mx, %my
  ret <4 x i32> %r
}

define <4 x i32> @in_constant_mone_vary(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
; CHECK-SSE1-LABEL: in_constant_mone_vary:
; CHECK-SSE1:       # %bb.0:
; CHECK-SSE1-NEXT:    movq %rdi, %rax
; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm0
; CHECK-SSE1-NEXT:    movaps %xmm0, %xmm1
; CHECK-SSE1-NEXT:    andnps (%rdx), %xmm1
; CHECK-SSE1-NEXT:    orps %xmm0, %xmm1
; CHECK-SSE1-NEXT:    movaps %xmm1, (%rdi)
; CHECK-SSE1-NEXT:    retq
;
; CHECK-SSE2-LABEL: in_constant_mone_vary:
; CHECK-SSE2:       # %bb.0:
; CHECK-SSE2-NEXT:    movaps (%rsi), %xmm0
; CHECK-SSE2-NEXT:    orps (%rdx), %xmm0
; CHECK-SSE2-NEXT:    retq
;
; CHECK-XOP-LABEL: in_constant_mone_vary:
; CHECK-XOP:       # %bb.0:
; CHECK-XOP-NEXT:    vmovaps (%rsi), %xmm0
; CHECK-XOP-NEXT:    vorps (%rdx), %xmm0, %xmm0
; CHECK-XOP-NEXT:    retq
  %x = load <4 x i32>, <4 x i32> *%px, align 16
  %y = load <4 x i32>, <4 x i32> *%py, align 16
  %mask = load <4 x i32>, <4 x i32> *%pmask, align 16
  %n0 = xor <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, %y ; %x
  %n1 = and <4 x i32> %n0, %mask
  %r = xor <4 x i32> %n1, %y
  ret <4 x i32> %r
}

; This is not a canonical form. Testing for completeness only.
define <4 x i32> @out_constant_mone_vary_invmask(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
; CHECK-SSE1-LABEL: out_constant_mone_vary_invmask:
; CHECK-SSE1:       # %bb.0:
; CHECK-SSE1-NEXT:    movq %rdi, %rax
; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm0
; CHECK-SSE1-NEXT:    movaps {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN]
; CHECK-SSE1-NEXT:    xorps %xmm0, %xmm1
; CHECK-SSE1-NEXT:    andps (%rdx), %xmm0
; CHECK-SSE1-NEXT:    orps %xmm1, %xmm0
; CHECK-SSE1-NEXT:    movaps %xmm0, (%rdi)
; CHECK-SSE1-NEXT:    retq
;
; CHECK-SSE2-LABEL: out_constant_mone_vary_invmask:
; CHECK-SSE2:       # %bb.0:
; CHECK-SSE2-NEXT:    movdqa (%rdx), %xmm0
; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
; CHECK-SSE2-NEXT:    pxor %xmm0, %xmm1
; CHECK-SSE2-NEXT:    pand (%rsi), %xmm0
; CHECK-SSE2-NEXT:    por %xmm1, %xmm0
; CHECK-SSE2-NEXT:    retq
;
; CHECK-XOP-LABEL: out_constant_mone_vary_invmask:
; CHECK-XOP:       # %bb.0:
; CHECK-XOP-NEXT:    vmovdqa (%rdx), %xmm0
; CHECK-XOP-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; CHECK-XOP-NEXT:    vpxor %xmm1, %xmm0, %xmm1
; CHECK-XOP-NEXT:    vpand (%rsi), %xmm0, %xmm0
; CHECK-XOP-NEXT:    vpor %xmm0, %xmm1, %xmm0
; CHECK-XOP-NEXT:    retq
  %x = load <4 x i32>, <4 x i32> *%px, align 16
  %y = load <4 x i32>, <4 x i32> *%py, align 16
  %mask = load <4 x i32>, <4 x i32> *%pmask, align 16
  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
  %mx = and <4 x i32> %notmask, <i32 -1, i32 -1, i32 -1, i32 -1>
  %my = and <4 x i32> %mask, %y
  %r = or <4 x i32> %mx, %my
  ret <4 x i32> %r
}

; This is not a canonical form. Testing for completeness only.
define <4 x i32> @in_constant_mone_vary_invmask(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
; CHECK-SSE1-LABEL: in_constant_mone_vary_invmask:
; CHECK-SSE1:       # %bb.0:
; CHECK-SSE1-NEXT:    movq %rdi, %rax
; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm0
; CHECK-SSE1-NEXT:    movaps {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN]
; CHECK-SSE1-NEXT:    xorps %xmm0, %xmm1
; CHECK-SSE1-NEXT:    andps (%rdx), %xmm0
; CHECK-SSE1-NEXT:    orps %xmm1, %xmm0
; CHECK-SSE1-NEXT:    movaps %xmm0, (%rdi)
; CHECK-SSE1-NEXT:    retq
;
; CHECK-SSE2-LABEL: in_constant_mone_vary_invmask:
; CHECK-SSE2:       # %bb.0:
; CHECK-SSE2-NEXT:    movdqa (%rdx), %xmm0
; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
; CHECK-SSE2-NEXT:    pxor %xmm0, %xmm1
; CHECK-SSE2-NEXT:    pand (%rsi), %xmm0
; CHECK-SSE2-NEXT:    por %xmm1, %xmm0
; CHECK-SSE2-NEXT:    retq
;
; CHECK-XOP-LABEL: in_constant_mone_vary_invmask:
; CHECK-XOP:       # %bb.0:
; CHECK-XOP-NEXT:    vmovdqa (%rdx), %xmm0
; CHECK-XOP-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; CHECK-XOP-NEXT:    vpxor %xmm1, %xmm0, %xmm1
; CHECK-XOP-NEXT:    vpand (%rsi), %xmm0, %xmm0
; CHECK-XOP-NEXT:    vpor %xmm0, %xmm1, %xmm0
; CHECK-XOP-NEXT:    retq
  %x = load <4 x i32>, <4 x i32> *%px, align 16
  %y = load <4 x i32>, <4 x i32> *%py, align 16
  %mask = load <4 x i32>, <4 x i32> *%pmask, align 16
  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
  %n0 = xor <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, %y ; %x
  %n1 = and <4 x i32> %n0, %notmask
  %r = xor <4 x i32> %n1, %y
  ret <4 x i32> %r
}

define <4 x i32> @out_constant_42_vary(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
; CHECK-SSE1-LABEL: out_constant_42_vary:
; CHECK-SSE1:       # %bb.0:
; CHECK-SSE1-NEXT:    movq %rdi, %rax
; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm0
; CHECK-SSE1-NEXT:    movaps {{.*#+}} xmm1 = [5.88545355E-44,5.88545355E-44,5.88545355E-44,5.88545355E-44]
; CHECK-SSE1-NEXT:    andps %xmm0, %xmm1
; CHECK-SSE1-NEXT:    andnps (%rdx), %xmm0
; CHECK-SSE1-NEXT:    orps %xmm1, %xmm0
; CHECK-SSE1-NEXT:    movaps %xmm0, (%rdi)
; CHECK-SSE1-NEXT:    retq
;
; CHECK-SSE2-LABEL: out_constant_42_vary:
; CHECK-SSE2:       # %bb.0:
; CHECK-SSE2-NEXT:    movaps (%rdx), %xmm0
; CHECK-SSE2-NEXT:    movaps {{.*#+}} xmm1 = [42,42,42,42]
; CHECK-SSE2-NEXT:    andps %xmm0, %xmm1
; CHECK-SSE2-NEXT:    andnps (%rsi), %xmm0
; CHECK-SSE2-NEXT:    orps %xmm1, %xmm0
; CHECK-SSE2-NEXT:    retq
;
; CHECK-XOP-LABEL: out_constant_42_vary:
; CHECK-XOP:       # %bb.0:
; CHECK-XOP-NEXT:    vmovdqa (%rdx), %xmm0
; CHECK-XOP-NEXT:    vmovdqa {{.*#+}} xmm1 = [42,42,42,42]
; CHECK-XOP-NEXT:    vpcmov %xmm0, (%rsi), %xmm1, %xmm0
; CHECK-XOP-NEXT:    retq
  %x = load <4 x i32>, <4 x i32> *%px, align 16
  %y = load <4 x i32>, <4 x i32> *%py, align 16
  %mask = load <4 x i32>, <4 x i32> *%pmask, align 16
  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
  %mx = and <4 x i32> %mask, <i32 42, i32 42, i32 42, i32 42>
  %my = and <4 x i32> %notmask, %y
  %r = or <4 x i32> %mx, %my
  ret <4 x i32> %r
}

define <4 x i32> @in_constant_42_vary(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
; CHECK-SSE1-LABEL: in_constant_42_vary:
; CHECK-SSE1:       # %bb.0:
; CHECK-SSE1-NEXT:    movq %rdi, %rax
; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm0
; CHECK-SSE1-NEXT:    movaps %xmm0, %xmm1
; CHECK-SSE1-NEXT:    andnps (%rdx), %xmm1
; CHECK-SSE1-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE1-NEXT:    orps %xmm1, %xmm0
; CHECK-SSE1-NEXT:    movaps %xmm0, (%rdi)
; CHECK-SSE1-NEXT:    retq
;
; CHECK-SSE2-LABEL: in_constant_42_vary:
; CHECK-SSE2:       # %bb.0:
; CHECK-SSE2-NEXT:    movaps (%rdx), %xmm0
; CHECK-SSE2-NEXT:    movaps %xmm0, %xmm1
; CHECK-SSE2-NEXT:    andnps (%rsi), %xmm1
; CHECK-SSE2-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT:    orps %xmm1, %xmm0
; CHECK-SSE2-NEXT:    retq
;
; CHECK-XOP-LABEL: in_constant_42_vary:
; CHECK-XOP:       # %bb.0:
; CHECK-XOP-NEXT:    vmovdqa (%rdx), %xmm0
; CHECK-XOP-NEXT:    vmovdqa {{.*#+}} xmm1 = [42,42,42,42]
; CHECK-XOP-NEXT:    vpcmov %xmm0, (%rsi), %xmm1, %xmm0
; CHECK-XOP-NEXT:    retq
  %x = load <4 x i32>, <4 x i32> *%px, align 16
  %y = load <4 x i32>, <4 x i32> *%py, align 16
  %mask = load <4 x i32>, <4 x i32> *%pmask, align 16
  %n0 = xor <4 x i32> <i32 42, i32 42, i32 42, i32 42>, %y ; %x
  %n1 = and <4 x i32> %n0, %mask
  %r = xor <4 x i32> %n1, %y
  ret <4 x i32> %r
}

; This is not a canonical form. Testing for completeness only.
define <4 x i32> @out_constant_42_vary_invmask(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
; CHECK-SSE1-LABEL: out_constant_42_vary_invmask:
; CHECK-SSE1:       # %bb.0:
; CHECK-SSE1-NEXT:    movq %rdi, %rax
; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm0
; CHECK-SSE1-NEXT:    movaps %xmm0, %xmm1
; CHECK-SSE1-NEXT:    andnps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; CHECK-SSE1-NEXT:    andps (%rdx), %xmm0
; CHECK-SSE1-NEXT:    orps %xmm1, %xmm0
; CHECK-SSE1-NEXT:    movaps %xmm0, (%rdi)
; CHECK-SSE1-NEXT:    retq
;
; CHECK-SSE2-LABEL: out_constant_42_vary_invmask:
; CHECK-SSE2:       # %bb.0:
; CHECK-SSE2-NEXT:    movaps (%rdx), %xmm0
; CHECK-SSE2-NEXT:    movaps %xmm0, %xmm1
; CHECK-SSE2-NEXT:    andnps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; CHECK-SSE2-NEXT:    andps (%rsi), %xmm0
; CHECK-SSE2-NEXT:    orps %xmm1, %xmm0
; CHECK-SSE2-NEXT:    retq
;
; CHECK-XOP-LABEL: out_constant_42_vary_invmask:
; CHECK-XOP:       # %bb.0:
; CHECK-XOP-NEXT:    vmovdqa (%rsi), %xmm0
; CHECK-XOP-NEXT:    vmovdqa (%rdx), %xmm1
; CHECK-XOP-NEXT:    vpcmov %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-XOP-NEXT:    retq
  %x = load <4 x i32>, <4 x i32> *%px, align 16
  %y = load <4 x i32>, <4 x i32> *%py, align 16
  %mask = load <4 x i32>, <4 x i32> *%pmask, align 16
  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
  %mx = and <4 x i32> %notmask, <i32 42, i32 42, i32 42, i32 42>
  %my = and <4 x i32> %mask, %y
  %r = or <4 x i32> %mx, %my
  ret <4 x i32> %r
}

; This is not a canonical form. Testing for completeness only.
define <4 x i32> @in_constant_42_vary_invmask(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
; CHECK-SSE1-LABEL: in_constant_42_vary_invmask:
; CHECK-SSE1:       # %bb.0:
; CHECK-SSE1-NEXT:    movq %rdi, %rax
; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm0
; CHECK-SSE1-NEXT:    movaps (%rdx), %xmm1
; CHECK-SSE1-NEXT:    andps %xmm0, %xmm1
; CHECK-SSE1-NEXT:    andnps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE1-NEXT:    orps %xmm1, %xmm0
; CHECK-SSE1-NEXT:    movaps %xmm0, (%rdi)
; CHECK-SSE1-NEXT:    retq
;
; CHECK-SSE2-LABEL: in_constant_42_vary_invmask:
; CHECK-SSE2:       # %bb.0:
; CHECK-SSE2-NEXT:    movaps (%rdx), %xmm0
; CHECK-SSE2-NEXT:    movaps (%rsi), %xmm1
; CHECK-SSE2-NEXT:    andps %xmm0, %xmm1
; CHECK-SSE2-NEXT:    andnps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT:    orps %xmm1, %xmm0
; CHECK-SSE2-NEXT:    retq
;
; CHECK-XOP-LABEL: in_constant_42_vary_invmask:
; CHECK-XOP:       # %bb.0:
; CHECK-XOP-NEXT:    vmovdqa (%rsi), %xmm0
; CHECK-XOP-NEXT:    vmovdqa (%rdx), %xmm1
; CHECK-XOP-NEXT:    vpcmov %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-XOP-NEXT:    retq
  %x = load <4 x i32>, <4 x i32> *%px, align 16
  %y = load <4 x i32>, <4 x i32> *%py, align 16
  %mask = load <4 x i32>, <4 x i32> *%pmask, align 16
  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
  %n0 = xor <4 x i32> <i32 42, i32 42, i32 42, i32 42>, %y ; %x
  %n1 = and <4 x i32> %n0, %notmask
  %r = xor <4 x i32> %n1, %y
  ret <4 x i32> %r
}