; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2   | FileCheck %s --check-prefixes=CHECK,AVX

; fold (add x, 0) -> x
define <4 x i32> @combine_vec_add_to_zero(<4 x i32> %a) {
; CHECK-LABEL: combine_vec_add_to_zero:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = add <4 x i32> %a, zeroinitializer
  ret <4 x i32> %1
}

; fold ((c1-A)+c2) -> (c1+c2)-A
define <4 x i32> @combine_vec_add_constant_sub(<4 x i32> %a) {
; SSE-LABEL: combine_vec_add_constant_sub:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [0,2,4,6]
; SSE-NEXT:    psubd %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_constant_sub:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,2,4,6]
; AVX-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = sub <4 x i32> <i32 0, i32 1, i32 2, i32 3>, %a
  %2 = add <4 x i32> <i32 0, i32 1, i32 2, i32 3>, %1
  ret <4 x i32> %2
}

; fold ((0-A) + B) -> B-A
define <4 x i32> @combine_vec_add_neg0(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_vec_add_neg0:
; SSE:       # %bb.0:
; SSE-NEXT:    psubd %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_neg0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = sub <4 x i32> zeroinitializer, %a
  %2 = add <4 x i32> %1, %b
  ret <4 x i32> %2
}

; fold (A + (0-B)) -> A-B
define <4 x i32> @combine_vec_add_neg1(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_vec_add_neg1:
; SSE:       # %bb.0:
; SSE-NEXT:    psubd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_neg1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = sub <4 x i32> zeroinitializer, %b
  %2 = add <4 x i32> %a, %1
  ret <4 x i32> %2
}

; fold (A+(B-A)) -> B
define <4 x i32> @combine_vec_add_sub0(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_vec_add_sub0:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_sub0:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = sub <4 x i32> %b, %a
  %2 = add <4 x i32> %a, %1
  ret <4 x i32> %2
}

; fold ((B-A)+A) -> B
define <4 x i32> @combine_vec_add_sub1(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_vec_add_sub1:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_sub1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = sub <4 x i32> %b, %a
  %2 = add <4 x i32> %1, %a
  ret <4 x i32> %2
}

; fold ((A-B)+(C-A)) -> (C-B)
define <4 x i32> @combine_vec_add_sub_sub0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_vec_add_sub_sub0:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm2, %xmm0
; SSE-NEXT:    psubd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_sub_sub0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsubd %xmm1, %xmm2, %xmm0
; AVX-NEXT:    retq
  %1 = sub <4 x i32> %a, %b
  %2 = sub <4 x i32> %c, %a
  %3 = add <4 x i32> %1, %2
  ret <4 x i32> %3
}

; fold ((A-B)+(B-C)) -> (A-C)
define <4 x i32> @combine_vec_add_sub_sub1(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_vec_add_sub_sub1:
; SSE:       # %bb.0:
; SSE-NEXT:    psubd %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_sub_sub1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsubd %xmm2, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = sub <4 x i32> %a, %b
  %2 = sub <4 x i32> %b, %c
  %3 = add <4 x i32> %1, %2
  ret <4 x i32> %3
}

; fold (A+(B-(A+C))) to (B-C)
define <4 x i32> @combine_vec_add_sub_add0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_vec_add_sub_add0:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    psubd %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_sub_add0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsubd %xmm2, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = add <4 x i32> %a, %c
  %2 = sub <4 x i32> %b, %1
  %3 = add <4 x i32> %a, %2
  ret <4 x i32> %3
}

; fold (A+(B-(C+A))) to (B-C)
define <4 x i32> @combine_vec_add_sub_add1(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_vec_add_sub_add1:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    psubd %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_sub_add1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsubd %xmm2, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = add <4 x i32> %c, %a
  %2 = sub <4 x i32> %b, %1
  %3 = add <4 x i32> %a, %2
  ret <4 x i32> %3
}

; fold (A+((B-A)+C)) to (B+C)
define <4 x i32> @combine_vec_add_sub_add2(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_vec_add_sub_add2:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    paddd %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_sub_add2:
; AVX:       # %bb.0:
; AVX-NEXT:    vpaddd %xmm2, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = sub <4 x i32> %b, %a
  %2 = add <4 x i32> %1, %c
  %3 = add <4 x i32> %a, %2
  ret <4 x i32> %3
}

; fold (A+((B-A)-C)) to (B-C)
define <4 x i32> @combine_vec_add_sub_add3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_vec_add_sub_add3:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    psubd %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_sub_add3:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsubd %xmm2, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = sub <4 x i32> %b, %a
  %2 = sub <4 x i32> %1, %c
  %3 = add <4 x i32> %a, %2
  ret <4 x i32> %3
}

; fold (A-B)+(C-D) to (A+C)-(B+D) when A or C is constant
define <4 x i32> @combine_vec_add_sub_sub(<4 x i32> %a, <4 x i32> %b, <4 x i32> %d) {
; SSE-LABEL: combine_vec_add_sub_sub:
; SSE:       # %bb.0:
; SSE-NEXT:    paddd %xmm2, %xmm1
; SSE-NEXT:    psubd %xmm1, %xmm0
; SSE-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_sub_sub:
; AVX:       # %bb.0:
; AVX-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = sub <4 x i32> %a, %b
  %2 = sub <4 x i32> <i32 0, i32 1, i32 2, i32 3>, %d
  %3 = add <4 x i32> %1, %2
  ret <4 x i32> %3
}

; fold (a+b) -> (a|b) iff a and b share no bits.
define <4 x i32> @combine_vec_add_uniquebits(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_vec_add_uniquebits:
; SSE:       # %bb.0:
; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    orps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_uniquebits:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastss {{.*#+}} xmm2 = [61680,61680,61680,61680]
; AVX-NEXT:    vandps %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vbroadcastss {{.*#+}} xmm2 = [3855,3855,3855,3855]
; AVX-NEXT:    vandps %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vorps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = and <4 x i32> %a, <i32 61680, i32 61680, i32 61680, i32 61680>
  %2 = and <4 x i32> %b, <i32 3855, i32 3855, i32 3855, i32 3855>
  %3 = add <4 x i32> %1, %2
  ret <4 x i32> %3
}

; fold (add x, shl(0 - y, n)) -> sub(x, shl(y, n))
define <4 x i32> @combine_vec_add_shl_neg0(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_add_shl_neg0:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $5, %xmm1
; SSE-NEXT:    psubd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_shl_neg0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpslld $5, %xmm1, %xmm1
; AVX-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = sub <4 x i32> zeroinitializer, %y
  %2 = shl <4 x i32> %1, <i32 5, i32 5, i32 5, i32 5>
  %3 = add <4 x i32> %x, %2
  ret <4 x i32> %3
}

; fold (add shl(0 - y, n), x) -> sub(x, shl(y, n))
define <4 x i32> @combine_vec_add_shl_neg1(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_add_shl_neg1:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $5, %xmm1
; SSE-NEXT:    psubd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_shl_neg1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpslld $5, %xmm1, %xmm1
; AVX-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = sub <4 x i32> zeroinitializer, %y
  %2 = shl <4 x i32> %1, <i32 5, i32 5, i32 5, i32 5>
  %3 = add <4 x i32> %2, %x
  ret <4 x i32> %3
}

; (add z, (and (sbbl x, x), 1)) -> (sub z, (sbbl x, x))
; and similar xforms where the inner op is either ~0 or 0.
define <4 x i32> @combine_vec_add_and_compare(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
; SSE-LABEL: combine_vec_add_and_compare:
; SSE:       # %bb.0:
; SSE-NEXT:    pcmpeqd %xmm2, %xmm1
; SSE-NEXT:    psubd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_and_compare:
; AVX:       # %bb.0:
; AVX-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = icmp eq <4 x i32> %a1, %a2
  %2 = sext <4 x i1> %1 to <4 x i32>
  %3 = and <4 x i32> %2, <i32 1, i32 1, i32 1, i32 1>
  %4 = add <4 x i32> %a0, %3
  ret <4 x i32> %4
}

; add (sext i1), X -> sub X, (zext i1)
define <4 x i32> @combine_vec_add_sext(<4 x i1> %a0, <4 x i32> %a1) {
; SSE-LABEL: combine_vec_add_sext:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $31, %xmm0
; SSE-NEXT:    psrad $31, %xmm0
; SSE-NEXT:    paddd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_sext:
; AVX:       # %bb.0:
; AVX-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX-NEXT:    vpsrad $31, %xmm0, %xmm0
; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = sext <4 x i1> %a0 to <4 x i32>
  %2 = add <4 x i32> %1, %a1
  ret <4 x i32> %2
}

; add (sext i1), X -> sub X, (zext i1)
define <4 x i32> @combine_vec_add_sextinreg(<4 x i32> %a0, <4 x i32> %a1) {
; SSE-LABEL: combine_vec_add_sextinreg:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $31, %xmm0
; SSE-NEXT:    psrad $31, %xmm0
; SSE-NEXT:    paddd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_sextinreg:
; AVX:       # %bb.0:
; AVX-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX-NEXT:    vpsrad $31, %xmm0, %xmm0
; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shl <4 x i32> %a0, <i32 31, i32 31, i32 31, i32 31>
  %2 = ashr <4 x i32> %1, <i32 31, i32 31, i32 31, i32 31>
  %3 = add <4 x i32> %2, %a1
  ret <4 x i32> %3
}

; (add (add (xor a, -1), b), 1) -> (sub b, a)
define i32 @combine_add_add_not(i32 %a, i32 %b) {
; CHECK-LABEL: combine_add_add_not:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl %esi, %eax
; CHECK-NEXT:    subl %edi, %eax
; CHECK-NEXT:    retq
  %nota = xor i32 %a, -1
  %add = add i32 %nota, %b
  %r = add i32 %add, 1
  ret i32 %r
}

define <4 x i32> @combine_vec_add_add_not(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_vec_add_add_not:
; SSE:       # %bb.0:
; SSE-NEXT:    psubd %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_add_not:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %nota = xor <4 x i32> %a, <i32 -1, i32 -1, i32 -1, i32 -1>
  %add = add <4 x i32> %nota, %b
  %r = add <4 x i32> %add, <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i32> %r
}

declare {i32, i1} @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)

define i1 @sadd_add(i32 %a, i32 %b, i32* %p) {
; CHECK-LABEL: sadd_add:
; CHECK:       # %bb.0:
; CHECK-NEXT:    notl %edi
; CHECK-NEXT:    addl %esi, %edi
; CHECK-NEXT:    seto %al
; CHECK-NEXT:    incl %edi
; CHECK-NEXT:    movl %edi, (%rdx)
; CHECK-NEXT:    retq
  %nota = xor i32 %a, -1
  %a0 = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %nota, i32 %b)
  %e0 = extractvalue {i32, i1} %a0, 0
  %e1 = extractvalue {i32, i1} %a0, 1
  %res = add i32 %e0, 1
  store i32 %res, i32* %p
  ret i1 %e1
}

declare {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a, i8 %b)

define i1 @uadd_add(i8 %a, i8 %b, i8* %p) {
; CHECK-LABEL: uadd_add:
; CHECK:       # %bb.0:
; CHECK-NEXT:    notb %dil
; CHECK-NEXT:    addb %sil, %dil
; CHECK-NEXT:    setb %al
; CHECK-NEXT:    incb %dil
; CHECK-NEXT:    movb %dil, (%rdx)
; CHECK-NEXT:    retq
  %nota = xor i8 %a, -1
  %a0 = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %nota, i8 %b)
  %e0 = extractvalue {i8, i1} %a0, 0
  %e1 = extractvalue {i8, i1} %a0, 1
  %res = add i8 %e0, 1
  store i8 %res, i8* %p
  ret i1 %e1
}

; This would crash because we tried to transform an add-with-overflow
; based on the wrong result value.

define i1 @PR51238(i1 %b, i8 %x, i8 %y, i8 %z) {
; CHECK-LABEL: PR51238:
; CHECK:       # %bb.0:
; CHECK-NEXT:    notb %cl
; CHECK-NEXT:    addb %dl, %cl
; CHECK-NEXT:    movb $1, %al
; CHECK-NEXT:    adcb $0, %al
; CHECK-NEXT:    retq
   %ny = xor i8 %y, -1
   %nz = xor i8 %z, -1
   %minxz = select i1 %b, i8 %x, i8 %nz
   %cmpyz = icmp ult i8 %ny, %nz
   %r = add i1 %cmpyz, true
   ret i1 %r
}