; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX512
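
; Exercises codegen for the llvm.sadd.with.overflow.* intrinsics (signed add
; with overflow) over a range of vector types: legal widths, non-power-of-two
; widths, and illegal element types, across SSE2/SSSE3/SSE4.1/AVX/AVX2/AVX512.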

declare {<1 x i32>, <1 x i1>} @llvm.sadd.with.overflow.v1i32(<1 x i32>, <1 x i32>)
declare {<2 x i32>, <2 x i1>} @llvm.sadd.with.overflow.v2i32(<2 x i32>, <2 x i32>)
declare {<3 x i32>, <3 x i1>} @llvm.sadd.with.overflow.v3i32(<3 x i32>, <3 x i32>)
declare {<4 x i32>, <4 x i1>} @llvm.sadd.with.overflow.v4i32(<4 x i32>, <4 x i32>)
declare {<6 x i32>, <6 x i1>} @llvm.sadd.with.overflow.v6i32(<6 x i32>, <6 x i32>)
declare {<8 x i32>, <8 x i1>} @llvm.sadd.with.overflow.v8i32(<8 x i32>, <8 x i32>)
declare {<16 x i32>, <16 x i1>} @llvm.sadd.with.overflow.v16i32(<16 x i32>, <16 x i32>)

declare {<16 x i8>, <16 x i1>} @llvm.sadd.with.overflow.v16i8(<16 x i8>, <16 x i8>)
declare {<8 x i16>, <8 x i1>} @llvm.sadd.with.overflow.v8i16(<8 x i16>, <8 x i16>)
declare {<2 x i64>, <2 x i1>} @llvm.sadd.with.overflow.v2i64(<2 x i64>, <2 x i64>)

declare {<4 x i24>, <4 x i1>} @llvm.sadd.with.overflow.v4i24(<4 x i24>, <4 x i24>)
declare {<4 x i1>, <4 x i1>} @llvm.sadd.with.overflow.v4i1(<4 x i1>, <4 x i1>)
declare {<2 x i128>, <2 x i1>} @llvm.sadd.with.overflow.v2i128(<2 x i128>, <2 x i128>)

define <1 x i32> @saddo_v1i32(<1 x i32> %a0, <1 x i32> %a1, <1 x i32>* %p2) nounwind {
; CHECK-LABEL: saddo_v1i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    addl %esi, %edi
; CHECK-NEXT:    seto %al
; CHECK-NEXT:    negl %eax
; CHECK-NEXT:    movl %edi, (%rdx)
; CHECK-NEXT:    retq
  %t = call {<1 x i32>, <1 x i1>} @llvm.sadd.with.overflow.v1i32(<1 x i32> %a0, <1 x i32> %a1)
  %val = extractvalue {<1 x i32>, <1 x i1>} %t, 0
  %obit = extractvalue {<1 x i32>, <1 x i1>} %t, 1
  %res = sext <1 x i1> %obit to <1 x i32>
  store <1 x i32> %val, <1 x i32>* %p2
  ret <1 x i32> %res
}

define <2 x i32> @saddo_v2i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32>* %p2) nounwind {
; SSE-LABEL: saddo_v2i32:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm2, %xmm2
; SSE-NEXT:    pcmpgtd %xmm1, %xmm2
; SSE-NEXT:    paddd %xmm0, %xmm1
; SSE-NEXT:    pcmpgtd %xmm1, %xmm0
; SSE-NEXT:    pxor %xmm2, %xmm0
; SSE-NEXT:    movq %xmm1, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: saddo_v2i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm2
; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpxor %xmm0, %xmm2, %xmm0
; AVX-NEXT:    vmovq %xmm1, (%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: saddo_v2i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT:    vpcmpgtd %xmm1, %xmm2, %k0
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vpcmpgtd %xmm1, %xmm0, %k1
; AVX512-NEXT:    kxorw %k1, %k0, %k1
; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT:    vmovq %xmm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<2 x i32>, <2 x i1>} @llvm.sadd.with.overflow.v2i32(<2 x i32> %a0, <2 x i32> %a1)
  %val = extractvalue {<2 x i32>, <2 x i1>} %t, 0
  %obit = extractvalue {<2 x i32>, <2 x i1>} %t, 1
  %res = sext <2 x i1> %obit to <2 x i32>
  store <2 x i32> %val, <2 x i32>* %p2
  ret <2 x i32> %res
}

define <3 x i32> @saddo_v3i32(<3 x i32> %a0, <3 x i32> %a1, <3 x i32>* %p2) nounwind {
; SSE2-LABEL: saddo_v3i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
; SSE2-NEXT:    paddd %xmm0, %xmm1
; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
; SSE2-NEXT:    pxor %xmm2, %xmm0
; SSE2-NEXT:    movq %xmm1, (%rdi)
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT:    movd %xmm1, 8(%rdi)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: saddo_v3i32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pxor %xmm2, %xmm2
; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
; SSSE3-NEXT:    paddd %xmm0, %xmm1
; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
; SSSE3-NEXT:    pxor %xmm2, %xmm0
; SSSE3-NEXT:    movq %xmm1, (%rdi)
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSSE3-NEXT:    movd %xmm1, 8(%rdi)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: saddo_v3i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pxor %xmm2, %xmm2
; SSE41-NEXT:    pcmpgtd %xmm1, %xmm2
; SSE41-NEXT:    paddd %xmm0, %xmm1
; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
; SSE41-NEXT:    pxor %xmm2, %xmm0
; SSE41-NEXT:    pextrd $2, %xmm1, 8(%rdi)
; SSE41-NEXT:    movq %xmm1, (%rdi)
; SSE41-NEXT:    retq
;
; AVX-LABEL: saddo_v3i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm2
; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpxor %xmm0, %xmm2, %xmm0
; AVX-NEXT:    vpextrd $2, %xmm1, 8(%rdi)
; AVX-NEXT:    vmovq %xmm1, (%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: saddo_v3i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT:    vpcmpgtd %xmm1, %xmm2, %k0
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vpcmpgtd %xmm1, %xmm0, %k1
; AVX512-NEXT:    kxorw %k1, %k0, %k1
; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT:    vpextrd $2, %xmm1, 8(%rdi)
; AVX512-NEXT:    vmovq %xmm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<3 x i32>, <3 x i1>} @llvm.sadd.with.overflow.v3i32(<3 x i32> %a0, <3 x i32> %a1)
  %val = extractvalue {<3 x i32>, <3 x i1>} %t, 0
  %obit = extractvalue {<3 x i32>, <3 x i1>} %t, 1
  %res = sext <3 x i1> %obit to <3 x i32>
  store <3 x i32> %val, <3 x i32>* %p2
  ret <3 x i32> %res
}

define <4 x i32> @saddo_v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i32>* %p2) nounwind {
; SSE-LABEL: saddo_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm2, %xmm2
; SSE-NEXT:    pcmpgtd %xmm1, %xmm2
; SSE-NEXT:    paddd %xmm0, %xmm1
; SSE-NEXT:    pcmpgtd %xmm1, %xmm0
; SSE-NEXT:    pxor %xmm2, %xmm0
; SSE-NEXT:    movdqa %xmm1, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: saddo_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm2
; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpxor %xmm0, %xmm2, %xmm0
; AVX-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: saddo_v4i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT:    vpcmpgtd %xmm1, %xmm2, %k0
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vpcmpgtd %xmm1, %xmm0, %k1
; AVX512-NEXT:    kxorw %k1, %k0, %k1
; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<4 x i32>, <4 x i1>} @llvm.sadd.with.overflow.v4i32(<4 x i32> %a0, <4 x i32> %a1)
  %val = extractvalue {<4 x i32>, <4 x i1>} %t, 0
  %obit = extractvalue {<4 x i32>, <4 x i1>} %t, 1
  %res = sext <4 x i1> %obit to <4 x i32>
  store <4 x i32> %val, <4 x i32>* %p2
  ret <4 x i32> %res
}

define <6 x i32> @saddo_v6i32(<6 x i32> %a0, <6 x i32> %a1, <6 x i32>* %p2) nounwind {
; SSE2-LABEL: saddo_v6i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movq %rdi, %rax
; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSE2-NEXT:    movd %r8d, %xmm0
; SSE2-NEXT:    movd %ecx, %xmm1
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    movd %edx, %xmm0
; SSE2-NEXT:    movd %esi, %xmm3
; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    movd %r9d, %xmm0
; SSE2-NEXT:    movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
; SSE2-NEXT:    movdqa %xmm3, %xmm4
; SSE2-NEXT:    paddd %xmm2, %xmm4
; SSE2-NEXT:    pcmpgtd %xmm4, %xmm3
; SSE2-NEXT:    pxor %xmm5, %xmm5
; SSE2-NEXT:    pxor %xmm6, %xmm6
; SSE2-NEXT:    pcmpgtd %xmm2, %xmm6
; SSE2-NEXT:    pxor %xmm3, %xmm6
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    paddd %xmm1, %xmm2
; SSE2-NEXT:    pcmpgtd %xmm2, %xmm0
; SSE2-NEXT:    pcmpgtd %xmm1, %xmm5
; SSE2-NEXT:    pxor %xmm0, %xmm5
; SSE2-NEXT:    movq %xmm2, 16(%rcx)
; SSE2-NEXT:    movdqa %xmm4, (%rcx)
; SSE2-NEXT:    movq %xmm5, 16(%rdi)
; SSE2-NEXT:    movdqa %xmm6, (%rdi)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: saddo_v6i32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movq %rdi, %rax
; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSSE3-NEXT:    movd %r8d, %xmm0
; SSSE3-NEXT:    movd %ecx, %xmm1
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSSE3-NEXT:    movd %edx, %xmm0
; SSSE3-NEXT:    movd %esi, %xmm3
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSSE3-NEXT:    movd %r9d, %xmm0
; SSSE3-NEXT:    movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
; SSSE3-NEXT:    movdqa %xmm3, %xmm4
; SSSE3-NEXT:    paddd %xmm2, %xmm4
; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm3
; SSSE3-NEXT:    pxor %xmm5, %xmm5
; SSSE3-NEXT:    pxor %xmm6, %xmm6
; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm6
; SSSE3-NEXT:    pxor %xmm3, %xmm6
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    paddd %xmm1, %xmm2
; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm0
; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm5
; SSSE3-NEXT:    pxor %xmm0, %xmm5
; SSSE3-NEXT:    movq %xmm2, 16(%rcx)
; SSSE3-NEXT:    movdqa %xmm4, (%rcx)
; SSSE3-NEXT:    movq %xmm5, 16(%rdi)
; SSSE3-NEXT:    movdqa %xmm6, (%rdi)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: saddo_v6i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movq %rdi, %rax
; SSE41-NEXT:    movd %esi, %xmm1
; SSE41-NEXT:    pinsrd $1, %edx, %xmm1
; SSE41-NEXT:    pinsrd $2, %ecx, %xmm1
; SSE41-NEXT:    pinsrd $3, %r8d, %xmm1
; SSE41-NEXT:    movd %r9d, %xmm0
; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm0
; SSE41-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm2
; SSE41-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm3
; SSE41-NEXT:    pinsrd $2, {{[0-9]+}}(%rsp), %xmm3
; SSE41-NEXT:    pinsrd $3, {{[0-9]+}}(%rsp), %xmm3
; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
; SSE41-NEXT:    movdqa %xmm1, %xmm4
; SSE41-NEXT:    paddd %xmm3, %xmm4
; SSE41-NEXT:    pcmpgtd %xmm4, %xmm1
; SSE41-NEXT:    pxor %xmm5, %xmm5
; SSE41-NEXT:    pxor %xmm6, %xmm6
; SSE41-NEXT:    pcmpgtd %xmm3, %xmm6
; SSE41-NEXT:    pxor %xmm1, %xmm6
; SSE41-NEXT:    pcmpgtd %xmm2, %xmm5
; SSE41-NEXT:    paddd %xmm0, %xmm2
; SSE41-NEXT:    pcmpgtd %xmm2, %xmm0
; SSE41-NEXT:    pxor %xmm5, %xmm0
; SSE41-NEXT:    movq %xmm2, 16(%rcx)
; SSE41-NEXT:    movdqa %xmm4, (%rcx)
; SSE41-NEXT:    movq %xmm0, 16(%rdi)
; SSE41-NEXT:    movdqa %xmm6, (%rdi)
; SSE41-NEXT:    retq
;
; AVX1-LABEL: saddo_v6i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm3, %xmm4
; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm3, %xmm3
; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT:    vpaddd %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm4, %xmm4
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
; AVX1-NEXT:    vxorps %ymm0, %ymm3, %ymm0
; AVX1-NEXT:    vmovq %xmm2, 16(%rdi)
; AVX1-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: saddo_v6i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm2, %ymm2
; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpxor %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vmovq %xmm2, 16(%rdi)
; AVX2-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: saddo_v6i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT:    vpcmpgtd %ymm1, %ymm2, %k0
; AVX512-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
; AVX512-NEXT:    vpcmpgtd %ymm1, %ymm0, %k1
; AVX512-NEXT:    kxorw %k1, %k0, %k1
; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512-NEXT:    vmovq %xmm2, 16(%rdi)
; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<6 x i32>, <6 x i1>} @llvm.sadd.with.overflow.v6i32(<6 x i32> %a0, <6 x i32> %a1)
  %val = extractvalue {<6 x i32>, <6 x i1>} %t, 0
  %obit = extractvalue {<6 x i32>, <6 x i1>} %t, 1
  %res = sext <6 x i1> %obit to <6 x i32>
  store <6 x i32> %val, <6 x i32>* %p2
  ret <6 x i32> %res
}

define <8 x i32> @saddo_v8i32(<8 x i32> %a0, <8 x i32> %a1, <8 x i32>* %p2) nounwind {
; SSE-LABEL: saddo_v8i32:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm4, %xmm4
; SSE-NEXT:    pxor %xmm5, %xmm5
; SSE-NEXT:    pcmpgtd %xmm2, %xmm5
; SSE-NEXT:    paddd %xmm0, %xmm2
; SSE-NEXT:    pcmpgtd %xmm2, %xmm0
; SSE-NEXT:    pxor %xmm5, %xmm0
; SSE-NEXT:    pcmpgtd %xmm3, %xmm4
; SSE-NEXT:    paddd %xmm1, %xmm3
; SSE-NEXT:    pcmpgtd %xmm3, %xmm1
; SSE-NEXT:    pxor %xmm4, %xmm1
; SSE-NEXT:    movdqa %xmm3, 16(%rdi)
; SSE-NEXT:    movdqa %xmm2, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: saddo_v8i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm3, %xmm4
; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm3, %xmm3
; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT:    vpaddd %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm4, %xmm4
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
; AVX1-NEXT:    vxorps %ymm0, %ymm3, %ymm0
; AVX1-NEXT:    vmovdqa %xmm2, 16(%rdi)
; AVX1-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: saddo_v8i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm2, %ymm2
; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpxor %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    vmovdqa %ymm1, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: saddo_v8i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT:    vpcmpgtd %ymm1, %ymm2, %k0
; AVX512-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
; AVX512-NEXT:    vpcmpgtd %ymm1, %ymm0, %k1
; AVX512-NEXT:    kxorw %k1, %k0, %k1
; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512-NEXT:    vmovdqa %ymm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<8 x i32>, <8 x i1>} @llvm.sadd.with.overflow.v8i32(<8 x i32> %a0, <8 x i32> %a1)
  %val = extractvalue {<8 x i32>, <8 x i1>} %t, 0
  %obit = extractvalue {<8 x i32>, <8 x i1>} %t, 1
  %res = sext <8 x i1> %obit to <8 x i32>
  store <8 x i32> %val, <8 x i32>* %p2
  ret <8 x i32> %res
}

define <16 x i32> @saddo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2) nounwind {
; SSE-LABEL: saddo_v16i32:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm8, %xmm8
; SSE-NEXT:    pxor %xmm9, %xmm9
; SSE-NEXT:    pcmpgtd %xmm4, %xmm9
; SSE-NEXT:    paddd %xmm0, %xmm4
; SSE-NEXT:    pcmpgtd %xmm4, %xmm0
; SSE-NEXT:    pxor %xmm9, %xmm0
; SSE-NEXT:    pxor %xmm9, %xmm9
; SSE-NEXT:    pcmpgtd %xmm5, %xmm9
; SSE-NEXT:    paddd %xmm1, %xmm5
; SSE-NEXT:    pcmpgtd %xmm5, %xmm1
; SSE-NEXT:    pxor %xmm9, %xmm1
; SSE-NEXT:    pxor %xmm9, %xmm9
; SSE-NEXT:    pcmpgtd %xmm6, %xmm9
; SSE-NEXT:    paddd %xmm2, %xmm6
; SSE-NEXT:    pcmpgtd %xmm6, %xmm2
; SSE-NEXT:    pxor %xmm9, %xmm2
; SSE-NEXT:    pcmpgtd %xmm7, %xmm8
; SSE-NEXT:    paddd %xmm3, %xmm7
; SSE-NEXT:    pcmpgtd %xmm7, %xmm3
; SSE-NEXT:    pxor %xmm8, %xmm3
; SSE-NEXT:    movdqa %xmm7, 48(%rdi)
; SSE-NEXT:    movdqa %xmm6, 32(%rdi)
; SSE-NEXT:    movdqa %xmm5, 16(%rdi)
; SSE-NEXT:    movdqa %xmm4, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: saddo_v16i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
; AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
; AVX1-NEXT:    vpcmpgtd %xmm4, %xmm5, %xmm6
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm7
; AVX1-NEXT:    vpaddd %xmm4, %xmm7, %xmm8
; AVX1-NEXT:    vpcmpgtd %xmm8, %xmm7, %xmm7
; AVX1-NEXT:    vpxor %xmm7, %xmm6, %xmm6
; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm5, %xmm7
; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm3
; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpxor %xmm1, %xmm7, %xmm1
; AVX1-NEXT:    vpackssdw %xmm6, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm6
; AVX1-NEXT:    vpcmpgtd %xmm6, %xmm5, %xmm7
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT:    vpaddd %xmm6, %xmm4, %xmm6
; AVX1-NEXT:    vpcmpgtd %xmm6, %xmm4, %xmm4
; AVX1-NEXT:    vpxor %xmm4, %xmm7, %xmm4
; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm5, %xmm5
; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm2
; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm0, %xmm5, %xmm0
; AVX1-NEXT:    vpackssdw %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm4
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm4, %ymm0
; AVX1-NEXT:    vpacksswb %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm4
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm4, %ymm1
; AVX1-NEXT:    vmovdqa %xmm8, 48(%rdi)
; AVX1-NEXT:    vmovdqa %xmm3, 32(%rdi)
; AVX1-NEXT:    vmovdqa %xmm6, 16(%rdi)
; AVX1-NEXT:    vmovdqa %xmm2, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: saddo_v16i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
; AVX2-NEXT:    vpcmpgtd %ymm3, %ymm4, %ymm5
; AVX2-NEXT:    vpaddd %ymm3, %ymm1, %ymm3
; AVX2-NEXT:    vpcmpgtd %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpxor %ymm1, %ymm5, %ymm1
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm5
; AVX2-NEXT:    vpackssdw %xmm5, %xmm1, %xmm1
; AVX2-NEXT:    vpcmpgtd %ymm2, %ymm4, %ymm4
; AVX2-NEXT:    vpaddd %ymm2, %ymm0, %ymm2
; AVX2-NEXT:    vpcmpgtd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpxor %ymm0, %ymm4, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm4
; AVX2-NEXT:    vpackssdw %xmm4, %xmm0, %xmm0
; AVX2-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    vpmovsxbd %xmm0, %ymm0
; AVX2-NEXT:    vpacksswb %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpmovsxbd %xmm1, %ymm1
; AVX2-NEXT:    vmovdqa %ymm3, 32(%rdi)
; AVX2-NEXT:    vmovdqa %ymm2, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: saddo_v16i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT:    vpcmpgtd %zmm1, %zmm2, %k0
; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
; AVX512-NEXT:    vpcmpgtd %zmm1, %zmm0, %k1
; AVX512-NEXT:    kxorw %k1, %k0, %k1
; AVX512-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT:    vmovdqa64 %zmm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<16 x i32>, <16 x i1>} @llvm.sadd.with.overflow.v16i32(<16 x i32> %a0, <16 x i32> %a1)
  %val = extractvalue {<16 x i32>, <16 x i1>} %t, 0
  %obit = extractvalue {<16 x i32>, <16 x i1>} %t, 1
  %res = sext <16 x i1> %obit to <16 x i32>
  store <16 x i32> %val, <16 x i32>* %p2
  ret <16 x i32> %res
}

define <16 x i32> @saddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nounwind {
; SSE2-LABEL: saddo_v16i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    paddsb %xmm1, %xmm2
; SSE2-NEXT:    paddb %xmm1, %xmm0
; SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
; SSE2-NEXT:    pcmpeqd %xmm3, %xmm3
; SSE2-NEXT:    pxor %xmm2, %xmm3
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
; SSE2-NEXT:    psrad $24, %xmm4
; SSE2-NEXT:    movdqa %xmm3, %xmm1
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pslld $31, %xmm1
; SSE2-NEXT:    psrad $31, %xmm1
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    movdqa %xmm3, %xmm2
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
; SSE2-NEXT:    pslld $31, %xmm2
; SSE2-NEXT:    psrad $31, %xmm2
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pslld $31, %xmm3
; SSE2-NEXT:    psrad $31, %xmm3
; SSE2-NEXT:    movdqa %xmm0, (%rdi)
; SSE2-NEXT:    movdqa %xmm4, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: saddo_v16i8:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    paddsb %xmm1, %xmm2
; SSSE3-NEXT:    paddb %xmm1, %xmm0
; SSSE3-NEXT:    pcmpeqb %xmm0, %xmm2
; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm3
; SSSE3-NEXT:    pxor %xmm2, %xmm3
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
; SSSE3-NEXT:    psrad $24, %xmm4
; SSSE3-NEXT:    movdqa %xmm3, %xmm1
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    pslld $31, %xmm1
; SSSE3-NEXT:    psrad $31, %xmm1
; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSSE3-NEXT:    movdqa %xmm3, %xmm2
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
; SSSE3-NEXT:    pslld $31, %xmm2
; SSSE3-NEXT:    psrad $31, %xmm2
; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    pslld $31, %xmm3
; SSSE3-NEXT:    psrad $31, %xmm3
; SSSE3-NEXT:    movdqa %xmm0, (%rdi)
; SSSE3-NEXT:    movdqa %xmm4, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: saddo_v16i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    paddsb %xmm1, %xmm2
; SSE41-NEXT:    paddb %xmm1, %xmm0
; SSE41-NEXT:    pcmpeqb %xmm0, %xmm2
; SSE41-NEXT:    pcmpeqd %xmm3, %xmm3
; SSE41-NEXT:    pxor %xmm2, %xmm3
; SSE41-NEXT:    pmovsxbd %xmm3, %xmm4
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE41-NEXT:    pslld $31, %xmm1
; SSE41-NEXT:    psrad $31, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; SSE41-NEXT:    pslld $31, %xmm2
; SSE41-NEXT:    psrad $31, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3]
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
; SSE41-NEXT:    pslld $31, %xmm3
; SSE41-NEXT:    psrad $31, %xmm3
; SSE41-NEXT:    movdqa %xmm0, (%rdi)
; SSE41-NEXT:    movdqa %xmm4, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: saddo_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpaddsb %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vpcmpeqb %xmm2, %xmm3, %xmm0
; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
; AVX1-NEXT:    vpmovsxbd %xmm2, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; AVX1-NEXT:    vpmovsxbd %xmm2, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT:    vmovdqa %xmm3, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: saddo_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpaddsb %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm3
; AVX2-NEXT:    vpcmpeqb %xmm2, %xmm3, %xmm0
; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    vpmovsxbd %xmm1, %ymm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX2-NEXT:    vpmovsxbd %xmm1, %ymm1
; AVX2-NEXT:    vmovdqa %xmm3, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: saddo_v16i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpaddsb %xmm1, %xmm0, %xmm2
; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vpcmpneqb %xmm2, %xmm1, %k1
; AVX512-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<16 x i8>, <16 x i1>} @llvm.sadd.with.overflow.v16i8(<16 x i8> %a0, <16 x i8> %a1)
  %val = extractvalue {<16 x i8>, <16 x i1>} %t, 0
  %obit = extractvalue {<16 x i8>, <16 x i1>} %t, 1
  %res = sext <16 x i1> %obit to <16 x i32>
  store <16 x i8> %val, <16 x i8>* %p2
  ret <16 x i32> %res
}

define <8 x i32> @saddo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) nounwind {
; SSE2-LABEL: saddo_v8i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    paddsw %xmm1, %xmm2
; SSE2-NEXT:    paddw %xmm1, %xmm0
; SSE2-NEXT:    pcmpeqw %xmm0, %xmm2
; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE2-NEXT:    pxor %xmm2, %xmm1
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE2-NEXT:    psrad $16, %xmm2
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pslld $31, %xmm1
; SSE2-NEXT:    psrad $31, %xmm1
; SSE2-NEXT:    movdqa %xmm0, (%rdi)
; SSE2-NEXT:    movdqa %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: saddo_v8i16:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    paddsw %xmm1, %xmm2
; SSSE3-NEXT:    paddw %xmm1, %xmm0
; SSSE3-NEXT:    pcmpeqw %xmm0, %xmm2
; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1
; SSSE3-NEXT:    pxor %xmm2, %xmm1
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSSE3-NEXT:    psrad $16, %xmm2
; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    pslld $31, %xmm1
; SSSE3-NEXT:    psrad $31, %xmm1
; SSSE3-NEXT:    movdqa %xmm0, (%rdi)
; SSSE3-NEXT:    movdqa %xmm2, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: saddo_v8i16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    paddsw %xmm1, %xmm2
; SSE41-NEXT:    paddw %xmm1, %xmm0
; SSE41-NEXT:    pcmpeqw %xmm0, %xmm2
; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE41-NEXT:    pxor %xmm2, %xmm1
; SSE41-NEXT:    pmovsxwd %xmm1, %xmm2
; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE41-NEXT:    pslld $31, %xmm1
; SSE41-NEXT:    psrad $31, %xmm1
; SSE41-NEXT:    movdqa %xmm0, (%rdi)
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: saddo_v8i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpaddsw %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpcmpeqw %xmm2, %xmm1, %xmm0
; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: saddo_v8i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpaddsw %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    vpcmpeqw %xmm2, %xmm1, %xmm0
; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: saddo_v8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpaddsw %xmm1, %xmm0, %xmm2
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vpcmpneqw %xmm2, %xmm1, %k1
; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<8 x i16>, <8 x i1>} @llvm.sadd.with.overflow.v8i16(<8 x i16> %a0, <8 x i16> %a1)
  %val = extractvalue {<8 x i16>, <8 x i1>} %t, 0
  %obit = extractvalue {<8 x i16>, <8 x i1>} %t, 1
  %res = sext <8 x i1> %obit to <8 x i32>
  store <8 x i16> %val, <8 x i16>* %p2
  ret <8 x i32> %res
}

define <2 x i32> @saddo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) nounwind {
; SSE-LABEL: saddo_v2i64:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    pxor %xmm2, %xmm3
; SSE-NEXT:    paddq %xmm1, %xmm0
; SSE-NEXT:    pxor %xmm0, %xmm2
; SSE-NEXT:    movdqa %xmm3, %xmm4
; SSE-NEXT:    pcmpgtd %xmm2, %xmm4
; SSE-NEXT:    pcmpeqd %xmm3, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE-NEXT:    pand %xmm4, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
; SSE-NEXT:    por %xmm2, %xmm3
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE-NEXT:    pxor %xmm2, %xmm2
; SSE-NEXT:    pcmpgtd %xmm1, %xmm2
; SSE-NEXT:    pxor %xmm3, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; SSE-NEXT:    movdqa %xmm0, (%rdi)
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: saddo_v2i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vpcmpgtq %xmm1, %xmm2, %xmm2
; AVX-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpxor %xmm0, %xmm2, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: saddo_v2i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT:    vpcmpgtq %xmm1, %xmm2, %k0
; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vpcmpgtq %xmm1, %xmm0, %k1
; AVX512-NEXT:    kxorw %k1, %k0, %k1
; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<2 x i64>, <2 x i1>} @llvm.sadd.with.overflow.v2i64(<2 x i64> %a0, <2 x i64> %a1)
  %val = extractvalue {<2 x i64>, <2 x i1>} %t, 0
  %obit = extractvalue {<2 x i64>, <2 x i1>} %t, 1
  %res = sext <2 x i1> %obit to <2 x i32>
  store <2 x i64> %val, <2 x i64>* %p2
  ret <2 x i32> %res
}

define <4 x i32> @saddo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) nounwind {
; SSE2-LABEL: saddo_v4i24:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pslld $8, %xmm1
; SSE2-NEXT:    psrad $8, %xmm1
; SSE2-NEXT:    pslld $8, %xmm2
; SSE2-NEXT:    psrad $8, %xmm2
; SSE2-NEXT:    paddd %xmm1, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm0
; SSE2-NEXT:    pslld $8, %xmm0
; SSE2-NEXT:    psrad $8, %xmm0
; SSE2-NEXT:    pcmpeqd %xmm2, %xmm0
; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE2-NEXT:    pxor %xmm1, %xmm0
; SSE2-NEXT:    movd %xmm2, %eax
; SSE2-NEXT:    movw %ax, (%rdi)
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3]
; SSE2-NEXT:    movd %xmm1, %ecx
; SSE2-NEXT:    movw %cx, 9(%rdi)
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
; SSE2-NEXT:    movd %xmm1, %edx
; SSE2-NEXT:    movw %dx, 6(%rdi)
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
; SSE2-NEXT:    movd %xmm1, %esi
; SSE2-NEXT:    movw %si, 3(%rdi)
; SSE2-NEXT:    shrl $16, %eax
; SSE2-NEXT:    movb %al, 2(%rdi)
; SSE2-NEXT:    shrl $16, %ecx
; SSE2-NEXT:    movb %cl, 11(%rdi)
; SSE2-NEXT:    shrl $16, %edx
; SSE2-NEXT:    movb %dl, 8(%rdi)
; SSE2-NEXT:    shrl $16, %esi
; SSE2-NEXT:    movb %sil, 5(%rdi)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: saddo_v4i24:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pslld $8, %xmm1
; SSSE3-NEXT:    psrad $8, %xmm1
; SSSE3-NEXT:    pslld $8, %xmm2
; SSSE3-NEXT:    psrad $8, %xmm2
; SSSE3-NEXT:    paddd %xmm1, %xmm2
; SSSE3-NEXT:    movdqa %xmm2, %xmm0
; SSSE3-NEXT:    pslld $8, %xmm0
; SSSE3-NEXT:    psrad $8, %xmm0
; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm0
; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1
; SSSE3-NEXT:    pxor %xmm1, %xmm0
; SSSE3-NEXT:    movd %xmm2, %eax
; SSSE3-NEXT:    movw %ax, (%rdi)
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3]
; SSSE3-NEXT:    movd %xmm1, %ecx
; SSSE3-NEXT:    movw %cx, 9(%rdi)
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
; SSSE3-NEXT:    movd %xmm1, %edx
; SSSE3-NEXT:    movw %dx, 6(%rdi)
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
; SSSE3-NEXT:    movd %xmm1, %esi
; SSSE3-NEXT:    movw %si, 3(%rdi)
; SSSE3-NEXT:    shrl $16, %eax
; SSSE3-NEXT:    movb %al, 2(%rdi)
; SSSE3-NEXT:    shrl $16, %ecx
; SSSE3-NEXT:    movb %cl, 11(%rdi)
; SSSE3-NEXT:    shrl $16, %edx
; SSSE3-NEXT:    movb %dl, 8(%rdi)
; SSSE3-NEXT:    shrl $16, %esi
; SSSE3-NEXT:    movb %sil, 5(%rdi)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: saddo_v4i24:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    pslld $8, %xmm1
; SSE41-NEXT:    psrad $8, %xmm1
; SSE41-NEXT:    pslld $8, %xmm2
; SSE41-NEXT:    psrad $8, %xmm2
; SSE41-NEXT:    paddd %xmm1, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    pslld $8, %xmm0
; SSE41-NEXT:    psrad $8, %xmm0
; SSE41-NEXT:    pcmpeqd %xmm2, %xmm0
; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE41-NEXT:    pxor %xmm1, %xmm0
; SSE41-NEXT:    pextrd $3, %xmm2, %eax
; SSE41-NEXT:    movw %ax, 9(%rdi)
; SSE41-NEXT:    pextrd $2, %xmm2, %ecx
; SSE41-NEXT:    movw %cx, 6(%rdi)
; SSE41-NEXT:    pextrd $1, %xmm2, %edx
; SSE41-NEXT:    movw %dx, 3(%rdi)
; SSE41-NEXT:    movd %xmm2, %esi
; SSE41-NEXT:    movw %si, (%rdi)
; SSE41-NEXT:    shrl $16, %eax
; SSE41-NEXT:    movb %al, 11(%rdi)
; SSE41-NEXT:    shrl $16, %ecx
; SSE41-NEXT:    movb %cl, 8(%rdi)
; SSE41-NEXT:    shrl $16, %edx
; SSE41-NEXT:    movb %dl, 5(%rdi)
; SSE41-NEXT:    shrl $16, %esi
; SSE41-NEXT:    movb %sil, 2(%rdi)
; SSE41-NEXT:    retq
;
; AVX-LABEL: saddo_v4i24:
; AVX:       # %bb.0:
; AVX-NEXT:    vpslld $8, %xmm1, %xmm1
; AVX-NEXT:    vpsrad $8, %xmm1, %xmm1
; AVX-NEXT:    vpslld $8, %xmm0, %xmm0
; AVX-NEXT:    vpsrad $8, %xmm0, %xmm0
; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vpslld $8, %xmm1, %xmm0
; AVX-NEXT:    vpsrad $8, %xmm0, %xmm0
; AVX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vpxor %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpextrd $3, %xmm1, %eax
; AVX-NEXT:    movw %ax, 9(%rdi)
; AVX-NEXT:    vpextrd $2, %xmm1, %ecx
; AVX-NEXT:    movw %cx, 6(%rdi)
; AVX-NEXT:    vpextrd $1, %xmm1, %edx
; AVX-NEXT:    movw %dx, 3(%rdi)
; AVX-NEXT:    vmovd %xmm1, %esi
; AVX-NEXT:    movw %si, (%rdi)
; AVX-NEXT:    shrl $16, %eax
; AVX-NEXT:    movb %al, 11(%rdi)
; AVX-NEXT:    shrl $16, %ecx
; AVX-NEXT:    movb %cl, 8(%rdi)
; AVX-NEXT:    shrl $16, %edx
; AVX-NEXT:    movb %dl, 5(%rdi)
; AVX-NEXT:    shrl $16, %esi
; AVX-NEXT:    movb %sil, 2(%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: saddo_v4i24:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpslld $8, %xmm1, %xmm1
; AVX512-NEXT:    vpsrad $8, %xmm1, %xmm1
; AVX512-NEXT:    vpslld $8, %xmm0, %xmm0
; AVX512-NEXT:    vpsrad $8, %xmm0, %xmm0
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vpslld $8, %xmm1, %xmm0
; AVX512-NEXT:    vpsrad $8, %xmm0, %xmm0
; AVX512-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vpextrd $3, %xmm1, %eax
; AVX512-NEXT:    movw %ax, 9(%rdi)
; AVX512-NEXT:    vpextrd $2, %xmm1, %ecx
; AVX512-NEXT:    movw %cx, 6(%rdi)
; AVX512-NEXT:    vpextrd $1, %xmm1, %edx
; AVX512-NEXT:    movw %dx, 3(%rdi)
; AVX512-NEXT:    vmovd %xmm1, %esi
; AVX512-NEXT:    movw %si, (%rdi)
; AVX512-NEXT:    shrl $16, %eax
; AVX512-NEXT:    movb %al, 11(%rdi)
; AVX512-NEXT:    shrl $16, %ecx
; AVX512-NEXT:    movb %cl, 8(%rdi)
; AVX512-NEXT:    shrl $16, %edx
; AVX512-NEXT:    movb %dl, 5(%rdi)
; AVX512-NEXT:    shrl $16, %esi
; AVX512-NEXT:    movb %sil, 2(%rdi)
; AVX512-NEXT:    retq
  %t = call {<4 x i24>, <4 x i1>} @llvm.sadd.with.overflow.v4i24(<4 x i24> %a0, <4 x i24> %a1)
  %val = extractvalue {<4 x i24>, <4 x i1>} %t, 0
  %obit = extractvalue {<4 x i24>, <4 x i1>} %t, 1
  %res = sext <4 x i1> %obit to <4 x i32>
  store <4 x i24> %val, <4 x i24>* %p2
  ret <4 x i32> %res
}

define <4 x i32> @saddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind {
; SSE-LABEL: saddo_v4i1:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $31, %xmm1
; SSE-NEXT:    psrad $31, %xmm1
; SSE-NEXT:    pslld $31, %xmm0
; SSE-NEXT:    psrad $31, %xmm0
; SSE-NEXT:    paddd %xmm1, %xmm0
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    pslld $31, %xmm1
; SSE-NEXT:    movmskps %xmm1, %eax
; SSE-NEXT:    psrad $31, %xmm1
; SSE-NEXT:    pcmpeqd %xmm0, %xmm1
; SSE-NEXT:    pcmpeqd %xmm0, %xmm0
; SSE-NEXT:    pxor %xmm0, %xmm1
; SSE-NEXT:    movb %al, (%rdi)
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: saddo_v4i1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpslld $31, %xmm1, %xmm1
; AVX-NEXT:    vpsrad $31, %xmm1, %xmm1
; AVX-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX-NEXT:    vpsrad $31, %xmm0, %xmm0
; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpslld $31, %xmm0, %xmm1
; AVX-NEXT:    vpsrad $31, %xmm1, %xmm2
; AVX-NEXT:    vpcmpeqd %xmm0, %xmm2, %xmm0
; AVX-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vpxor %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vmovmskps %xmm1, %eax
; AVX-NEXT:    movb %al, (%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: saddo_v4i1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX512-NEXT:    vptestmd %xmm0, %xmm0, %k0
; AVX512-NEXT:    vpslld $31, %xmm1, %xmm1
; AVX512-NEXT:    vptestmd %xmm1, %xmm1, %k1
; AVX512-NEXT:    kxorw %k1, %k0, %k2
; AVX512-NEXT:    vptestnmd %xmm0, %xmm0, %k0 {%k2}
; AVX512-NEXT:    kxorw %k0, %k1, %k1
; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT:    kshiftlw $12, %k2, %k0
; AVX512-NEXT:    kshiftrw $12, %k0, %k0
; AVX512-NEXT:    kmovd %k0, %eax
; AVX512-NEXT:    movb %al, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<4 x i1>, <4 x i1>} @llvm.sadd.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1)
  %val = extractvalue {<4 x i1>, <4 x i1>} %t, 0
  %obit = extractvalue {<4 x i1>, <4 x i1>} %t, 1
  %res = sext <4 x i1> %obit to <4 x i32>
  store <4 x i1> %val, <4 x i1>* %p2
  ret <4 x i32> %res
}

define <2 x i32> @saddo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2) nounwind {
; SSE2-LABEL: saddo_v2i128:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
; SSE2-NEXT:    addq %r8, %rdi
; SSE2-NEXT:    adcq %r9, %rsi
; SSE2-NEXT:    seto %r8b
; SSE2-NEXT:    addq {{[0-9]+}}(%rsp), %rdx
; SSE2-NEXT:    adcq {{[0-9]+}}(%rsp), %rcx
; SSE2-NEXT:    seto %al
; SSE2-NEXT:    movzbl %al, %eax
; SSE2-NEXT:    negl %eax
; SSE2-NEXT:    movd %eax, %xmm1
; SSE2-NEXT:    movzbl %r8b, %eax
; SSE2-NEXT:    negl %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    movq %rdx, 16(%r10)
; SSE2-NEXT:    movq %rdi, (%r10)
; SSE2-NEXT:    movq %rcx, 24(%r10)
; SSE2-NEXT:    movq %rsi, 8(%r10)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: saddo_v2i128:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %r10
; SSSE3-NEXT:    addq %r8, %rdi
; SSSE3-NEXT:    adcq %r9, %rsi
; SSSE3-NEXT:    seto %r8b
; SSSE3-NEXT:    addq {{[0-9]+}}(%rsp), %rdx
; SSSE3-NEXT:    adcq {{[0-9]+}}(%rsp), %rcx
; SSSE3-NEXT:    seto %al
; SSSE3-NEXT:    movzbl %al, %eax
; SSSE3-NEXT:    negl %eax
; SSSE3-NEXT:    movd %eax, %xmm1
; SSSE3-NEXT:    movzbl %r8b, %eax
; SSSE3-NEXT:    negl %eax
; SSSE3-NEXT:    movd %eax, %xmm0
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT:    movq %rdx, 16(%r10)
; SSSE3-NEXT:    movq %rdi, (%r10)
; SSSE3-NEXT:    movq %rcx, 24(%r10)
; SSSE3-NEXT:    movq %rsi, 8(%r10)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: saddo_v2i128:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %r10
; SSE41-NEXT:    addq %r8, %rdi
; SSE41-NEXT:    adcq %r9, %rsi
; SSE41-NEXT:    seto %r8b
; SSE41-NEXT:    addq {{[0-9]+}}(%rsp), %rdx
; SSE41-NEXT:    adcq {{[0-9]+}}(%rsp), %rcx
; SSE41-NEXT:    seto %al
; SSE41-NEXT:    movzbl %al, %r9d
; SSE41-NEXT:    negl %r9d
; SSE41-NEXT:    movzbl %r8b, %eax
; SSE41-NEXT:    negl %eax
; SSE41-NEXT:    movd %eax, %xmm0
; SSE41-NEXT:    pinsrd $1, %r9d, %xmm0
; SSE41-NEXT:    movq %rdx, 16(%r10)
; SSE41-NEXT:    movq %rdi, (%r10)
; SSE41-NEXT:    movq %rcx, 24(%r10)
; SSE41-NEXT:    movq %rsi, 8(%r10)
; SSE41-NEXT:    retq
;
; AVX-LABEL: saddo_v2i128:
; AVX:       # %bb.0:
; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %r10
; AVX-NEXT:    addq %r8, %rdi
; AVX-NEXT:    adcq %r9, %rsi
; AVX-NEXT:    seto %r8b
; AVX-NEXT:    addq {{[0-9]+}}(%rsp), %rdx
; AVX-NEXT:    adcq {{[0-9]+}}(%rsp), %rcx
; AVX-NEXT:    seto %al
; AVX-NEXT:    movzbl %al, %r9d
; AVX-NEXT:    negl %r9d
; AVX-NEXT:    movzbl %r8b, %eax
; AVX-NEXT:    negl %eax
; AVX-NEXT:    vmovd %eax, %xmm0
; AVX-NEXT:    vpinsrd $1, %r9d, %xmm0, %xmm0
; AVX-NEXT:    movq %rdx, 16(%r10)
; AVX-NEXT:    movq %rdi, (%r10)
; AVX-NEXT:    movq %rcx, 24(%r10)
; AVX-NEXT:    movq %rsi, 8(%r10)
; AVX-NEXT:    retq
;
; AVX512-LABEL: saddo_v2i128:
; AVX512:       # %bb.0:
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r10
; AVX512-NEXT:    addq {{[0-9]+}}(%rsp), %rdx
; AVX512-NEXT:    adcq {{[0-9]+}}(%rsp), %rcx
; AVX512-NEXT:    seto %al
; AVX512-NEXT:    kmovd %eax, %k0
; AVX512-NEXT:    addq %r8, %rdi
; AVX512-NEXT:    adcq %r9, %rsi
; AVX512-NEXT:    seto %al
; AVX512-NEXT:    andl $1, %eax
; AVX512-NEXT:    kmovw %eax, %k1
; AVX512-NEXT:    kshiftlw $1, %k0, %k0
; AVX512-NEXT:    korw %k0, %k1, %k1
; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT:    movq %rdx, 16(%r10)
; AVX512-NEXT:    movq %rdi, (%r10)
; AVX512-NEXT:    movq %rcx, 24(%r10)
; AVX512-NEXT:    movq %rsi, 8(%r10)
; AVX512-NEXT:    retq
  %t = call {<2 x i128>, <2 x i1>} @llvm.sadd.with.overflow.v2i128(<2 x i128> %a0, <2 x i128> %a1)
  %val = extractvalue {<2 x i128>, <2 x i1>} %t, 0
  %obit = extractvalue {<2 x i128>, <2 x i1>} %t, 1
  %res = sext <2 x i1> %obit to <2 x i32>
  store <2 x i128> %val, <2 x i128>* %p2
  ret <2 x i32> %res
}
